From 5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 27 Apr 2024 12:05:51 +0200 Subject: Adding upstream version 5.10.209. Signed-off-by: Daniel Baumann --- arch/x86/kernel/.gitignore | 4 + arch/x86/kernel/Makefile | 171 ++ arch/x86/kernel/acpi/Makefile | 12 + arch/x86/kernel/acpi/apei.c | 45 + arch/x86/kernel/acpi/boot.c | 1805 +++++++++++++ arch/x86/kernel/acpi/cppc_msr.c | 49 + arch/x86/kernel/acpi/cstate.c | 229 ++ arch/x86/kernel/acpi/sleep.c | 160 ++ arch/x86/kernel/acpi/sleep.h | 22 + arch/x86/kernel/acpi/wakeup_32.S | 101 + arch/x86/kernel/acpi/wakeup_64.S | 139 + arch/x86/kernel/alternative.c | 1732 ++++++++++++ arch/x86/kernel/amd_gart_64.c | 845 ++++++ arch/x86/kernel/amd_nb.c | 541 ++++ arch/x86/kernel/apb_timer.c | 347 +++ arch/x86/kernel/aperture_64.c | 556 ++++ arch/x86/kernel/apic/Makefile | 30 + arch/x86/kernel/apic/apic.c | 2896 ++++++++++++++++++++ arch/x86/kernel/apic/apic_common.c | 46 + arch/x86/kernel/apic/apic_flat_64.c | 251 ++ arch/x86/kernel/apic/apic_noop.c | 145 + arch/x86/kernel/apic/apic_numachip.c | 338 +++ arch/x86/kernel/apic/bigsmp_32.c | 192 ++ arch/x86/kernel/apic/hw_nmi.c | 59 + arch/x86/kernel/apic/io_apic.c | 3088 +++++++++++++++++++++ arch/x86/kernel/apic/ipi.c | 331 +++ arch/x86/kernel/apic/local.h | 69 + arch/x86/kernel/apic/msi.c | 474 ++++ arch/x86/kernel/apic/probe_32.c | 212 ++ arch/x86/kernel/apic/probe_64.c | 52 + arch/x86/kernel/apic/vector.c | 1278 +++++++++ arch/x86/kernel/apic/x2apic_cluster.c | 228 ++ arch/x86/kernel/apic/x2apic_phys.c | 205 ++ arch/x86/kernel/apic/x2apic_uv_x.c | 1820 +++++++++++++ arch/x86/kernel/apm_32.c | 2432 +++++++++++++++++ arch/x86/kernel/asm-offsets.c | 104 + arch/x86/kernel/asm-offsets_32.c | 63 + arch/x86/kernel/asm-offsets_64.c | 66 + arch/x86/kernel/audit_64.c | 83 + arch/x86/kernel/bootflag.c | 102 + arch/x86/kernel/cc_platform.c | 69 + arch/x86/kernel/check.c | 187 ++ arch/x86/kernel/cpu/.gitignore | 2 + arch/x86/kernel/cpu/Makefile | 67 + arch/x86/kernel/cpu/acrn.c | 64 + arch/x86/kernel/cpu/amd.c | 1346 ++++++++++ arch/x86/kernel/cpu/aperfmperf.c | 137 + arch/x86/kernel/cpu/bugs.c | 2757 +++++++++++++++++++ arch/x86/kernel/cpu/cacheinfo.c | 1035 +++++++ arch/x86/kernel/cpu/centaur.c | 251 ++ arch/x86/kernel/cpu/common.c | 2285 ++++++++++++++++ arch/x86/kernel/cpu/cpu.h | 89 + arch/x86/kernel/cpu/cpuid-deps.c | 133 + arch/x86/kernel/cpu/cyrix.c | 466 ++++ arch/x86/kernel/cpu/feat_ctl.c | 146 + arch/x86/kernel/cpu/hygon.c | 410 +++ arch/x86/kernel/cpu/hypervisor.c | 109 + arch/x86/kernel/cpu/intel.c | 1194 +++++++++ arch/x86/kernel/cpu/intel_epb.c | 216 ++ arch/x86/kernel/cpu/intel_pconfig.c | 82 + arch/x86/kernel/cpu/match.c | 91 + arch/x86/kernel/cpu/mce/Makefile | 16 + arch/x86/kernel/cpu/mce/amd.c | 1541 +++++++++++ arch/x86/kernel/cpu/mce/apei.c | 145 + arch/x86/kernel/cpu/mce/core.c | 2794 +++++++++++++++++++ arch/x86/kernel/cpu/mce/dev-mcelog.c | 374 +++ arch/x86/kernel/cpu/mce/genpool.c | 147 + arch/x86/kernel/cpu/mce/inject.c | 705 +++++ arch/x86/kernel/cpu/mce/intel.c | 543 ++++ arch/x86/kernel/cpu/mce/internal.h | 199 ++ arch/x86/kernel/cpu/mce/p5.c | 70 + arch/x86/kernel/cpu/mce/severity.c | 492 ++++ arch/x86/kernel/cpu/mce/therm_throt.c | 739 +++++ arch/x86/kernel/cpu/mce/threshold.c | 31 + arch/x86/kernel/cpu/mce/winchip.c | 45 + arch/x86/kernel/cpu/microcode/Makefile | 5 + arch/x86/kernel/cpu/microcode/amd.c | 974 +++++++ arch/x86/kernel/cpu/microcode/core.c | 825 ++++++ arch/x86/kernel/cpu/microcode/intel.c | 996 +++++++ arch/x86/kernel/cpu/mkcapflags.sh | 74 + arch/x86/kernel/cpu/mshyperv.c | 397 +++ arch/x86/kernel/cpu/mtrr/Makefile | 4 + arch/x86/kernel/cpu/mtrr/amd.c | 125 + arch/x86/kernel/cpu/mtrr/centaur.c | 127 + arch/x86/kernel/cpu/mtrr/cleanup.c | 987 +++++++ arch/x86/kernel/cpu/mtrr/cyrix.c | 284 ++ arch/x86/kernel/cpu/mtrr/generic.c | 925 +++++++ arch/x86/kernel/cpu/mtrr/if.c | 425 +++ arch/x86/kernel/cpu/mtrr/mtrr.c | 888 ++++++ arch/x86/kernel/cpu/mtrr/mtrr.h | 80 + arch/x86/kernel/cpu/perfctr-watchdog.c | 171 ++ arch/x86/kernel/cpu/powerflags.c | 24 + arch/x86/kernel/cpu/proc.c | 182 ++ arch/x86/kernel/cpu/rdrand.c | 66 + arch/x86/kernel/cpu/resctrl/Makefile | 4 + arch/x86/kernel/cpu/resctrl/core.c | 1047 ++++++++ arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 490 ++++ arch/x86/kernel/cpu/resctrl/internal.h | 632 +++++ arch/x86/kernel/cpu/resctrl/monitor.c | 649 +++++ arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 1594 +++++++++++ arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h | 43 + arch/x86/kernel/cpu/resctrl/rdtgroup.c | 3263 +++++++++++++++++++++++ arch/x86/kernel/cpu/scattered.c | 68 + arch/x86/kernel/cpu/topology.c | 168 ++ arch/x86/kernel/cpu/transmeta.c | 111 + arch/x86/kernel/cpu/tsx.c | 139 + arch/x86/kernel/cpu/umc.c | 26 + arch/x86/kernel/cpu/umwait.c | 238 ++ arch/x86/kernel/cpu/vmware.c | 525 ++++ arch/x86/kernel/cpu/zhaoxin.c | 133 + arch/x86/kernel/cpuid.c | 191 ++ arch/x86/kernel/crash.c | 412 +++ arch/x86/kernel/crash_core_32.c | 17 + arch/x86/kernel/crash_core_64.c | 24 + arch/x86/kernel/crash_dump_32.c | 96 + arch/x86/kernel/crash_dump_64.c | 77 + arch/x86/kernel/devicetree.c | 325 +++ arch/x86/kernel/doublefault_32.c | 133 + arch/x86/kernel/dumpstack.c | 479 ++++ arch/x86/kernel/dumpstack_32.c | 155 ++ arch/x86/kernel/dumpstack_64.c | 205 ++ arch/x86/kernel/e820.c | 1350 ++++++++++ arch/x86/kernel/early-quirks.c | 806 ++++++ arch/x86/kernel/early_printk.c | 398 +++ arch/x86/kernel/ebda.c | 98 + arch/x86/kernel/eisa.c | 24 + arch/x86/kernel/espfix_64.c | 207 ++ arch/x86/kernel/fpu/Makefile | 6 + arch/x86/kernel/fpu/bugs.c | 59 + arch/x86/kernel/fpu/core.c | 490 ++++ arch/x86/kernel/fpu/init.c | 256 ++ arch/x86/kernel/fpu/regset.c | 350 +++ arch/x86/kernel/fpu/signal.c | 540 ++++ arch/x86/kernel/fpu/xstate.c | 1464 ++++++++++ arch/x86/kernel/ftrace.c | 689 +++++ arch/x86/kernel/ftrace_32.S | 193 ++ arch/x86/kernel/ftrace_64.S | 354 +++ arch/x86/kernel/head32.c | 119 + arch/x86/kernel/head64.c | 607 +++++ arch/x86/kernel/head_32.S | 597 +++++ arch/x86/kernel/head_64.S | 603 +++++ arch/x86/kernel/hpet.c | 1361 ++++++++++ arch/x86/kernel/hw_breakpoint.c | 592 ++++ arch/x86/kernel/i8237.c | 76 + arch/x86/kernel/i8253.c | 67 + arch/x86/kernel/i8259.c | 457 ++++ arch/x86/kernel/idt.c | 353 +++ arch/x86/kernel/ima_arch.c | 96 + arch/x86/kernel/io_delay.c | 148 + arch/x86/kernel/ioport.c | 215 ++ arch/x86/kernel/irq.c | 378 +++ arch/x86/kernel/irq_32.c | 160 ++ arch/x86/kernel/irq_64.c | 78 + arch/x86/kernel/irq_work.c | 34 + arch/x86/kernel/irqflags.S | 26 + arch/x86/kernel/irqinit.c | 107 + arch/x86/kernel/itmt.c | 205 ++ arch/x86/kernel/jailhouse.c | 295 ++ arch/x86/kernel/jump_label.c | 157 ++ arch/x86/kernel/kdebugfs.c | 195 ++ arch/x86/kernel/kexec-bzimage64.c | 555 ++++ arch/x86/kernel/kgdb.c | 785 ++++++ arch/x86/kernel/kprobes/Makefile | 8 + arch/x86/kernel/kprobes/common.h | 108 + arch/x86/kernel/kprobes/core.c | 1152 ++++++++ arch/x86/kernel/kprobes/ftrace.c | 63 + arch/x86/kernel/kprobes/opt.c | 556 ++++ arch/x86/kernel/ksysfs.c | 401 +++ arch/x86/kernel/kvm.c | 1054 ++++++++ arch/x86/kernel/kvmclock.c | 347 +++ arch/x86/kernel/ldt.c | 688 +++++ arch/x86/kernel/machine_kexec_32.c | 251 ++ arch/x86/kernel/machine_kexec_64.c | 622 +++++ arch/x86/kernel/mmconf-fam10h_64.c | 238 ++ arch/x86/kernel/module.c | 316 +++ arch/x86/kernel/mpparse.c | 944 +++++++ arch/x86/kernel/msr.c | 335 +++ arch/x86/kernel/nmi.c | 552 ++++ arch/x86/kernel/nmi_selftest.c | 184 ++ arch/x86/kernel/paravirt-spinlocks.c | 34 + arch/x86/kernel/paravirt.c | 424 +++ arch/x86/kernel/paravirt_patch.c | 106 + arch/x86/kernel/pci-dma.c | 156 ++ arch/x86/kernel/pci-iommu_table.c | 80 + arch/x86/kernel/pci-swiotlb.c | 78 + arch/x86/kernel/pcspeaker.c | 14 + arch/x86/kernel/perf_regs.c | 202 ++ arch/x86/kernel/platform-quirks.c | 46 + arch/x86/kernel/pmem.c | 37 + arch/x86/kernel/probe_roms.c | 269 ++ arch/x86/kernel/process.c | 1005 +++++++ arch/x86/kernel/process.h | 39 + arch/x86/kernel/process_32.c | 225 ++ arch/x86/kernel/process_64.c | 837 ++++++ arch/x86/kernel/ptrace.c | 1340 ++++++++++ arch/x86/kernel/pvclock.c | 156 ++ arch/x86/kernel/quirks.c | 669 +++++ arch/x86/kernel/reboot.c | 943 +++++++ arch/x86/kernel/reboot_fixups_32.c | 103 + arch/x86/kernel/relocate_kernel_32.S | 291 ++ arch/x86/kernel/relocate_kernel_64.S | 306 +++ arch/x86/kernel/resource.c | 51 + arch/x86/kernel/rtc.c | 207 ++ arch/x86/kernel/setup.c | 1278 +++++++++ arch/x86/kernel/setup_percpu.c | 298 +++ arch/x86/kernel/sev-es-shared.c | 567 ++++ arch/x86/kernel/sev-es.c | 1507 +++++++++++ arch/x86/kernel/sev_verify_cbit.S | 89 + arch/x86/kernel/signal.c | 884 ++++++ arch/x86/kernel/signal_compat.c | 182 ++ arch/x86/kernel/smp.c | 279 ++ arch/x86/kernel/smpboot.c | 2122 +++++++++++++++ arch/x86/kernel/stacktrace.c | 136 + arch/x86/kernel/static_call.c | 151 ++ arch/x86/kernel/step.c | 236 ++ arch/x86/kernel/sys_ia32.c | 256 ++ arch/x86/kernel/sys_x86_64.c | 232 ++ arch/x86/kernel/sysfb.c | 70 + arch/x86/kernel/sysfb_efi.c | 300 +++ arch/x86/kernel/sysfb_simplefb.c | 111 + arch/x86/kernel/tboot.c | 525 ++++ arch/x86/kernel/time.c | 129 + arch/x86/kernel/tls.c | 301 +++ arch/x86/kernel/tls.h | 18 + arch/x86/kernel/topology.c | 166 ++ arch/x86/kernel/trace_clock.c | 17 + arch/x86/kernel/tracepoint.c | 27 + arch/x86/kernel/traps.c | 1189 +++++++++ arch/x86/kernel/tsc.c | 1576 +++++++++++ arch/x86/kernel/tsc_msr.c | 236 ++ arch/x86/kernel/tsc_sync.c | 534 ++++ arch/x86/kernel/umip.c | 413 +++ arch/x86/kernel/unwind_frame.c | 409 +++ arch/x86/kernel/unwind_guess.c | 73 + arch/x86/kernel/unwind_orc.c | 710 +++++ arch/x86/kernel/uprobes.c | 1097 ++++++++ arch/x86/kernel/verify_cpu.S | 140 + arch/x86/kernel/vm86_32.c | 866 ++++++ arch/x86/kernel/vmlinux.lds.S | 549 ++++ arch/x86/kernel/vsmp_64.c | 153 ++ arch/x86/kernel/x86_init.c | 162 ++ 241 files changed, 108077 insertions(+) create mode 100644 arch/x86/kernel/.gitignore create mode 100644 arch/x86/kernel/Makefile create mode 100644 arch/x86/kernel/acpi/Makefile create mode 100644 arch/x86/kernel/acpi/apei.c create mode 100644 arch/x86/kernel/acpi/boot.c create mode 100644 arch/x86/kernel/acpi/cppc_msr.c create mode 100644 arch/x86/kernel/acpi/cstate.c create mode 100644 arch/x86/kernel/acpi/sleep.c create mode 100644 arch/x86/kernel/acpi/sleep.h create mode 100644 arch/x86/kernel/acpi/wakeup_32.S create mode 100644 arch/x86/kernel/acpi/wakeup_64.S create mode 100644 arch/x86/kernel/alternative.c create mode 100644 arch/x86/kernel/amd_gart_64.c create mode 100644 arch/x86/kernel/amd_nb.c create mode 100644 arch/x86/kernel/apb_timer.c create mode 100644 arch/x86/kernel/aperture_64.c create mode 100644 arch/x86/kernel/apic/Makefile create mode 100644 arch/x86/kernel/apic/apic.c create mode 100644 arch/x86/kernel/apic/apic_common.c create mode 100644 arch/x86/kernel/apic/apic_flat_64.c create mode 100644 arch/x86/kernel/apic/apic_noop.c create mode 100644 arch/x86/kernel/apic/apic_numachip.c create mode 100644 arch/x86/kernel/apic/bigsmp_32.c create mode 100644 arch/x86/kernel/apic/hw_nmi.c create mode 100644 arch/x86/kernel/apic/io_apic.c create mode 100644 arch/x86/kernel/apic/ipi.c create mode 100644 arch/x86/kernel/apic/local.h create mode 100644 arch/x86/kernel/apic/msi.c create mode 100644 arch/x86/kernel/apic/probe_32.c create mode 100644 arch/x86/kernel/apic/probe_64.c create mode 100644 arch/x86/kernel/apic/vector.c create mode 100644 arch/x86/kernel/apic/x2apic_cluster.c create mode 100644 arch/x86/kernel/apic/x2apic_phys.c create mode 100644 arch/x86/kernel/apic/x2apic_uv_x.c create mode 100644 arch/x86/kernel/apm_32.c create mode 100644 arch/x86/kernel/asm-offsets.c create mode 100644 arch/x86/kernel/asm-offsets_32.c create mode 100644 arch/x86/kernel/asm-offsets_64.c create mode 100644 arch/x86/kernel/audit_64.c create mode 100644 arch/x86/kernel/bootflag.c create mode 100644 arch/x86/kernel/cc_platform.c create mode 100644 arch/x86/kernel/check.c create mode 100644 arch/x86/kernel/cpu/.gitignore create mode 100644 arch/x86/kernel/cpu/Makefile create mode 100644 arch/x86/kernel/cpu/acrn.c create mode 100644 arch/x86/kernel/cpu/amd.c create mode 100644 arch/x86/kernel/cpu/aperfmperf.c create mode 100644 arch/x86/kernel/cpu/bugs.c create mode 100644 arch/x86/kernel/cpu/cacheinfo.c create mode 100644 arch/x86/kernel/cpu/centaur.c create mode 100644 arch/x86/kernel/cpu/common.c create mode 100644 arch/x86/kernel/cpu/cpu.h create mode 100644 arch/x86/kernel/cpu/cpuid-deps.c create mode 100644 arch/x86/kernel/cpu/cyrix.c create mode 100644 arch/x86/kernel/cpu/feat_ctl.c create mode 100644 arch/x86/kernel/cpu/hygon.c create mode 100644 arch/x86/kernel/cpu/hypervisor.c create mode 100644 arch/x86/kernel/cpu/intel.c create mode 100644 arch/x86/kernel/cpu/intel_epb.c create mode 100644 arch/x86/kernel/cpu/intel_pconfig.c create mode 100644 arch/x86/kernel/cpu/match.c create mode 100644 arch/x86/kernel/cpu/mce/Makefile create mode 100644 arch/x86/kernel/cpu/mce/amd.c create mode 100644 arch/x86/kernel/cpu/mce/apei.c create mode 100644 arch/x86/kernel/cpu/mce/core.c create mode 100644 arch/x86/kernel/cpu/mce/dev-mcelog.c create mode 100644 arch/x86/kernel/cpu/mce/genpool.c create mode 100644 arch/x86/kernel/cpu/mce/inject.c create mode 100644 arch/x86/kernel/cpu/mce/intel.c create mode 100644 arch/x86/kernel/cpu/mce/internal.h create mode 100644 arch/x86/kernel/cpu/mce/p5.c create mode 100644 arch/x86/kernel/cpu/mce/severity.c create mode 100644 arch/x86/kernel/cpu/mce/therm_throt.c create mode 100644 arch/x86/kernel/cpu/mce/threshold.c create mode 100644 arch/x86/kernel/cpu/mce/winchip.c create mode 100644 arch/x86/kernel/cpu/microcode/Makefile create mode 100644 arch/x86/kernel/cpu/microcode/amd.c create mode 100644 arch/x86/kernel/cpu/microcode/core.c create mode 100644 arch/x86/kernel/cpu/microcode/intel.c create mode 100644 arch/x86/kernel/cpu/mkcapflags.sh create mode 100644 arch/x86/kernel/cpu/mshyperv.c create mode 100644 arch/x86/kernel/cpu/mtrr/Makefile create mode 100644 arch/x86/kernel/cpu/mtrr/amd.c create mode 100644 arch/x86/kernel/cpu/mtrr/centaur.c create mode 100644 arch/x86/kernel/cpu/mtrr/cleanup.c create mode 100644 arch/x86/kernel/cpu/mtrr/cyrix.c create mode 100644 arch/x86/kernel/cpu/mtrr/generic.c create mode 100644 arch/x86/kernel/cpu/mtrr/if.c create mode 100644 arch/x86/kernel/cpu/mtrr/mtrr.c create mode 100644 arch/x86/kernel/cpu/mtrr/mtrr.h create mode 100644 arch/x86/kernel/cpu/perfctr-watchdog.c create mode 100644 arch/x86/kernel/cpu/powerflags.c create mode 100644 arch/x86/kernel/cpu/proc.c create mode 100644 arch/x86/kernel/cpu/rdrand.c create mode 100644 arch/x86/kernel/cpu/resctrl/Makefile create mode 100644 arch/x86/kernel/cpu/resctrl/core.c create mode 100644 arch/x86/kernel/cpu/resctrl/ctrlmondata.c create mode 100644 arch/x86/kernel/cpu/resctrl/internal.h create mode 100644 arch/x86/kernel/cpu/resctrl/monitor.c create mode 100644 arch/x86/kernel/cpu/resctrl/pseudo_lock.c create mode 100644 arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h create mode 100644 arch/x86/kernel/cpu/resctrl/rdtgroup.c create mode 100644 arch/x86/kernel/cpu/scattered.c create mode 100644 arch/x86/kernel/cpu/topology.c create mode 100644 arch/x86/kernel/cpu/transmeta.c create mode 100644 arch/x86/kernel/cpu/tsx.c create mode 100644 arch/x86/kernel/cpu/umc.c create mode 100644 arch/x86/kernel/cpu/umwait.c create mode 100644 arch/x86/kernel/cpu/vmware.c create mode 100644 arch/x86/kernel/cpu/zhaoxin.c create mode 100644 arch/x86/kernel/cpuid.c create mode 100644 arch/x86/kernel/crash.c create mode 100644 arch/x86/kernel/crash_core_32.c create mode 100644 arch/x86/kernel/crash_core_64.c create mode 100644 arch/x86/kernel/crash_dump_32.c create mode 100644 arch/x86/kernel/crash_dump_64.c create mode 100644 arch/x86/kernel/devicetree.c create mode 100644 arch/x86/kernel/doublefault_32.c create mode 100644 arch/x86/kernel/dumpstack.c create mode 100644 arch/x86/kernel/dumpstack_32.c create mode 100644 arch/x86/kernel/dumpstack_64.c create mode 100644 arch/x86/kernel/e820.c create mode 100644 arch/x86/kernel/early-quirks.c create mode 100644 arch/x86/kernel/early_printk.c create mode 100644 arch/x86/kernel/ebda.c create mode 100644 arch/x86/kernel/eisa.c create mode 100644 arch/x86/kernel/espfix_64.c create mode 100644 arch/x86/kernel/fpu/Makefile create mode 100644 arch/x86/kernel/fpu/bugs.c create mode 100644 arch/x86/kernel/fpu/core.c create mode 100644 arch/x86/kernel/fpu/init.c create mode 100644 arch/x86/kernel/fpu/regset.c create mode 100644 arch/x86/kernel/fpu/signal.c create mode 100644 arch/x86/kernel/fpu/xstate.c create mode 100644 arch/x86/kernel/ftrace.c create mode 100644 arch/x86/kernel/ftrace_32.S create mode 100644 arch/x86/kernel/ftrace_64.S create mode 100644 arch/x86/kernel/head32.c create mode 100644 arch/x86/kernel/head64.c create mode 100644 arch/x86/kernel/head_32.S create mode 100644 arch/x86/kernel/head_64.S create mode 100644 arch/x86/kernel/hpet.c create mode 100644 arch/x86/kernel/hw_breakpoint.c create mode 100644 arch/x86/kernel/i8237.c create mode 100644 arch/x86/kernel/i8253.c create mode 100644 arch/x86/kernel/i8259.c create mode 100644 arch/x86/kernel/idt.c create mode 100644 arch/x86/kernel/ima_arch.c create mode 100644 arch/x86/kernel/io_delay.c create mode 100644 arch/x86/kernel/ioport.c create mode 100644 arch/x86/kernel/irq.c create mode 100644 arch/x86/kernel/irq_32.c create mode 100644 arch/x86/kernel/irq_64.c create mode 100644 arch/x86/kernel/irq_work.c create mode 100644 arch/x86/kernel/irqflags.S create mode 100644 arch/x86/kernel/irqinit.c create mode 100644 arch/x86/kernel/itmt.c create mode 100644 arch/x86/kernel/jailhouse.c create mode 100644 arch/x86/kernel/jump_label.c create mode 100644 arch/x86/kernel/kdebugfs.c create mode 100644 arch/x86/kernel/kexec-bzimage64.c create mode 100644 arch/x86/kernel/kgdb.c create mode 100644 arch/x86/kernel/kprobes/Makefile create mode 100644 arch/x86/kernel/kprobes/common.h create mode 100644 arch/x86/kernel/kprobes/core.c create mode 100644 arch/x86/kernel/kprobes/ftrace.c create mode 100644 arch/x86/kernel/kprobes/opt.c create mode 100644 arch/x86/kernel/ksysfs.c create mode 100644 arch/x86/kernel/kvm.c create mode 100644 arch/x86/kernel/kvmclock.c create mode 100644 arch/x86/kernel/ldt.c create mode 100644 arch/x86/kernel/machine_kexec_32.c create mode 100644 arch/x86/kernel/machine_kexec_64.c create mode 100644 arch/x86/kernel/mmconf-fam10h_64.c create mode 100644 arch/x86/kernel/module.c create mode 100644 arch/x86/kernel/mpparse.c create mode 100644 arch/x86/kernel/msr.c create mode 100644 arch/x86/kernel/nmi.c create mode 100644 arch/x86/kernel/nmi_selftest.c create mode 100644 arch/x86/kernel/paravirt-spinlocks.c create mode 100644 arch/x86/kernel/paravirt.c create mode 100644 arch/x86/kernel/paravirt_patch.c create mode 100644 arch/x86/kernel/pci-dma.c create mode 100644 arch/x86/kernel/pci-iommu_table.c create mode 100644 arch/x86/kernel/pci-swiotlb.c create mode 100644 arch/x86/kernel/pcspeaker.c create mode 100644 arch/x86/kernel/perf_regs.c create mode 100644 arch/x86/kernel/platform-quirks.c create mode 100644 arch/x86/kernel/pmem.c create mode 100644 arch/x86/kernel/probe_roms.c create mode 100644 arch/x86/kernel/process.c create mode 100644 arch/x86/kernel/process.h create mode 100644 arch/x86/kernel/process_32.c create mode 100644 arch/x86/kernel/process_64.c create mode 100644 arch/x86/kernel/ptrace.c create mode 100644 arch/x86/kernel/pvclock.c create mode 100644 arch/x86/kernel/quirks.c create mode 100644 arch/x86/kernel/reboot.c create mode 100644 arch/x86/kernel/reboot_fixups_32.c create mode 100644 arch/x86/kernel/relocate_kernel_32.S create mode 100644 arch/x86/kernel/relocate_kernel_64.S create mode 100644 arch/x86/kernel/resource.c create mode 100644 arch/x86/kernel/rtc.c create mode 100644 arch/x86/kernel/setup.c create mode 100644 arch/x86/kernel/setup_percpu.c create mode 100644 arch/x86/kernel/sev-es-shared.c create mode 100644 arch/x86/kernel/sev-es.c create mode 100644 arch/x86/kernel/sev_verify_cbit.S create mode 100644 arch/x86/kernel/signal.c create mode 100644 arch/x86/kernel/signal_compat.c create mode 100644 arch/x86/kernel/smp.c create mode 100644 arch/x86/kernel/smpboot.c create mode 100644 arch/x86/kernel/stacktrace.c create mode 100644 arch/x86/kernel/static_call.c create mode 100644 arch/x86/kernel/step.c create mode 100644 arch/x86/kernel/sys_ia32.c create mode 100644 arch/x86/kernel/sys_x86_64.c create mode 100644 arch/x86/kernel/sysfb.c create mode 100644 arch/x86/kernel/sysfb_efi.c create mode 100644 arch/x86/kernel/sysfb_simplefb.c create mode 100644 arch/x86/kernel/tboot.c create mode 100644 arch/x86/kernel/time.c create mode 100644 arch/x86/kernel/tls.c create mode 100644 arch/x86/kernel/tls.h create mode 100644 arch/x86/kernel/topology.c create mode 100644 arch/x86/kernel/trace_clock.c create mode 100644 arch/x86/kernel/tracepoint.c create mode 100644 arch/x86/kernel/traps.c create mode 100644 arch/x86/kernel/tsc.c create mode 100644 arch/x86/kernel/tsc_msr.c create mode 100644 arch/x86/kernel/tsc_sync.c create mode 100644 arch/x86/kernel/umip.c create mode 100644 arch/x86/kernel/unwind_frame.c create mode 100644 arch/x86/kernel/unwind_guess.c create mode 100644 arch/x86/kernel/unwind_orc.c create mode 100644 arch/x86/kernel/uprobes.c create mode 100644 arch/x86/kernel/verify_cpu.S create mode 100644 arch/x86/kernel/vm86_32.c create mode 100644 arch/x86/kernel/vmlinux.lds.S create mode 100644 arch/x86/kernel/vsmp_64.c create mode 100644 arch/x86/kernel/x86_init.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore new file mode 100644 index 000000000..ef66569e7 --- /dev/null +++ b/arch/x86/kernel/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +vsyscall.lds +vsyscall_32.lds +vmlinux.lds diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile new file mode 100644 index 000000000..c06f3a961 --- /dev/null +++ b/arch/x86/kernel/Makefile @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux kernel. +# + +extra-y := head_$(BITS).o +extra-y += head$(BITS).o +extra-y += ebda.o +extra-y += platform-quirks.o +extra-y += vmlinux.lds + +CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) + +ifdef CONFIG_FUNCTION_TRACER +# Do not profile debug and lowlevel utilities +CFLAGS_REMOVE_tsc.o = -pg +CFLAGS_REMOVE_paravirt-spinlocks.o = -pg +CFLAGS_REMOVE_pvclock.o = -pg +CFLAGS_REMOVE_kvmclock.o = -pg +CFLAGS_REMOVE_ftrace.o = -pg +CFLAGS_REMOVE_early_printk.o = -pg +CFLAGS_REMOVE_head64.o = -pg +CFLAGS_REMOVE_sev-es.o = -pg +CFLAGS_REMOVE_cc_platform.o = -pg +endif + +KASAN_SANITIZE_head$(BITS).o := n +KASAN_SANITIZE_dumpstack.o := n +KASAN_SANITIZE_dumpstack_$(BITS).o := n +KASAN_SANITIZE_stacktrace.o := n +KASAN_SANITIZE_paravirt.o := n +KASAN_SANITIZE_sev-es.o := n +KASAN_SANITIZE_cc_platform.o := n + +# With some compiler versions the generated code results in boot hangs, caused +# by several compilation units. To be safe, disable all instrumentation. +KCSAN_SANITIZE := n + +OBJECT_FILES_NON_STANDARD_test_nx.o := y +OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y + +ifdef CONFIG_FRAME_POINTER +OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y +endif + +# If instrumentation of this dir is enabled, boot hangs during first second. +# Probably could be more selective here, but note that files related to irqs, +# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to +# non-deterministic coverage. +KCOV_INSTRUMENT := n + +CFLAGS_head$(BITS).o += -fno-stack-protector +CFLAGS_cc_platform.o += -fno-stack-protector + +CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace + +obj-y := process_$(BITS).o signal.o +obj-$(CONFIG_COMPAT) += signal_compat.o +obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o +obj-y += time.o ioport.o dumpstack.o nmi.o +obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o +obj-y += setup.o x86_init.o i8259.o irqinit.o +obj-$(CONFIG_JUMP_LABEL) += jump_label.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o +obj-y += probe_roms.o +obj-$(CONFIG_X86_32) += sys_ia32.o +obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o +obj-$(CONFIG_X86_64) += sys_x86_64.o +obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o +obj-$(CONFIG_SYSFS) += ksysfs.o +obj-y += bootflag.o e820.o +obj-y += pci-dma.o quirks.o topology.o kdebugfs.o +obj-y += alternative.o i8253.o hw_breakpoint.o +obj-y += tsc.o tsc_msr.o io_delay.o rtc.o +obj-y += pci-iommu_table.o +obj-y += resource.o +obj-y += irqflags.o +obj-y += static_call.o + +obj-y += process.o +obj-y += fpu/ +obj-y += ptrace.o +obj-$(CONFIG_X86_32) += tls.o +obj-$(CONFIG_IA32_EMULATION) += tls.o +obj-y += step.o +obj-$(CONFIG_INTEL_TXT) += tboot.o +obj-$(CONFIG_ISA_DMA_API) += i8237.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-y += cpu/ +obj-y += acpi/ +obj-y += reboot.o +obj-$(CONFIG_X86_MSR) += msr.o +obj-$(CONFIG_X86_CPUID) += cpuid.o +obj-$(CONFIG_PCI) += early-quirks.o +apm-y := apm_32.o +obj-$(CONFIG_APM) += apm.o +obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smpboot.o +obj-$(CONFIG_X86_TSC) += tsc_sync.o +obj-$(CONFIG_SMP) += setup_percpu.o +obj-$(CONFIG_X86_MPPARSE) += mpparse.o +obj-y += apic/ +obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o +obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o +obj-$(CONFIG_X86_TSC) += trace_clock.o +obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o +obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o +obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o +obj-y += kprobes/ +obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_X86_32) += doublefault_32.o +obj-$(CONFIG_KGDB) += kgdb.o +obj-$(CONFIG_VM86) += vm86_32.o +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o + +obj-$(CONFIG_HPET_TIMER) += hpet.o +obj-$(CONFIG_APB_TIMER) += apb_timer.o + +obj-$(CONFIG_AMD_NB) += amd_nb.o +obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o + +obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o +obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o +obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o +obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o + +obj-$(CONFIG_JAILHOUSE_GUEST) += jailhouse.o + +obj-$(CONFIG_EISA) += eisa.o +obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o + +obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o + +obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_OF) += devicetree.o +obj-$(CONFIG_UPROBES) += uprobes.o +obj-y += sysfb.o +obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o +obj-$(CONFIG_EFI) += sysfb_efi.o + +obj-$(CONFIG_PERF_EVENTS) += perf_regs.o +obj-$(CONFIG_TRACING) += tracepoint.o +obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o +obj-$(CONFIG_X86_UMIP) += umip.o + +obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o +obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o +obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o + +obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev-es.o + +obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += cc_platform.o + +### +# 64 bit specific files +ifeq ($(CONFIG_X86_64),y) + obj-$(CONFIG_AUDIT) += audit_64.o + + obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o + + obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o + obj-y += vsmp_64.o +endif + +obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile new file mode 100644 index 000000000..f1bb57b0e --- /dev/null +++ b/arch/x86/kernel/acpi/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y + +obj-$(CONFIG_ACPI) += boot.o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o +obj-$(CONFIG_ACPI_APEI) += apei.o +obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o + +ifneq ($(CONFIG_ACPI_PROCESSOR),) +obj-y += cstate.o +endif + diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c new file mode 100644 index 000000000..c22fb55ab --- /dev/null +++ b/arch/x86/kernel/acpi/apei.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Arch-specific APEI-related functions. + */ + +#include + +#include +#include + +int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) +{ +#ifdef CONFIG_X86_MCE + int i; + struct acpi_hest_ia_corrected *cmc; + struct acpi_hest_ia_error_bank *mc_bank; + + cmc = (struct acpi_hest_ia_corrected *)hest_hdr; + if (!cmc->enabled) + return 0; + + /* + * We expect HEST to provide a list of MC banks that report errors + * in firmware first mode. Otherwise, return non-zero value to + * indicate that we are done parsing HEST. + */ + if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || + !cmc->num_hardware_banks) + return 1; + + pr_info("HEST: Enabling Firmware First mode for corrected errors.\n"); + + mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1); + for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++) + mce_disable_bank(mc_bank->bank_number); +#endif + return 1; +} + +void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) +{ +#ifdef CONFIG_X86_MCE + apei_mce_report_mem_error(sev, mem_err); +#endif +} diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c new file mode 100644 index 000000000..c91851573 --- /dev/null +++ b/arch/x86/kernel/acpi/boot.c @@ -0,0 +1,1805 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * boot.c - Architecture-Specific Low-Level ACPI Boot Support + * + * Copyright (C) 2001, 2002 Paul Diefenbaugh + * Copyright (C) 2001 Jun Nakajima + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */ +static int __initdata acpi_force = 0; +int acpi_disabled; +EXPORT_SYMBOL(acpi_disabled); + +#ifdef CONFIG_X86_64 +# include +#endif /* X86 */ + +#define PREFIX "ACPI: " + +int acpi_noirq; /* skip ACPI IRQ initialization */ +static int acpi_nobgrt; /* skip ACPI BGRT */ +int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ +EXPORT_SYMBOL(acpi_pci_disabled); + +int acpi_lapic; +int acpi_ioapic; +int acpi_strict; +int acpi_disable_cmcff; + +/* ACPI SCI override configuration */ +u8 acpi_sci_flags __initdata; +u32 acpi_sci_override_gsi __initdata = INVALID_ACPI_IRQ; +int acpi_skip_timer_override __initdata; +int acpi_use_timer_override __initdata; +int acpi_fix_pin2_polarity __initdata; + +#ifdef CONFIG_X86_LOCAL_APIC +static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; +#endif + +#ifdef CONFIG_X86_IO_APIC +/* + * Locks related to IOAPIC hotplug + * Hotplug side: + * ->device_hotplug_lock + * ->acpi_ioapic_lock + * ->ioapic_lock + * Interrupt mapping side: + * ->acpi_ioapic_lock + * ->ioapic_mutex + * ->ioapic_lock + */ +static DEFINE_MUTEX(acpi_ioapic_lock); +#endif + +/* -------------------------------------------------------------------------- + Boot-time Configuration + -------------------------------------------------------------------------- */ + +/* + * The default interrupt routing model is PIC (8259). This gets + * overridden if IOAPICs are enumerated (below). + */ +enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; + + +/* + * ISA irqs by default are the first 16 gsis but can be + * any gsi as specified by an interrupt source override. + */ +static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + +/* + * This is just a simple wrapper around early_memremap(), + * with sanity checks for phys == 0 and size == 0. + */ +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) +{ + + if (!phys || !size) + return NULL; + + return early_memremap(phys, size); +} + +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) +{ + if (!map || !size) + return; + + early_memunmap(map, size); +} + +#ifdef CONFIG_X86_LOCAL_APIC +static int __init acpi_parse_madt(struct acpi_table_header *table) +{ + struct acpi_table_madt *madt = NULL; + + if (!boot_cpu_has(X86_FEATURE_APIC)) + return -EINVAL; + + madt = (struct acpi_table_madt *)table; + if (!madt) { + printk(KERN_WARNING PREFIX "Unable to map MADT\n"); + return -ENODEV; + } + + if (madt->address) { + acpi_lapic_addr = (u64) madt->address; + + printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n", + madt->address); + } + + if (madt->flags & ACPI_MADT_PCAT_COMPAT) + legacy_pic_pcat_compat(); + + default_acpi_madt_oem_check(madt->header.oem_id, + madt->header.oem_table_id); + + return 0; +} + +/** + * acpi_register_lapic - register a local apic and generates a logic cpu number + * @id: local apic id to register + * @acpiid: ACPI id to register + * @enabled: this cpu is enabled or not + * + * Returns the logic cpu number which maps to the local apic + */ +static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) +{ + unsigned int ver = 0; + int cpu; + + if (id >= MAX_LOCAL_APIC) { + printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); + return -EINVAL; + } + + if (!enabled) { + ++disabled_cpus; + return -EINVAL; + } + + if (boot_cpu_physical_apicid != -1U) + ver = boot_cpu_apic_version; + + cpu = generic_processor_info(id, ver); + if (cpu >= 0) + early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; + + return cpu; +} + +static int __init +acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) +{ + struct acpi_madt_local_x2apic *processor = NULL; +#ifdef CONFIG_X86_X2APIC + u32 apic_id; + u8 enabled; +#endif + + processor = (struct acpi_madt_local_x2apic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + +#ifdef CONFIG_X86_X2APIC + apic_id = processor->local_apic_id; + enabled = processor->lapic_flags & ACPI_MADT_ENABLED; + + /* Ignore invalid ID */ + if (apic_id == 0xffffffff) + return 0; + + /* + * We need to register disabled CPU as well to permit + * counting disabled CPUs. This allows us to size + * cpus_possible_map more accurately, to permit + * to not preallocating memory for all NR_CPUS + * when we use CPU hotplug. + */ + if (!apic->apic_id_valid(apic_id)) { + if (enabled) + pr_warn(PREFIX "x2apic entry ignored\n"); + return 0; + } + + acpi_register_lapic(apic_id, processor->uid, enabled); +#else + printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); +#endif + + return 0; +} + +static int __init +acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) +{ + struct acpi_madt_local_apic *processor = NULL; + + processor = (struct acpi_madt_local_apic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + /* Ignore invalid ID */ + if (processor->id == 0xff) + return 0; + + /* + * We need to register disabled CPU as well to permit + * counting disabled CPUs. This allows us to size + * cpus_possible_map more accurately, to permit + * to not preallocating memory for all NR_CPUS + * when we use CPU hotplug. + */ + acpi_register_lapic(processor->id, /* APIC ID */ + processor->processor_id, /* ACPI ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); + + return 0; +} + +static int __init +acpi_parse_sapic(union acpi_subtable_headers *header, const unsigned long end) +{ + struct acpi_madt_local_sapic *processor = NULL; + + processor = (struct acpi_madt_local_sapic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */ + processor->processor_id, /* ACPI ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); + + return 0; +} + +static int __init +acpi_parse_lapic_addr_ovr(union acpi_subtable_headers * header, + const unsigned long end) +{ + struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL; + + lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header; + + if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + acpi_lapic_addr = lapic_addr_ovr->address; + + return 0; +} + +static int __init +acpi_parse_x2apic_nmi(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL; + + x2apic_nmi = (struct acpi_madt_local_x2apic_nmi *)header; + + if (BAD_MADT_ENTRY(x2apic_nmi, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + if (x2apic_nmi->lint != 1) + printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + + return 0; +} + +static int __init +acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long end) +{ + struct acpi_madt_local_apic_nmi *lapic_nmi = NULL; + + lapic_nmi = (struct acpi_madt_local_apic_nmi *)header; + + if (BAD_MADT_ENTRY(lapic_nmi, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + if (lapic_nmi->lint != 1) + printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + + return 0; +} + +#endif /*CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +#define MP_ISA_BUS 0 + +static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, + u8 trigger, u32 gsi); + +static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, + u32 gsi) +{ + /* + * Check bus_irq boundary. + */ + if (bus_irq >= NR_IRQS_LEGACY) { + pr_warn("Invalid bus_irq %u for legacy override\n", bus_irq); + return; + } + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + if (mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi) < 0) + return; + /* + * Reset default identity mapping if gsi is also an legacy IRQ, + * otherwise there will be more than one entry with the same GSI + * and acpi_isa_irq_to_gsi() may give wrong result. + */ + if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi) + isa_irq_to_gsi[gsi] = INVALID_ACPI_IRQ; + isa_irq_to_gsi[bus_irq] = gsi; +} + +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, + int polarity) +{ +#ifdef CONFIG_X86_MPPARSE + struct mpc_intsrc mp_irq; + struct pci_dev *pdev; + unsigned char number; + unsigned int devfn; + int ioapic; + u8 pin; + + if (!acpi_ioapic) + return 0; + if (!dev || !dev_is_pci(dev)) + return 0; + + pdev = to_pci_dev(dev); + number = pdev->bus->number; + devfn = pdev->devfn; + pin = pdev->pin; + /* print the entry should happen on mptable identically */ + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); + mp_irq.srcbus = number; + mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); + mp_irq.dstapic = mpc_ioapic_id(ioapic); + mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); + + mp_save_irq(&mp_irq); +#endif + return 0; +} + +static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, + u8 trigger, u32 gsi) +{ + struct mpc_intsrc mp_irq; + int ioapic, pin; + + /* Convert 'gsi' to 'ioapic.pin'(INTIN#) */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + pr_warn("Failed to find ioapic for gsi : %u\n", gsi); + return ioapic; + } + + pin = mp_find_ioapic_pin(ioapic, gsi); + + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger << 2) | polarity; + mp_irq.srcbus = MP_ISA_BUS; + mp_irq.srcbusirq = bus_irq; + mp_irq.dstapic = mpc_ioapic_id(ioapic); + mp_irq.dstirq = pin; + + mp_save_irq(&mp_irq); + + return 0; +} + +static int __init +acpi_parse_ioapic(union acpi_subtable_headers * header, const unsigned long end) +{ + struct acpi_madt_io_apic *ioapic = NULL; + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_DYNAMIC, + .ops = &mp_ioapic_irqdomain_ops, + }; + + ioapic = (struct acpi_madt_io_apic *)header; + + if (BAD_MADT_ENTRY(ioapic, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + /* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */ + if (ioapic->global_irq_base < nr_legacy_irqs()) + cfg.type = IOAPIC_DOMAIN_LEGACY; + + mp_register_ioapic(ioapic->id, ioapic->address, ioapic->global_irq_base, + &cfg); + + return 0; +} + +/* + * Parse Interrupt Source Override for the ACPI SCI + */ +static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi) +{ + if (trigger == 0) /* compatible SCI trigger is level */ + trigger = 3; + + if (polarity == 0) /* compatible SCI polarity is low */ + polarity = 3; + + /* Command-line over-ride via acpi_sci= */ + if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) + trigger = (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2; + + if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK) + polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; + + if (bus_irq < NR_IRQS_LEGACY) + mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + else + mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi); + + acpi_penalize_sci_irq(bus_irq, trigger, polarity); + + /* + * stash over-ride to indicate we've been here + * and for later update of acpi_gbl_FADT + */ + acpi_sci_override_gsi = gsi; + return; +} + +static int __init +acpi_parse_int_src_ovr(union acpi_subtable_headers * header, + const unsigned long end) +{ + struct acpi_madt_interrupt_override *intsrc = NULL; + + intsrc = (struct acpi_madt_interrupt_override *)header; + + if (BAD_MADT_ENTRY(intsrc, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) { + acpi_sci_ioapic_setup(intsrc->source_irq, + intsrc->inti_flags & ACPI_MADT_POLARITY_MASK, + (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2, + intsrc->global_irq); + return 0; + } + + if (intsrc->source_irq == 0) { + if (acpi_skip_timer_override) { + printk(PREFIX "BIOS IRQ0 override ignored.\n"); + return 0; + } + + if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity + && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { + intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; + printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); + } + } + + mp_override_legacy_irq(intsrc->source_irq, + intsrc->inti_flags & ACPI_MADT_POLARITY_MASK, + (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2, + intsrc->global_irq); + + return 0; +} + +static int __init +acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end) +{ + struct acpi_madt_nmi_source *nmi_src = NULL; + + nmi_src = (struct acpi_madt_nmi_source *)header; + + if (BAD_MADT_ENTRY(nmi_src, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + /* TBD: Support nimsrc entries? */ + + return 0; +} + +#endif /* CONFIG_X86_IO_APIC */ + +/* + * acpi_pic_sci_set_trigger() + * + * use ELCR to set PIC-mode trigger type for SCI + * + * If a PIC-mode SCI is not recognized or gives spurious IRQ7's + * it may require Edge Trigger -- use "acpi_sci=edge" + * + * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers + * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge. + * ECLR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0) + * ECLR2 is IRQs 8-15 (IRQ 8, 13 must be 0) + */ + +void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) +{ + unsigned int mask = 1 << irq; + unsigned int old, new; + + /* Real old ELCR mask */ + old = inb(0x4d0) | (inb(0x4d1) << 8); + + /* + * If we use ACPI to set PCI IRQs, then we should clear ELCR + * since we will set it correctly as we enable the PCI irq + * routing. + */ + new = acpi_noirq ? old : 0; + + /* + * Update SCI information in the ELCR, it isn't in the PCI + * routing tables.. + */ + switch (trigger) { + case 1: /* Edge - clear */ + new &= ~mask; + break; + case 3: /* Level - set */ + new |= mask; + break; + } + + if (old == new) + return; + + printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old); + outb(new, 0x4d0); + outb(new >> 8, 0x4d1); +} + +int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) +{ + int rc, irq, trigger, polarity; + + if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { + *irqp = gsi; + return 0; + } + + rc = acpi_get_override_irq(gsi, &trigger, &polarity); + if (rc) + return rc; + + trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; + polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; + irq = acpi_register_gsi(NULL, gsi, trigger, polarity); + if (irq < 0) + return irq; + + *irqp = irq; + return 0; +} +EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); + +int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) +{ + if (isa_irq < nr_legacy_irqs() && + isa_irq_to_gsi[isa_irq] != INVALID_ACPI_IRQ) { + *gsi = isa_irq_to_gsi[isa_irq]; + return 0; + } + + return -1; +} + +static int acpi_register_gsi_pic(struct device *dev, u32 gsi, + int trigger, int polarity) +{ +#ifdef CONFIG_PCI + /* + * Make sure all (legacy) PCI IRQs are set as level-triggered. + */ + if (trigger == ACPI_LEVEL_SENSITIVE) + elcr_set_level_irq(gsi); +#endif + + return gsi; +} + +#ifdef CONFIG_X86_LOCAL_APIC +static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, + int trigger, int polarity) +{ + int irq = gsi; +#ifdef CONFIG_X86_IO_APIC + int node; + struct irq_alloc_info info; + + node = dev ? dev_to_node(dev) : NUMA_NO_NODE; + trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; + polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; + ioapic_set_alloc_attr(&info, node, trigger, polarity); + + mutex_lock(&acpi_ioapic_lock); + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); + /* Don't set up the ACPI SCI because it's already set up */ + if (irq >= 0 && enable_update_mptable && gsi != acpi_gbl_FADT.sci_interrupt) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); + mutex_unlock(&acpi_ioapic_lock); +#endif + + return irq; +} + +static void acpi_unregister_gsi_ioapic(u32 gsi) +{ +#ifdef CONFIG_X86_IO_APIC + int irq; + + mutex_lock(&acpi_ioapic_lock); + irq = mp_map_gsi_to_irq(gsi, 0, NULL); + if (irq > 0) + mp_unmap_irq(irq); + mutex_unlock(&acpi_ioapic_lock); +#endif +} +#endif + +int (*__acpi_register_gsi)(struct device *dev, u32 gsi, + int trigger, int polarity) = acpi_register_gsi_pic; +void (*__acpi_unregister_gsi)(u32 gsi) = NULL; + +#ifdef CONFIG_ACPI_SLEEP +int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel; +#else +int (*acpi_suspend_lowlevel)(void); +#endif + +/* + * success: return IRQ number (>=0) + * failure: return < 0 + */ +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +{ + return __acpi_register_gsi(dev, gsi, trigger, polarity); +} +EXPORT_SYMBOL_GPL(acpi_register_gsi); + +void acpi_unregister_gsi(u32 gsi) +{ + if (__acpi_unregister_gsi) + __acpi_unregister_gsi(gsi); +} +EXPORT_SYMBOL_GPL(acpi_unregister_gsi); + +#ifdef CONFIG_X86_LOCAL_APIC +static void __init acpi_set_irq_model_ioapic(void) +{ + acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; + __acpi_register_gsi = acpi_register_gsi_ioapic; + __acpi_unregister_gsi = acpi_unregister_gsi_ioapic; + acpi_ioapic = 1; +} +#endif + +/* + * ACPI based hotplug support for CPU + */ +#ifdef CONFIG_ACPI_HOTPLUG_CPU +#include + +static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +{ +#ifdef CONFIG_ACPI_NUMA + int nid; + + nid = acpi_get_node(handle); + if (nid != NUMA_NO_NODE) { + set_apicid_to_node(physid, nid); + numa_set_node(cpu, nid); + } +#endif + return 0; +} + +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, + int *pcpu) +{ + int cpu; + + cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED); + if (cpu < 0) { + pr_info(PREFIX "Unable to map lapic to logical cpu number\n"); + return cpu; + } + + acpi_processor_set_pdc(handle); + acpi_map_cpu2node(handle, cpu, physid); + + *pcpu = cpu; + return 0; +} +EXPORT_SYMBOL(acpi_map_cpu); + +int acpi_unmap_cpu(int cpu) +{ +#ifdef CONFIG_ACPI_NUMA + set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); +#endif + + per_cpu(x86_cpu_to_apicid, cpu) = -1; + set_cpu_present(cpu, false); + num_processors--; + + return (0); +} +EXPORT_SYMBOL(acpi_unmap_cpu); +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ + +int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) +{ + int ret = -ENOSYS; +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC + int ioapic_id; + u64 addr; + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_DYNAMIC, + .ops = &mp_ioapic_irqdomain_ops, + }; + + ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr); + if (ioapic_id < 0) { + unsigned long long uid; + acpi_status status; + + status = acpi_evaluate_integer(handle, METHOD_NAME__UID, + NULL, &uid); + if (ACPI_FAILURE(status)) { + acpi_handle_warn(handle, "failed to get IOAPIC ID.\n"); + return -EINVAL; + } + ioapic_id = (int)uid; + } + + mutex_lock(&acpi_ioapic_lock); + ret = mp_register_ioapic(ioapic_id, phys_addr, gsi_base, &cfg); + mutex_unlock(&acpi_ioapic_lock); +#endif + + return ret; +} +EXPORT_SYMBOL(acpi_register_ioapic); + +int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) +{ + int ret = -ENOSYS; + +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC + mutex_lock(&acpi_ioapic_lock); + ret = mp_unregister_ioapic(gsi_base); + mutex_unlock(&acpi_ioapic_lock); +#endif + + return ret; +} +EXPORT_SYMBOL(acpi_unregister_ioapic); + +/** + * acpi_ioapic_registered - Check whether IOAPIC assoicatied with @gsi_base + * has been registered + * @handle: ACPI handle of the IOAPIC device + * @gsi_base: GSI base associated with the IOAPIC + * + * Assume caller holds some type of lock to serialize acpi_ioapic_registered() + * with acpi_register_ioapic()/acpi_unregister_ioapic(). + */ +int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base) +{ + int ret = 0; + +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC + mutex_lock(&acpi_ioapic_lock); + ret = mp_ioapic_registered(gsi_base); + mutex_unlock(&acpi_ioapic_lock); +#endif + + return ret; +} + +static int __init acpi_parse_sbf(struct acpi_table_header *table) +{ + struct acpi_table_boot *sb = (struct acpi_table_boot *)table; + + sbf_port = sb->cmos_index; /* Save CMOS port */ + + return 0; +} + +#ifdef CONFIG_HPET_TIMER +#include + +static struct resource *hpet_res __initdata; + +static int __init acpi_parse_hpet(struct acpi_table_header *table) +{ + struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table; + + if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) { + printk(KERN_WARNING PREFIX "HPET timers must be located in " + "memory.\n"); + return -1; + } + + hpet_address = hpet_tbl->address.address; + hpet_blockid = hpet_tbl->sequence; + + /* + * Some broken BIOSes advertise HPET at 0x0. We really do not + * want to allocate a resource there. + */ + if (!hpet_address) { + printk(KERN_WARNING PREFIX + "HPET id: %#x base: %#lx is invalid\n", + hpet_tbl->id, hpet_address); + return 0; + } +#ifdef CONFIG_X86_64 + /* + * Some even more broken BIOSes advertise HPET at + * 0xfed0000000000000 instead of 0xfed00000. Fix it up and add + * some noise: + */ + if (hpet_address == 0xfed0000000000000UL) { + if (!hpet_force_user) { + printk(KERN_WARNING PREFIX "HPET id: %#x " + "base: 0xfed0000000000000 is bogus\n " + "try hpet=force on the kernel command line to " + "fix it up to 0xfed00000.\n", hpet_tbl->id); + hpet_address = 0; + return 0; + } + printk(KERN_WARNING PREFIX + "HPET id: %#x base: 0xfed0000000000000 fixed up " + "to 0xfed00000.\n", hpet_tbl->id); + hpet_address >>= 32; + } +#endif + printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", + hpet_tbl->id, hpet_address); + + /* + * Allocate and initialize the HPET firmware resource for adding into + * the resource tree during the lateinit timeframe. + */ +#define HPET_RESOURCE_NAME_SIZE 9 + hpet_res = memblock_alloc(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE, + SMP_CACHE_BYTES); + if (!hpet_res) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); + + hpet_res->name = (void *)&hpet_res[1]; + hpet_res->flags = IORESOURCE_MEM; + snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u", + hpet_tbl->sequence); + + hpet_res->start = hpet_address; + hpet_res->end = hpet_address + (1 * 1024) - 1; + + return 0; +} + +/* + * hpet_insert_resource inserts the HPET resources used into the resource + * tree. + */ +static __init int hpet_insert_resource(void) +{ + if (!hpet_res) + return 1; + + return insert_resource(&iomem_resource, hpet_res); +} + +late_initcall(hpet_insert_resource); + +#else +#define acpi_parse_hpet NULL +#endif + +static int __init acpi_parse_fadt(struct acpi_table_header *table) +{ + if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) { + pr_debug("ACPI: no legacy devices present\n"); + x86_platform.legacy.devices.pnpbios = 0; + } + + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && + !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) && + x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) { + pr_debug("ACPI: i8042 controller is absent\n"); + x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT; + } + + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { + pr_debug("ACPI: not registering RTC platform device\n"); + x86_platform.legacy.rtc = 0; + } + + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) { + pr_debug("ACPI: probing for VGA not safe\n"); + x86_platform.legacy.no_vga = 1; + } + +#ifdef CONFIG_X86_PM_TIMER + /* detect the location of the ACPI PM Timer */ + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) { + /* FADT rev. 2 */ + if (acpi_gbl_FADT.xpm_timer_block.space_id != + ACPI_ADR_SPACE_SYSTEM_IO) + return 0; + + pmtmr_ioport = acpi_gbl_FADT.xpm_timer_block.address; + /* + * "X" fields are optional extensions to the original V1.0 + * fields, so we must selectively expand V1.0 fields if the + * corresponding X field is zero. + */ + if (!pmtmr_ioport) + pmtmr_ioport = acpi_gbl_FADT.pm_timer_block; + } else { + /* FADT rev. 1 */ + pmtmr_ioport = acpi_gbl_FADT.pm_timer_block; + } + if (pmtmr_ioport) + printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", + pmtmr_ioport); +#endif + return 0; +} + +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Parse LAPIC entries in MADT + * returns 0 on success, < 0 on error + */ + +static int __init early_acpi_parse_madt_lapic_addr_ovr(void) +{ + int count; + + if (!boot_cpu_has(X86_FEATURE_APIC)) + return -ENODEV; + + /* + * Note that the LAPIC address is obtained from the MADT (32-bit value) + * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value). + */ + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, + acpi_parse_lapic_addr_ovr, 0); + if (count < 0) { + printk(KERN_ERR PREFIX + "Error parsing LAPIC address override entry\n"); + return count; + } + + register_lapic_address(acpi_lapic_addr); + + return count; +} + +static int __init acpi_parse_madt_lapic_entries(void) +{ + int count; + int x2count = 0; + int ret; + struct acpi_subtable_proc madt_proc[2]; + + if (!boot_cpu_has(X86_FEATURE_APIC)) + return -ENODEV; + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, + acpi_parse_sapic, MAX_LOCAL_APIC); + + if (!count) { + memset(madt_proc, 0, sizeof(madt_proc)); + madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; + madt_proc[0].handler = acpi_parse_lapic; + madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; + madt_proc[1].handler = acpi_parse_x2apic; + ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, + sizeof(struct acpi_table_madt), + madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); + if (ret < 0) { + printk(KERN_ERR PREFIX + "Error parsing LAPIC/X2APIC entries\n"); + return ret; + } + + count = madt_proc[0].count; + x2count = madt_proc[1].count; + } + if (!count && !x2count) { + printk(KERN_ERR PREFIX "No LAPIC entries present\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return -ENODEV; + } else if (count < 0 || x2count < 0) { + printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI, + acpi_parse_x2apic_nmi, 0); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, + acpi_parse_lapic_nmi, 0); + if (count < 0 || x2count < 0) { + printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + return 0; +} +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +static void __init mp_config_acpi_legacy_irqs(void) +{ + int i; + struct mpc_intsrc mp_irq; + +#ifdef CONFIG_EISA + /* + * Fabricate the legacy ISA bus (bus #31). + */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; +#endif + set_bit(MP_ISA_BUS, mp_bus_not_pci); + pr_debug("Bus #%d is ISA (nIRQs: %d)\n", MP_ISA_BUS, nr_legacy_irqs()); + + /* + * Use the default configuration for the IRQs 0-15. Unless + * overridden by (MADT) interrupt source override entries. + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + int ioapic, pin; + unsigned int dstapic; + int idx; + u32 gsi; + + /* Locate the gsi that irq i maps to. */ + if (acpi_isa_irq_to_gsi(i, &gsi)) + continue; + + /* + * Locate the IOAPIC that manages the ISA IRQ. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + continue; + pin = mp_find_ioapic_pin(ioapic, gsi); + dstapic = mpc_ioapic_id(ioapic); + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ + if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if (irq->dstapic == dstapic && irq->dstirq == pin) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + mp_irq.type = MP_INTSRC; + mp_irq.irqflag = 0; /* Conforming */ + mp_irq.srcbus = MP_ISA_BUS; + mp_irq.dstapic = dstapic; + mp_irq.irqtype = mp_INT; + mp_irq.srcbusirq = i; /* Identity mapped */ + mp_irq.dstirq = pin; + + mp_save_irq(&mp_irq); + } +} + +/* + * Parse IOAPIC related entries in MADT + * returns 0 on success, < 0 on error + */ +static int __init acpi_parse_madt_ioapic_entries(void) +{ + int count; + + /* + * ACPI interpreter is required to complete interrupt setup, + * so if it is off, don't enumerate the io-apics with ACPI. + * If MPS is present, it will handle them, + * otherwise the system will stay in PIC mode + */ + if (acpi_disabled || acpi_noirq) + return -ENODEV; + + if (!boot_cpu_has(X86_FEATURE_APIC)) + return -ENODEV; + + /* + * if "noapic" boot option, don't look for IO-APICs + */ + if (skip_ioapic_setup) { + printk(KERN_INFO PREFIX "Skipping IOAPIC probe " + "due to 'noapic' option.\n"); + return -ENODEV; + } + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic, + MAX_IO_APICS); + if (!count) { + printk(KERN_ERR PREFIX "No IOAPIC entries present\n"); + return -ENODEV; + } else if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n"); + return count; + } + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, + acpi_parse_int_src_ovr, nr_irqs); + if (count < 0) { + printk(KERN_ERR PREFIX + "Error parsing interrupt source overrides entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + /* + * If BIOS did not supply an INT_SRC_OVR for the SCI + * pretend we got one so we can set the SCI flags. + * But ignore setting up SCI on hardware reduced platforms. + */ + if (acpi_sci_override_gsi == INVALID_ACPI_IRQ && !acpi_gbl_reduced_hardware) + acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0, + acpi_gbl_FADT.sci_interrupt); + + /* Fill in identity legacy mappings where no override */ + mp_config_acpi_legacy_irqs(); + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, + acpi_parse_nmi_src, nr_irqs); + if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + return 0; +} +#else +static inline int acpi_parse_madt_ioapic_entries(void) +{ + return -1; +} +#endif /* !CONFIG_X86_IO_APIC */ + +static void __init early_acpi_process_madt(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int error; + + if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) { + + /* + * Parse MADT LAPIC entries + */ + error = early_acpi_parse_madt_lapic_addr_ovr(); + if (!error) { + acpi_lapic = 1; + smp_found_config = 1; + } + if (error == -EINVAL) { + /* + * Dell Precision Workstation 410, 610 come here. + */ + printk(KERN_ERR PREFIX + "Invalid BIOS MADT, disabling ACPI\n"); + disable_acpi(); + } + } +#endif +} + +static void __init acpi_process_madt(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int error; + + if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) { + + /* + * Parse MADT LAPIC entries + */ + error = acpi_parse_madt_lapic_entries(); + if (!error) { + acpi_lapic = 1; + + /* + * Parse MADT IO-APIC entries + */ + mutex_lock(&acpi_ioapic_lock); + error = acpi_parse_madt_ioapic_entries(); + mutex_unlock(&acpi_ioapic_lock); + if (!error) { + acpi_set_irq_model_ioapic(); + + smp_found_config = 1; + } + } + if (error == -EINVAL) { + /* + * Dell Precision Workstation 410, 610 come here. + */ + printk(KERN_ERR PREFIX + "Invalid BIOS MADT, disabling ACPI\n"); + disable_acpi(); + } + } else { + /* + * ACPI found no MADT, and so ACPI wants UP PIC mode. + * In the event an MPS table was found, forget it. + * Boot with "acpi=off" to use MPS on such a system. + */ + if (smp_found_config) { + printk(KERN_WARNING PREFIX + "No APIC-table, disabling MPS\n"); + smp_found_config = 0; + } + } + + /* + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " + "information\n"); + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) " + "configuration information\n"); +#endif + return; +} + +static int __init disable_acpi_irq(const struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n", + d->ident); + acpi_noirq_set(); + } + return 0; +} + +static int __init disable_acpi_pci(const struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n", + d->ident); + acpi_disable_pci(); + } + return 0; +} + +static int __init disable_acpi_xsdt(const struct dmi_system_id *d) +{ + if (!acpi_force) { + pr_notice("%s detected: force use of acpi=rsdt\n", d->ident); + acpi_gbl_do_not_use_xsdt = TRUE; + } else { + pr_notice("Warning: DMI blacklist says broken, but acpi XSDT forced\n"); + } + return 0; +} + +static int __init dmi_disable_acpi(const struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: acpi off\n", d->ident); + disable_acpi(); + } else { + printk(KERN_NOTICE + "Warning: DMI blacklist says broken, but acpi forced\n"); + } + return 0; +} + +/* + * Force ignoring BIOS IRQ0 override + */ +static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) +{ + if (!acpi_skip_timer_override) { + pr_notice("%s detected: Ignoring BIOS IRQ0 override\n", + d->ident); + acpi_skip_timer_override = 1; + } + return 0; +} + +/* + * ACPI offers an alternative platform interface model that removes + * ACPI hardware requirements for platforms that do not implement + * the PC Architecture. + * + * We initialize the Hardware-reduced ACPI model here: + */ +void __init acpi_generic_reduced_hw_init(void) +{ + /* + * Override x86_init functions and bypass legacy PIC in + * hardware reduced ACPI mode. + */ + x86_init.timers.timer_init = x86_init_noop; + x86_init.irqs.pre_vector_init = x86_init_noop; + legacy_pic = &null_legacy_pic; +} + +static void __init acpi_reduced_hw_init(void) +{ + if (acpi_gbl_reduced_hardware) + x86_init.acpi.reduced_hw_early_init(); +} + +/* + * If your system is blacklisted here, but you find that acpi=force + * works for you, please contact linux-acpi@vger.kernel.org + */ +static const struct dmi_system_id acpi_dmi_table[] __initconst = { + /* + * Boxes that need ACPI disabled + */ + { + .callback = dmi_disable_acpi, + .ident = "IBM Thinkpad", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2629H1G"), + }, + }, + + /* + * Boxes that need ACPI PCI IRQ routing disabled + */ + { + .callback = disable_acpi_irq, + .ident = "ASUS A7V", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"), + DMI_MATCH(DMI_BOARD_NAME, ""), + /* newer BIOS, Revision 1011, does work */ + DMI_MATCH(DMI_BIOS_VERSION, + "ASUS A7V ACPI BIOS Revision 1007"), + }, + }, + { + /* + * Latest BIOS for IBM 600E (1.16) has bad pcinum + * for LPC bridge, which is needed for the PCI + * interrupt links to work. DSDT fix is in bug 5966. + * 2645, 2646 model numbers are shared with 600/600E/600X + */ + .callback = disable_acpi_irq, + .ident = "IBM Thinkpad 600 Series 2645", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2645"), + }, + }, + { + .callback = disable_acpi_irq, + .ident = "IBM Thinkpad 600 Series 2646", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2646"), + }, + }, + /* + * Boxes that need ACPI PCI IRQ routing and PCI scan disabled + */ + { /* _BBN 0 bug */ + .callback = disable_acpi_pci, + .ident = "ASUS PR-DLS", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"), + DMI_MATCH(DMI_BIOS_VERSION, + "ASUS PR-DLS ACPI BIOS Revision 1010"), + DMI_MATCH(DMI_BIOS_DATE, "03/21/2003") + }, + }, + { + .callback = disable_acpi_pci, + .ident = "Acer TravelMate 36x Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), + }, + }, + /* + * Boxes that need ACPI XSDT use disabled due to corrupted tables + */ + { + .callback = disable_acpi_xsdt, + .ident = "Advantech DAC-BJ01", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "NEC"), + DMI_MATCH(DMI_PRODUCT_NAME, "Bearlake CRB Board"), + DMI_MATCH(DMI_BIOS_VERSION, "V1.12"), + DMI_MATCH(DMI_BIOS_DATE, "02/01/2011"), + }, + }, + {} +}; + +/* second table for DMI checks that should run after early-quirks */ +static const struct dmi_system_id acpi_dmi_table_late[] __initconst = { + /* + * HP laptops which use a DSDT reporting as HP/SB400/10000, + * which includes some code which overrides all temperature + * trip points to 16C if the INTIN2 input of the I/O APIC + * is enabled. This input is incorrectly designated the + * ISA IRQ 0 via an interrupt source override even though + * it is wired to the output of the master 8259A and INTIN0 + * is not connected at all. Force ignoring BIOS IRQ0 + * override in that cases. + */ + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP nx6115 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP NX6125 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP NX6325 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP 6715b laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "FUJITSU SIEMENS", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), + DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"), + }, + }, + {} +}; + +/* + * acpi_boot_table_init() and acpi_boot_init() + * called from setup_arch(), always. + * 1. checksums all tables + * 2. enumerates lapics + * 3. enumerates io-apics + * + * acpi_table_init() is separate to allow reading SRAT without + * other side effects. + * + * side effects of acpi_boot_init: + * acpi_lapic = 1 if LAPIC found + * acpi_ioapic = 1 if IOAPIC found + * if (acpi_lapic && acpi_ioapic) smp_found_config = 1; + * if acpi_blacklisted() acpi_disabled = 1; + * acpi_irq_model=... + * ... + */ + +void __init acpi_boot_table_init(void) +{ + dmi_check_system(acpi_dmi_table); + + /* + * If acpi_disabled, bail out + */ + if (acpi_disabled) + return; + + /* + * Initialize the ACPI boot-time table parser. + */ + if (acpi_locate_initial_tables()) + disable_acpi(); + else + acpi_reserve_initial_tables(); +} + +int __init early_acpi_boot_init(void) +{ + if (acpi_disabled) + return 1; + + acpi_table_init_complete(); + + acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); + + /* + * blacklist may disable ACPI entirely + */ + if (acpi_blacklisted()) { + if (acpi_force) { + printk(KERN_WARNING PREFIX "acpi=force override\n"); + } else { + printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); + disable_acpi(); + return 1; + } + } + + /* + * Process the Multiple APIC Description Table (MADT), if present + */ + early_acpi_process_madt(); + + /* + * Hardware-reduced ACPI mode initialization: + */ + acpi_reduced_hw_init(); + + return 0; +} + +int __init acpi_boot_init(void) +{ + /* those are executed after early-quirks are executed */ + dmi_check_system(acpi_dmi_table_late); + + /* + * If acpi_disabled, bail out + */ + if (acpi_disabled) + return 1; + + acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); + + /* + * set sci_int and PM timer address + */ + acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt); + + /* + * Process the Multiple APIC Description Table (MADT), if present + */ + acpi_process_madt(); + + acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); + if (IS_ENABLED(CONFIG_ACPI_BGRT) && !acpi_nobgrt) + acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); + + if (!acpi_noirq) + x86_init.pci.init = pci_acpi_init; + + /* Do not enable ACPI SPCR console by default */ + acpi_parse_spcr(earlycon_acpi_spcr_enable, false); + return 0; +} + +static int __init parse_acpi(char *arg) +{ + if (!arg) + return -EINVAL; + + /* "acpi=off" disables both ACPI table parsing and interpreter */ + if (strcmp(arg, "off") == 0) { + disable_acpi(); + } + /* acpi=force to over-ride black-list */ + else if (strcmp(arg, "force") == 0) { + acpi_force = 1; + acpi_disabled = 0; + } + /* acpi=strict disables out-of-spec workarounds */ + else if (strcmp(arg, "strict") == 0) { + acpi_strict = 1; + } + /* acpi=rsdt use RSDT instead of XSDT */ + else if (strcmp(arg, "rsdt") == 0) { + acpi_gbl_do_not_use_xsdt = TRUE; + } + /* "acpi=noirq" disables ACPI interrupt routing */ + else if (strcmp(arg, "noirq") == 0) { + acpi_noirq_set(); + } + /* "acpi=copy_dsdt" copys DSDT */ + else if (strcmp(arg, "copy_dsdt") == 0) { + acpi_gbl_copy_dsdt_locally = 1; + } + /* "acpi=nocmcff" disables FF mode for corrected errors */ + else if (strcmp(arg, "nocmcff") == 0) { + acpi_disable_cmcff = 1; + } else { + /* Core will printk when we return error. */ + return -EINVAL; + } + return 0; +} +early_param("acpi", parse_acpi); + +static int __init parse_acpi_bgrt(char *arg) +{ + acpi_nobgrt = true; + return 0; +} +early_param("bgrt_disable", parse_acpi_bgrt); + +/* FIXME: Using pci= for an ACPI parameter is a travesty. */ +static int __init parse_pci(char *arg) +{ + if (arg && strcmp(arg, "noacpi") == 0) + acpi_disable_pci(); + return 0; +} +early_param("pci", parse_pci); + +int __init acpi_mps_check(void) +{ +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) +/* mptable code is not built-in*/ + if (acpi_disabled || acpi_noirq) { + printk(KERN_WARNING "MPS support code is not built-in.\n" + "Using acpi=off or acpi=noirq or pci=noacpi " + "may have problem\n"); + return 1; + } +#endif + return 0; +} + +#ifdef CONFIG_X86_IO_APIC +static int __init parse_acpi_skip_timer_override(char *arg) +{ + acpi_skip_timer_override = 1; + return 0; +} +early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override); + +static int __init parse_acpi_use_timer_override(char *arg) +{ + acpi_use_timer_override = 1; + return 0; +} +early_param("acpi_use_timer_override", parse_acpi_use_timer_override); +#endif /* CONFIG_X86_IO_APIC */ + +static int __init setup_acpi_sci(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "edge")) + acpi_sci_flags = ACPI_MADT_TRIGGER_EDGE | + (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK); + else if (!strcmp(s, "level")) + acpi_sci_flags = ACPI_MADT_TRIGGER_LEVEL | + (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK); + else if (!strcmp(s, "high")) + acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_HIGH | + (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK); + else if (!strcmp(s, "low")) + acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_LOW | + (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK); + else + return -EINVAL; + return 0; +} +early_param("acpi_sci", setup_acpi_sci); + +int __acpi_acquire_global_lock(unsigned int *lock) +{ + unsigned int old, new, val; + do { + old = *lock; + new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1)); + val = cmpxchg(lock, old, new); + } while (unlikely (val != old)); + return ((new & 0x3) < 3) ? -1 : 0; +} + +int __acpi_release_global_lock(unsigned int *lock) +{ + unsigned int old, new, val; + do { + old = *lock; + new = old & ~0x3; + val = cmpxchg(lock, old, new); + } while (unlikely (val != old)); + return old & 0x1; +} + +void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) +{ + e820__range_add(addr, size, E820_TYPE_ACPI); + e820__update_table_print(); +} + +void x86_default_set_root_pointer(u64 addr) +{ + boot_params.acpi_rsdp_addr = addr; +} + +u64 x86_default_get_root_pointer(void) +{ + return boot_params.acpi_rsdp_addr; +} diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c new file mode 100644 index 000000000..b961de569 --- /dev/null +++ b/arch/x86/kernel/acpi/cppc_msr.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * cppc_msr.c: MSR Interface for CPPC + * Copyright (c) 2016, Intel Corporation. + */ + +#include +#include + +/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ + +bool cpc_ffh_supported(void) +{ + return true; +} + +int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) +{ + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + *val &= mask; + *val >>= reg->bit_offset; + } + return err; +} + +int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + u64 rd_val; + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + val <<= reg->bit_offset; + val &= mask; + rd_val &= ~mask; + rd_val |= val; + err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val); + } + return err; +} diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c new file mode 100644 index 000000000..d28d43d77 --- /dev/null +++ b/arch/x86/kernel/acpi/cstate.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2005 Intel Corporation + * Venkatesh Pallipadi + * - Added _PDC for SMP C-states on Intel CPUs + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Initialize bm_flags based on the CPU cache properties + * On SMP it depends on cache configuration + * - When cache is not shared among all CPUs, we flush cache + * before entering C3. + * - When cache is shared among all CPUs, we use bm_check + * mechanism as in UP case + * + * This routine is called only after all the CPUs are online + */ +void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, + unsigned int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + + flags->bm_check = 0; + if (num_online_cpus() == 1) + flags->bm_check = 1; + else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Today all MP CPUs that support C3 share cache. + * And caches should not be flushed by software while + * entering C3 type state. + */ + flags->bm_check = 1; + } + + /* + * On all recent Intel platforms, ARB_DISABLE is a nop. + * So, set bm_control to zero to indicate that ARB_DISABLE + * is not required while entering C3 type state on + * P4, Core and beyond CPUs + */ + if (c->x86_vendor == X86_VENDOR_INTEL && + (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) + flags->bm_control = 0; + /* + * For all recent Centaur CPUs, the ucode will make sure that each + * core can keep cache coherence with each other while entering C3 + * type state. So, set bm_check to 1 to indicate that the kernel + * doesn't need to execute a cache flush operation (WBINVD) when + * entering C3 type state. + */ + if (c->x86_vendor == X86_VENDOR_CENTAUR) { + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f && + c->x86_stepping >= 0x0e)) + flags->bm_check = 1; + } + + if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { + /* + * All Zhaoxin CPUs that support C3 share cache. + * And caches should not be flushed by software while + * entering C3 type state. + */ + flags->bm_check = 1; + /* + * On all recent Zhaoxin platforms, ARB_DISABLE is a nop. + * So, set bm_control to zero to indicate that ARB_DISABLE + * is not required while entering C3 type state. + */ + flags->bm_control = 0; + } + if (c->x86_vendor == X86_VENDOR_AMD && c->x86 >= 0x17) { + /* + * For all AMD Zen or newer CPUs that support C3, caches + * should not be flushed by software while entering C3 + * type state. Set bm->check to 1 so that kernel doesn't + * need to execute cache flush operation. + */ + flags->bm_check = 1; + /* + * In current AMD C state implementation ARB_DIS is no longer + * used. So set bm_control to zero to indicate ARB_DIS is not + * required while entering C3 type state. + */ + flags->bm_control = 0; + } +} +EXPORT_SYMBOL(acpi_processor_power_init_bm_check); + +/* The code below handles cstate entry with monitor-mwait pair on Intel*/ + +struct cstate_entry { + struct { + unsigned int eax; + unsigned int ecx; + } states[ACPI_PROCESSOR_MAX_POWER]; +}; +static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */ + +static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; + +#define NATIVE_CSTATE_BEYOND_HALT (2) + +static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) +{ + struct acpi_processor_cx *cx = _cx; + long retval; + unsigned int eax, ebx, ecx, edx; + unsigned int edx_part; + unsigned int cstate_type; /* C-state type and not ACPI C-state type */ + unsigned int num_cstate_subtype; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + /* Check whether this particular cx_type (in CST) is supported or not */ + cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) & + MWAIT_CSTATE_MASK) + 1; + edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE); + num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; + + retval = 0; + /* If the HW does not support any sub-states in this C-state */ + if (num_cstate_subtype == 0) { + pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", + cx->address, edx_part); + retval = -1; + goto out; + } + + /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || + !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) { + retval = -1; + goto out; + } + + if (!mwait_supported[cstate_type]) { + mwait_supported[cstate_type] = 1; + printk(KERN_DEBUG + "Monitor-Mwait will be used to enter C-%d state\n", + cx->type); + } + snprintf(cx->desc, + ACPI_CX_DESC_LEN, "ACPI FFH MWAIT 0x%x", + cx->address); +out: + return retval; +} + +int acpi_processor_ffh_cstate_probe(unsigned int cpu, + struct acpi_processor_cx *cx, struct acpi_power_register *reg) +{ + struct cstate_entry *percpu_entry; + struct cpuinfo_x86 *c = &cpu_data(cpu); + long retval; + + if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF) + return -1; + + if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT) + return -1; + + percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); + percpu_entry->states[cx->index].eax = 0; + percpu_entry->states[cx->index].ecx = 0; + + /* Make sure we are running on right CPU */ + + retval = call_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx, + false); + if (retval == 0) { + /* Use the hint in CST */ + percpu_entry->states[cx->index].eax = cx->address; + percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK; + } + + /* + * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared, + * then we should skip checking BM_STS for this C-state. + * ref: "Intel Processor Vendor-Specific ACPI Interface Specification" + */ + if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2)) + cx->bm_sts_skip = 1; + + return retval; +} +EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); + +void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) +{ + unsigned int cpu = smp_processor_id(); + struct cstate_entry *percpu_entry; + + percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); + mwait_idle_with_hints(percpu_entry->states[cx->index].eax, + percpu_entry->states[cx->index].ecx); +} +EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter); + +static int __init ffh_cstate_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor != X86_VENDOR_INTEL && + c->x86_vendor != X86_VENDOR_AMD) + return -1; + + cpu_cstate_entry = alloc_percpu(struct cstate_entry); + return 0; +} + +static void __exit ffh_cstate_exit(void) +{ + free_percpu(cpu_cstate_entry); + cpu_cstate_entry = NULL; +} + +arch_initcall(ffh_cstate_init); +__exitcall(ffh_cstate_exit); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c new file mode 100644 index 000000000..cc1fea76a --- /dev/null +++ b/arch/x86/kernel/acpi/sleep.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * sleep.c - x86-specific ACPI sleep support. + * + * Copyright (C) 2001-2003 Patrick Mochel + * Copyright (C) 2001-2003 Pavel Machek + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "../../realmode/rm/wakeup.h" +#include "sleep.h" + +unsigned long acpi_realmode_flags; + +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) +static char temp_stack[4096]; +#endif + +/** + * acpi_get_wakeup_address - provide physical address for S3 wakeup + * + * Returns the physical address where the kernel should be resumed after the + * system awakes from S3, e.g. for programming into the firmware waking vector. + */ +unsigned long acpi_get_wakeup_address(void) +{ + return ((unsigned long)(real_mode_header->wakeup_start)); +} + +/** + * x86_acpi_enter_sleep_state - enter sleep state + * @state: Sleep state to enter. + * + * Wrapper around acpi_enter_sleep_state() to be called by assmebly. + */ +asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state) +{ + return acpi_enter_sleep_state(state); +} + +/** + * x86_acpi_suspend_lowlevel - save kernel state + * + * Create an identity mapped page table and copy the wakeup routine to + * low memory. + */ +int x86_acpi_suspend_lowlevel(void) +{ + struct wakeup_header *header = + (struct wakeup_header *) __va(real_mode_header->wakeup_header); + + if (header->signature != WAKEUP_HEADER_SIGNATURE) { + printk(KERN_ERR "wakeup header does not match\n"); + return -EINVAL; + } + + header->video_mode = saved_video_mode; + + header->pmode_behavior = 0; + +#ifndef CONFIG_64BIT + native_store_gdt((struct desc_ptr *)&header->pmode_gdt); + + /* + * We have to check that we can write back the value, and not + * just read it. At least on 90 nm Pentium M (Family 6, Model + * 13), reading an invalid MSR is not guaranteed to trap, see + * Erratum X4 in "Intel Pentium M Processor on 90 nm Process + * with 2-MB L2 Cache and Intel® Processor A100 and A110 on 90 + * nm process with 512-KB L2 Cache Specification Update". + */ + if (!rdmsr_safe(MSR_EFER, + &header->pmode_efer_low, + &header->pmode_efer_high) && + !wrmsr_safe(MSR_EFER, + header->pmode_efer_low, + header->pmode_efer_high)) + header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER); +#endif /* !CONFIG_64BIT */ + + header->pmode_cr0 = read_cr0(); + if (__this_cpu_read(cpu_info.cpuid_level) >= 0) { + header->pmode_cr4 = __read_cr4(); + header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4); + } + if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, + &header->pmode_misc_en_low, + &header->pmode_misc_en_high) && + !wrmsr_safe(MSR_IA32_MISC_ENABLE, + header->pmode_misc_en_low, + header->pmode_misc_en_high)) + header->pmode_behavior |= + (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE); + header->realmode_flags = acpi_realmode_flags; + header->real_magic = 0x12345678; + +#ifndef CONFIG_64BIT + header->pmode_entry = (u32)&wakeup_pmode_return; + header->pmode_cr3 = (u32)__pa_symbol(initial_page_table); + saved_magic = 0x12345678; +#else /* CONFIG_64BIT */ +#ifdef CONFIG_SMP + initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); + early_gdt_descr.address = + (unsigned long)get_cpu_gdt_rw(smp_processor_id()); + initial_gs = per_cpu_offset(smp_processor_id()); +#endif + initial_code = (unsigned long)wakeup_long64; + saved_magic = 0x123456789abcdef0L; +#endif /* CONFIG_64BIT */ + + /* + * Pause/unpause graph tracing around do_suspend_lowlevel as it has + * inconsistent call/return info after it jumps to the wakeup vector. + */ + pause_graph_tracing(); + do_suspend_lowlevel(); + unpause_graph_tracing(); + return 0; +} + +static int __init acpi_sleep_setup(char *str) +{ + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "s3_bios", 7) == 0) + acpi_realmode_flags |= 1; + if (strncmp(str, "s3_mode", 7) == 0) + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; +#ifdef CONFIG_HIBERNATION + if (strncmp(str, "s4_nohwsig", 10) == 0) + acpi_no_s4_hw_signature(); +#endif + if (strncmp(str, "nonvs", 5) == 0) + acpi_nvs_nosave(); + if (strncmp(str, "nonvs_s3", 8) == 0) + acpi_nvs_nosave_s3(); + if (strncmp(str, "old_ordering", 12) == 0) + acpi_old_suspend_ordering(); + if (strncmp(str, "nobl", 4) == 0) + acpi_sleep_no_blacklist(); + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("acpi_sleep=", acpi_sleep_setup); diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h new file mode 100644 index 000000000..171a40c74 --- /dev/null +++ b/arch/x86/kernel/acpi/sleep.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Variables and functions used by the code in sleep.c + */ + +#include + +extern unsigned long saved_video_mode; +extern long saved_magic; + +extern int wakeup_pmode_return; + +extern u8 wake_sleep_flags; + +extern unsigned long acpi_copy_wakeup_routine(unsigned long); +extern void wakeup_long64(void); + +extern void do_suspend_lowlevel(void); + +extern int x86_acpi_suspend_lowlevel(void); + +asmlinkage acpi_status x86_acpi_enter_sleep_state(u8 state); diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S new file mode 100644 index 000000000..cf6908107 --- /dev/null +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + .text +#include +#include +#include + +# Copyright 2003, 2008 Pavel Machek +#include +#include +#include +#include +#include +#include + +# Copyright 2003 Pavel Machek +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int __read_mostly alternatives_patched; + +EXPORT_SYMBOL_GPL(alternatives_patched); + +#define MAX_PATCH_LEN (255-1) + +static int __initdata_or_module debug_alternative; + +static int __init debug_alt(char *str) +{ + debug_alternative = 1; + return 1; +} +__setup("debug-alternative", debug_alt); + +static int noreplace_smp; + +static int __init setup_noreplace_smp(char *str) +{ + noreplace_smp = 1; + return 1; +} +__setup("noreplace-smp", setup_noreplace_smp); + +#define DPRINTK(fmt, args...) \ +do { \ + if (debug_alternative) \ + printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ +} while (0) + +#define DUMP_BYTES(buf, len, fmt, args...) \ +do { \ + if (unlikely(debug_alternative)) { \ + int j; \ + \ + if (!(len)) \ + break; \ + \ + printk(KERN_DEBUG pr_fmt(fmt), ##args); \ + for (j = 0; j < (len) - 1; j++) \ + printk(KERN_CONT "%02hhx ", buf[j]); \ + printk(KERN_CONT "%02hhx\n", buf[j]); \ + } \ +} while (0) + +/* + * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes + * that correspond to that nop. Getting from one nop to the next, we + * add to the array the offset that is equal to the sum of all sizes of + * nops preceding the one we are after. + * + * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the + * nice symmetry of sizes of the previous nops. + */ +#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) +static const unsigned char intelnops[] = +{ + GENERIC_NOP1, + GENERIC_NOP2, + GENERIC_NOP3, + GENERIC_NOP4, + GENERIC_NOP5, + GENERIC_NOP6, + GENERIC_NOP7, + GENERIC_NOP8, + GENERIC_NOP5_ATOMIC +}; +static const unsigned char * const intel_nops[ASM_NOP_MAX+2] = +{ + NULL, + intelnops, + intelnops + 1, + intelnops + 1 + 2, + intelnops + 1 + 2 + 3, + intelnops + 1 + 2 + 3 + 4, + intelnops + 1 + 2 + 3 + 4 + 5, + intelnops + 1 + 2 + 3 + 4 + 5 + 6, + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, +}; +#endif + +#ifdef K8_NOP1 +static const unsigned char k8nops[] = +{ + K8_NOP1, + K8_NOP2, + K8_NOP3, + K8_NOP4, + K8_NOP5, + K8_NOP6, + K8_NOP7, + K8_NOP8, + K8_NOP5_ATOMIC +}; +static const unsigned char * const k8_nops[ASM_NOP_MAX+2] = +{ + NULL, + k8nops, + k8nops + 1, + k8nops + 1 + 2, + k8nops + 1 + 2 + 3, + k8nops + 1 + 2 + 3 + 4, + k8nops + 1 + 2 + 3 + 4 + 5, + k8nops + 1 + 2 + 3 + 4 + 5 + 6, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, +}; +#endif + +#if defined(K7_NOP1) && !defined(CONFIG_X86_64) +static const unsigned char k7nops[] = +{ + K7_NOP1, + K7_NOP2, + K7_NOP3, + K7_NOP4, + K7_NOP5, + K7_NOP6, + K7_NOP7, + K7_NOP8, + K7_NOP5_ATOMIC +}; +static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = +{ + NULL, + k7nops, + k7nops + 1, + k7nops + 1 + 2, + k7nops + 1 + 2 + 3, + k7nops + 1 + 2 + 3 + 4, + k7nops + 1 + 2 + 3 + 4 + 5, + k7nops + 1 + 2 + 3 + 4 + 5 + 6, + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, +}; +#endif + +#ifdef P6_NOP1 +static const unsigned char p6nops[] = +{ + P6_NOP1, + P6_NOP2, + P6_NOP3, + P6_NOP4, + P6_NOP5, + P6_NOP6, + P6_NOP7, + P6_NOP8, + P6_NOP5_ATOMIC +}; +static const unsigned char * const p6_nops[ASM_NOP_MAX+2] = +{ + NULL, + p6nops, + p6nops + 1, + p6nops + 1 + 2, + p6nops + 1 + 2 + 3, + p6nops + 1 + 2 + 3 + 4, + p6nops + 1 + 2 + 3 + 4 + 5, + p6nops + 1 + 2 + 3 + 4 + 5 + 6, + p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, +}; +#endif + +/* Initialize these to a safe default */ +#ifdef CONFIG_X86_64 +const unsigned char * const *ideal_nops = p6_nops; +#else +const unsigned char * const *ideal_nops = intel_nops; +#endif + +void __init arch_init_ideal_nops(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + /* + * Due to a decoder implementation quirk, some + * specific Intel CPUs actually perform better with + * the "k8_nops" than with the SDM-recommended NOPs. + */ + if (boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model >= 0x0f && + boot_cpu_data.x86_model != 0x1c && + boot_cpu_data.x86_model != 0x26 && + boot_cpu_data.x86_model != 0x27 && + boot_cpu_data.x86_model < 0x30) { + ideal_nops = k8_nops; + } else if (boot_cpu_has(X86_FEATURE_NOPL)) { + ideal_nops = p6_nops; + } else { +#ifdef CONFIG_X86_64 + ideal_nops = k8_nops; +#else + ideal_nops = intel_nops; +#endif + } + break; + + case X86_VENDOR_HYGON: + ideal_nops = p6_nops; + return; + + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 > 0xf) { + ideal_nops = p6_nops; + return; + } + + fallthrough; + + default: +#ifdef CONFIG_X86_64 + ideal_nops = k8_nops; +#else + if (boot_cpu_has(X86_FEATURE_K8)) + ideal_nops = k8_nops; + else if (boot_cpu_has(X86_FEATURE_K7)) + ideal_nops = k7_nops; + else + ideal_nops = intel_nops; +#endif + } +} + +/* Use this to add nops to a buffer, then text_poke the whole buffer. */ +static void __init_or_module add_nops(void *insns, unsigned int len) +{ + while (len > 0) { + unsigned int noplen = len; + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + memcpy(insns, ideal_nops[noplen], noplen); + insns += noplen; + len -= noplen; + } +} + +extern s32 __retpoline_sites[], __retpoline_sites_end[]; +extern s32 __return_sites[], __return_sites_end[]; +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +extern s32 __smp_locks[], __smp_locks_end[]; +void text_poke_early(void *addr, const void *opcode, size_t len); + +/* + * Are we looking at a near JMP with a 1 or 4-byte displacement. + */ +static inline bool is_jmp(const u8 opcode) +{ + return opcode == 0xeb || opcode == 0xe9; +} + +static void __init_or_module +recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff) +{ + u8 *next_rip, *tgt_rip; + s32 n_dspl, o_dspl; + int repl_len; + + if (a->replacementlen != 5) + return; + + o_dspl = *(s32 *)(insn_buff + 1); + + /* next_rip of the replacement JMP */ + next_rip = repl_insn + a->replacementlen; + /* target rip of the replacement JMP */ + tgt_rip = next_rip + o_dspl; + n_dspl = tgt_rip - orig_insn; + + DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl); + + if (tgt_rip - orig_insn >= 0) { + if (n_dspl - 2 <= 127) + goto two_byte_jmp; + else + goto five_byte_jmp; + /* negative offset */ + } else { + if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) + goto two_byte_jmp; + else + goto five_byte_jmp; + } + +two_byte_jmp: + n_dspl -= 2; + + insn_buff[0] = 0xeb; + insn_buff[1] = (s8)n_dspl; + add_nops(insn_buff + 2, 3); + + repl_len = 2; + goto done; + +five_byte_jmp: + n_dspl -= 5; + + insn_buff[0] = 0xe9; + *(s32 *)&insn_buff[1] = n_dspl; + + repl_len = 5; + +done: + + DPRINTK("final displ: 0x%08x, JMP 0x%lx", + n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); +} + +/* + * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90) + * + * @instr: instruction byte stream + * @instrlen: length of the above + * @off: offset within @instr where the first NOP has been detected + * + * Return: number of NOPs found (and replaced). + */ +static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) +{ + unsigned long flags; + int i = off, nnops; + + while (i < instrlen) { + if (instr[i] != 0x90) + break; + + i++; + } + + nnops = i - off; + + if (nnops <= 1) + return nnops; + + local_irq_save(flags); + add_nops(instr + off, nnops); + local_irq_restore(flags); + + DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i); + + return nnops; +} + +/* + * "noinline" to cause control flow change and thus invalidate I$ and + * cause refetch after modification. + */ +static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) +{ + struct insn insn; + int i = 0; + + /* + * Jump over the non-NOP insns and optimize single-byte NOPs into bigger + * ones. + */ + for (;;) { + if (insn_decode_kernel(&insn, &instr[i])) + return; + + /* + * See if this and any potentially following NOPs can be + * optimized. + */ + if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) + i += optimize_nops_range(instr, len, i); + else + i += insn.length; + + if (i >= len) + return; + } +} + +/* + * Replace instructions with better alternatives for this CPU type. This runs + * before SMP is initialized to avoid SMP problems with self modifying code. + * This implies that asymmetric systems where APs have less capabilities than + * the boot processor are not handled. Tough. Make sure you disable such + * features by hand. + * + * Marked "noinline" to cause control flow change and thus insn cache + * to refetch changed I$ lines. + */ +void __init_or_module noinline apply_alternatives(struct alt_instr *start, + struct alt_instr *end) +{ + struct alt_instr *a; + u8 *instr, *replacement; + u8 insn_buff[MAX_PATCH_LEN]; + + DPRINTK("alt table %px, -> %px", start, end); + + /* + * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using + * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. + * During the process, KASAN becomes confused seeing partial LA57 + * conversion and triggers a false-positive out-of-bound report. + * + * Disable KASAN until the patching is complete. + */ + kasan_disable_current(); + + /* + * The scan order should be from start to end. A later scanned + * alternative code can overwrite previously scanned alternative code. + * Some kernel functions (e.g. memcpy, memset, etc) use this order to + * patch code. + * + * So be careful if you want to change the scan order to any other + * order. + */ + for (a = start; a < end; a++) { + int insn_buff_sz = 0; + /* Mask away "NOT" flag bit for feature to test. */ + u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV; + + instr = (u8 *)&a->instr_offset + a->instr_offset; + replacement = (u8 *)&a->repl_offset + a->repl_offset; + BUG_ON(a->instrlen > sizeof(insn_buff)); + BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32); + + /* + * Patch if either: + * - feature is present + * - feature not present but ALTINSTR_FLAG_INV is set to mean, + * patch if feature is *NOT* present. + */ + if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) + goto next; + + DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", + (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "", + feature >> 5, + feature & 0x1f, + instr, instr, a->instrlen, + replacement, a->replacementlen); + + DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); + + memcpy(insn_buff, replacement, a->replacementlen); + insn_buff_sz = a->replacementlen; + + /* + * 0xe8 is a relative jump; fix the offset. + * + * Instruction length is checked before the opcode to avoid + * accessing uninitialized bytes for zero-length replacements. + */ + if (a->replacementlen == 5 && *insn_buff == 0xe8) { + *(s32 *)(insn_buff + 1) += replacement - instr; + DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", + *(s32 *)(insn_buff + 1), + (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5); + } + + if (a->replacementlen && is_jmp(replacement[0])) + recompute_jump(a, instr, replacement, insn_buff); + + for (; insn_buff_sz < a->instrlen; insn_buff_sz++) + insn_buff[insn_buff_sz] = 0x90; + + DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); + + text_poke_early(instr, insn_buff, insn_buff_sz); + +next: + optimize_nops(instr, a->instrlen); + } + + kasan_enable_current(); +} + +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION) + +/* + * CALL/JMP *%\reg + */ +static int emit_indirect(int op, int reg, u8 *bytes) +{ + int i = 0; + u8 modrm; + + switch (op) { + case CALL_INSN_OPCODE: + modrm = 0x10; /* Reg = 2; CALL r/m */ + break; + + case JMP32_INSN_OPCODE: + modrm = 0x20; /* Reg = 4; JMP r/m */ + break; + + default: + WARN_ON_ONCE(1); + return -1; + } + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + + modrm |= 0xc0; /* Mod = 3 */ + modrm += reg; + + bytes[i++] = 0xff; /* opcode */ + bytes[i++] = modrm; + + return i; +} + +/* + * Rewrite the compiler generated retpoline thunk calls. + * + * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate + * indirect instructions, avoiding the extra indirection. + * + * For example, convert: + * + * CALL __x86_indirect_thunk_\reg + * + * into: + * + * CALL *%\reg + * + * It also tries to inline spectre_v2=retpoline,amd when size permits. + */ +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) +{ + retpoline_thunk_t *target; + int reg, ret, i = 0; + u8 op, cc; + + target = addr + insn->length + insn->immediate.value; + reg = target - __x86_indirect_thunk_array; + + if (WARN_ON_ONCE(reg & ~0xf)) + return -1; + + /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ + BUG_ON(reg == 4); + + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) + return -1; + + op = insn->opcode.bytes[0]; + + /* + * Convert: + * + * Jcc.d32 __x86_indirect_thunk_\reg + * + * into: + * + * Jncc.d8 1f + * [ LFENCE ] + * JMP *%\reg + * [ NOP ] + * 1: + */ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + cc = insn->opcode.bytes[1] & 0xf; + cc ^= 1; /* invert condition */ + + bytes[i++] = 0x70 + cc; /* Jcc.d8 */ + bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ + + /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ + op = JMP32_INSN_OPCODE; + } + + /* + * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE. + */ + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + bytes[i++] = 0x0f; + bytes[i++] = 0xae; + bytes[i++] = 0xe8; /* LFENCE */ + } + + ret = emit_indirect(op, reg, bytes + i); + if (ret < 0) + return ret; + i += ret; + + for (; i < insn->length;) + bytes[i++] = 0x90; + + return i; +} + +/* + * Generated by 'objtool --retpoline'. + */ +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op1, op2; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op1 = insn.opcode.bytes[0]; + op2 = insn.opcode.bytes[1]; + + switch (op1) { + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + break; + + case 0x0f: /* escape */ + if (op2 >= 0x80 && op2 <= 0x8f) + break; + fallthrough; + default: + WARN_ON_ONCE(1); + continue; + } + + DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_retpoline(addr, &insn, bytes); + if (len == insn.length) { + optimize_nops(bytes, len); + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} + +#ifdef CONFIG_RETHUNK +/* + * Rewrite the compiler generated return thunk tail-calls. + * + * For example, convert: + * + * JMP __x86_return_thunk + * + * into: + * + * RET + */ +static int patch_return(void *addr, struct insn *insn, u8 *bytes) +{ + int i = 0; + + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + return -1; + + bytes[i++] = RET_INSN_OPCODE; + + for (; i < insn->length;) + bytes[i++] = INT3_INSN_OPCODE; + + return i; +} + +void __init_or_module noinline apply_returns(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *dest = NULL, *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op = insn.opcode.bytes[0]; + if (op == JMP32_INSN_OPCODE) + dest = addr + insn.length + insn.immediate.value; + + if (__static_call_fixup(addr, op, dest) || + WARN_ONCE(dest != &__x86_return_thunk, + "missing return thunk: %pS-%pS: %*ph", + addr, dest, 5, addr)) + continue; + + DPRINTK("return thunk at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_return(addr, &insn, bytes); + if (len == insn.length) { + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} +#else +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +#endif /* CONFIG_RETHUNK */ + +#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */ + +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } + +#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */ + +#ifdef CONFIG_SMP +static void alternatives_smp_lock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) +{ + const s32 *poff; + + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) + continue; + /* turn DS segment override prefix into lock prefix */ + if (*ptr == 0x3e) + text_poke(ptr, ((unsigned char []){0xf0}), 1); + } +} + +static void alternatives_smp_unlock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) +{ + const s32 *poff; + + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) + continue; + /* turn lock prefix into DS segment override prefix */ + if (*ptr == 0xf0) + text_poke(ptr, ((unsigned char []){0x3E}), 1); + } +} + +struct smp_alt_module { + /* what is this ??? */ + struct module *mod; + char *name; + + /* ptrs to lock prefixes */ + const s32 *locks; + const s32 *locks_end; + + /* .text segment, needed to avoid patching init code ;) */ + u8 *text; + u8 *text_end; + + struct list_head next; +}; +static LIST_HEAD(smp_alt_modules); +static bool uniproc_patched = false; /* protected by text_mutex */ + +void __init_or_module alternatives_smp_module_add(struct module *mod, + char *name, + void *locks, void *locks_end, + void *text, void *text_end) +{ + struct smp_alt_module *smp; + + mutex_lock(&text_mutex); + if (!uniproc_patched) + goto unlock; + + if (num_possible_cpus() == 1) + /* Don't bother remembering, we'll never have to undo it. */ + goto smp_unlock; + + smp = kzalloc(sizeof(*smp), GFP_KERNEL); + if (NULL == smp) + /* we'll run the (safe but slow) SMP code then ... */ + goto unlock; + + smp->mod = mod; + smp->name = name; + smp->locks = locks; + smp->locks_end = locks_end; + smp->text = text; + smp->text_end = text_end; + DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", + smp->locks, smp->locks_end, + smp->text, smp->text_end, smp->name); + + list_add_tail(&smp->next, &smp_alt_modules); +smp_unlock: + alternatives_smp_unlock(locks, locks_end, text, text_end); +unlock: + mutex_unlock(&text_mutex); +} + +void __init_or_module alternatives_smp_module_del(struct module *mod) +{ + struct smp_alt_module *item; + + mutex_lock(&text_mutex); + list_for_each_entry(item, &smp_alt_modules, next) { + if (mod != item->mod) + continue; + list_del(&item->next); + kfree(item); + break; + } + mutex_unlock(&text_mutex); +} + +void alternatives_enable_smp(void) +{ + struct smp_alt_module *mod; + + /* Why bother if there are no other CPUs? */ + BUG_ON(num_possible_cpus() == 1); + + mutex_lock(&text_mutex); + + if (uniproc_patched) { + pr_info("switching to SMP code\n"); + BUG_ON(num_online_cpus() != 1); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); + clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); + list_for_each_entry(mod, &smp_alt_modules, next) + alternatives_smp_lock(mod->locks, mod->locks_end, + mod->text, mod->text_end); + uniproc_patched = false; + } + mutex_unlock(&text_mutex); +} + +/* + * Return 1 if the address range is reserved for SMP-alternatives. + * Must hold text_mutex. + */ +int alternatives_text_reserved(void *start, void *end) +{ + struct smp_alt_module *mod; + const s32 *poff; + u8 *text_start = start; + u8 *text_end = end; + + lockdep_assert_held(&text_mutex); + + list_for_each_entry(mod, &smp_alt_modules, next) { + if (mod->text > text_end || mod->text_end < text_start) + continue; + for (poff = mod->locks; poff < mod->locks_end; poff++) { + const u8 *ptr = (const u8 *)poff + *poff; + + if (text_start <= ptr && text_end > ptr) + return 1; + } + } + + return 0; +} +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_PARAVIRT +void __init_or_module apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) +{ + struct paravirt_patch_site *p; + char insn_buff[MAX_PATCH_LEN]; + + for (p = start; p < end; p++) { + unsigned int used; + + BUG_ON(p->len > MAX_PATCH_LEN); + /* prep the buffer with the original instructions */ + memcpy(insn_buff, p->instr, p->len); + used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len); + + BUG_ON(used > p->len); + + /* Pad the rest with nops */ + add_nops(insn_buff + used, p->len - used); + text_poke_early(p->instr, insn_buff, p->len); + } +} +extern struct paravirt_patch_site __start_parainstructions[], + __stop_parainstructions[]; +#endif /* CONFIG_PARAVIRT */ + +/* + * Self-test for the INT3 based CALL emulation code. + * + * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up + * properly and that there is a stack gap between the INT3 frame and the + * previous context. Without this gap doing a virtual PUSH on the interrupted + * stack would corrupt the INT3 IRET frame. + * + * See entry_{32,64}.S for more details. + */ + +/* + * We define the int3_magic() function in assembly to control the calling + * convention such that we can 'call' it from assembly. + */ + +extern void int3_magic(unsigned int *ptr); /* defined in asm */ + +asm ( +" .pushsection .init.text, \"ax\", @progbits\n" +" .type int3_magic, @function\n" +"int3_magic:\n" +" movl $1, (%" _ASM_ARG1 ")\n" + ASM_RET +" .size int3_magic, .-int3_magic\n" +" .popsection\n" +); + +extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */ + +static int __init +int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + + if (!regs || user_mode(regs)) + return NOTIFY_DONE; + + if (val != DIE_INT3) + return NOTIFY_DONE; + + if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip) + return NOTIFY_DONE; + + int3_emulate_call(regs, (unsigned long)&int3_magic); + return NOTIFY_STOP; +} + +static void __init int3_selftest(void) +{ + static __initdata struct notifier_block int3_exception_nb = { + .notifier_call = int3_exception_notify, + .priority = INT_MAX-1, /* last */ + }; + unsigned int val = 0; + + BUG_ON(register_die_notifier(&int3_exception_nb)); + + /* + * Basically: int3_magic(&val); but really complicated :-) + * + * Stick the address of the INT3 instruction into int3_selftest_ip, + * then trigger the INT3, padded with NOPs to match a CALL instruction + * length. + */ + asm volatile ("1: int3; nop; nop; nop; nop\n\t" + ".pushsection .init.data,\"aw\"\n\t" + ".align " __ASM_SEL(4, 8) "\n\t" + ".type int3_selftest_ip, @object\n\t" + ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t" + "int3_selftest_ip:\n\t" + __ASM_SEL(.long, .quad) " 1b\n\t" + ".popsection\n\t" + : ASM_CALL_CONSTRAINT + : __ASM_SEL_RAW(a, D) (&val) + : "memory"); + + BUG_ON(val != 1); + + unregister_die_notifier(&int3_exception_nb); +} + +void __init alternative_instructions(void) +{ + int3_selftest(); + + /* + * The patching is not fully atomic, so try to avoid local + * interruptions that might execute the to be patched code. + * Other CPUs are not running. + */ + stop_nmi(); + + /* + * Don't stop machine check exceptions while patching. + * MCEs only happen when something got corrupted and in this + * case we must do something about the corruption. + * Ignoring it is worse than an unlikely patching race. + * Also machine checks tend to be broadcast and if one CPU + * goes into machine check the others follow quickly, so we don't + * expect a machine check to cause undue problems during to code + * patching. + */ + + /* + * Rewrite the retpolines, must be done before alternatives since + * those can rewrite the retpoline thunks. + */ + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + apply_returns(__return_sites, __return_sites_end); + + apply_alternatives(__alt_instructions, __alt_instructions_end); + +#ifdef CONFIG_SMP + /* Patch to UP if other cpus not imminent. */ + if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { + uniproc_patched = true; + alternatives_smp_module_add(NULL, "core kernel", + __smp_locks, __smp_locks_end, + _text, _etext); + } + + if (!uniproc_patched || num_possible_cpus() == 1) { + free_init_pages("SMP alternatives", + (unsigned long)__smp_locks, + (unsigned long)__smp_locks_end); + } +#endif + + apply_paravirt(__parainstructions, __parainstructions_end); + + restart_nmi(); + alternatives_patched = 1; +} + +/** + * text_poke_early - Update instructions on a live kernel at boot time + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * When you use this code to patch more than one byte of an instruction + * you need to make sure that other CPUs cannot execute this code in parallel. + * Also no thread must be currently preempted in the middle of these + * instructions. And on the local CPU you need to be protected against NMI or + * MCE handlers seeing an inconsistent instruction while you patch. + */ +void __init_or_module text_poke_early(void *addr, const void *opcode, + size_t len) +{ + unsigned long flags; + + if (boot_cpu_has(X86_FEATURE_NX) && + is_module_text_address((unsigned long)addr)) { + /* + * Modules text is marked initially as non-executable, so the + * code cannot be running and speculative code-fetches are + * prevented. Just change the code. + */ + memcpy(addr, opcode, len); + } else { + local_irq_save(flags); + memcpy(addr, opcode, len); + sync_core(); + local_irq_restore(flags); + + /* + * Could also do a CLFLUSH here to speed up CPU recovery; but + * that causes hangs on some VIA CPUs. + */ + } +} + +typedef struct { + struct mm_struct *mm; +} temp_mm_state_t; + +/* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. + * + * Context: The temporary mm needs to be used exclusively by a single core. To + * harden security IRQs must be disabled while the temporary mm is + * loaded, thereby preventing interrupt handler bugs from overriding + * the kernel memory protection. + */ +static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) +{ + temp_mm_state_t temp_state; + + lockdep_assert_irqs_disabled(); + + /* + * Make sure not to be in TLB lazy mode, as otherwise we'll end up + * with a stale address space WITHOUT being in lazy mode after + * restoring the previous mm. + */ + if (this_cpu_read(cpu_tlbstate.is_lazy)) + leave_mm(smp_processor_id()); + + temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return temp_state; +} + +static inline void unuse_temporary_mm(temp_mm_state_t prev_state) +{ + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(NULL, prev_state.mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + +__ro_after_init struct mm_struct *poking_mm; +__ro_after_init unsigned long poking_addr; + +static void *__text_poke(void *addr, const void *opcode, size_t len) +{ + bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; + struct page *pages[2] = {NULL}; + temp_mm_state_t prev; + unsigned long flags; + pte_t pte, *ptep; + spinlock_t *ptl; + pgprot_t pgprot; + + /* + * While boot memory allocator is running we cannot use struct pages as + * they are not yet initialized. There is no way to recover. + */ + BUG_ON(!after_bootmem); + + if (!core_kernel_text((unsigned long)addr)) { + pages[0] = vmalloc_to_page(addr); + if (cross_page_boundary) + pages[1] = vmalloc_to_page(addr + PAGE_SIZE); + } else { + pages[0] = virt_to_page(addr); + WARN_ON(!PageReserved(pages[0])); + if (cross_page_boundary) + pages[1] = virt_to_page(addr + PAGE_SIZE); + } + /* + * If something went wrong, crash and burn since recovery paths are not + * implemented. + */ + BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); + + /* + * Map the page without the global bit, as TLB flushing is done with + * flush_tlb_mm_range(), which is intended for non-global PTEs. + */ + pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); + + /* + * The lock is not really needed, but this allows to avoid open-coding. + */ + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + + /* + * This must not fail; preallocated in poking_init(). + */ + VM_BUG_ON(!ptep); + + local_irq_save(flags); + + pte = mk_pte(pages[0], pgprot); + set_pte_at(poking_mm, poking_addr, ptep, pte); + + if (cross_page_boundary) { + pte = mk_pte(pages[1], pgprot); + set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + } + + /* + * Loading the temporary mm behaves as a compiler barrier, which + * guarantees that the PTE will be set at the time memcpy() is done. + */ + prev = use_temporary_mm(poking_mm); + + kasan_disable_current(); + memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); + kasan_enable_current(); + + /* + * Ensure that the PTE is only cleared after the instructions of memcpy + * were issued by using a compiler barrier. + */ + barrier(); + + pte_clear(poking_mm, poking_addr, ptep); + if (cross_page_boundary) + pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + + /* + * Loading the previous page-table hierarchy requires a serializing + * instruction that already allows the core to see the updated version. + * Xen-PV is assumed to serialize execution in a similar manner. + */ + unuse_temporary_mm(prev); + + /* + * Flushing the TLB might involve IPIs, which would require enabled + * IRQs, but not if the mm is not used, as it is in this point. + */ + flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, + PAGE_SHIFT, false); + + /* + * If the text does not match what we just wrote then something is + * fundamentally screwy; there's nothing we can really do about that. + */ + BUG_ON(memcmp(addr, opcode, len)); + + local_irq_restore(flags); + pte_unmap_unlock(ptep, ptl); + return addr; +} + +/** + * text_poke - Update instructions on a live kernel + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Only atomic text poke/set should be allowed when not doing early patching. + * It means the size must be writable atomically and the address must be aligned + * in a way that permits an atomic write. It also makes sure we fit on a single + * page. + * + * Note that the caller must ensure that if the modified code is part of a + * module, the module would not be removed during poking. This can be achieved + * by registering a module notifier, and ordering module removal and patching + * trough a mutex. + */ +void *text_poke(void *addr, const void *opcode, size_t len) +{ + lockdep_assert_held(&text_mutex); + + return __text_poke(addr, opcode, len); +} + +/** + * text_poke_kgdb - Update instructions on a live kernel by kgdb + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Only atomic text poke/set should be allowed when not doing early patching. + * It means the size must be writable atomically and the address must be aligned + * in a way that permits an atomic write. It also makes sure we fit on a single + * page. + * + * Context: should only be used by kgdb, which ensures no other core is running, + * despite the fact it does not hold the text_mutex. + */ +void *text_poke_kgdb(void *addr, const void *opcode, size_t len) +{ + return __text_poke(addr, opcode, len); +} + +static void do_sync_core(void *info) +{ + sync_core(); +} + +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +struct text_poke_loc { + /* addr := _stext + rel_addr */ + s32 rel_addr; + s32 disp; + u8 len; + u8 opcode; + const u8 text[POKE_MAX_OPCODE_SIZE]; + /* see text_poke_bp_batch() */ + u8 old; +}; + +struct bp_patching_desc { + struct text_poke_loc *vec; + int nr_entries; + atomic_t refs; +}; + +static struct bp_patching_desc bp_desc; + +static __always_inline +struct bp_patching_desc *try_get_desc(void) +{ + struct bp_patching_desc *desc = &bp_desc; + + if (!arch_atomic_inc_not_zero(&desc->refs)) + return NULL; + + return desc; +} + +static __always_inline void put_desc(void) +{ + struct bp_patching_desc *desc = &bp_desc; + + smp_mb__before_atomic(); + arch_atomic_dec(&desc->refs); +} + +static __always_inline void *text_poke_addr(struct text_poke_loc *tp) +{ + return _stext + tp->rel_addr; +} + +static __always_inline int patch_cmp(const void *key, const void *elt) +{ + struct text_poke_loc *tp = (struct text_poke_loc *) elt; + + if (key < text_poke_addr(tp)) + return -1; + if (key > text_poke_addr(tp)) + return 1; + return 0; +} + +noinstr int poke_int3_handler(struct pt_regs *regs) +{ + struct bp_patching_desc *desc; + struct text_poke_loc *tp; + int ret = 0; + void *ip; + + if (user_mode(regs)) + return 0; + + /* + * Having observed our INT3 instruction, we now must observe + * bp_desc with non-zero refcount: + * + * bp_desc.refs = 1 INT3 + * WMB RMB + * write INT3 if (bp_desc.refs != 0) + */ + smp_rmb(); + + desc = try_get_desc(); + if (!desc) + return 0; + + /* + * Discount the INT3. See text_poke_bp_batch(). + */ + ip = (void *) regs->ip - INT3_INSN_SIZE; + + /* + * Skip the binary search if there is a single member in the vector. + */ + if (unlikely(desc->nr_entries > 1)) { + tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, + sizeof(struct text_poke_loc), + patch_cmp); + if (!tp) + goto out_put; + } else { + tp = desc->vec; + if (text_poke_addr(tp) != ip) + goto out_put; + } + + ip += tp->len; + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + /* + * Someone poked an explicit INT3, they'll want to handle it, + * do not consume. + */ + goto out_put; + + case RET_INSN_OPCODE: + int3_emulate_ret(regs); + break; + + case CALL_INSN_OPCODE: + int3_emulate_call(regs, (long)ip + tp->disp); + break; + + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + int3_emulate_jmp(regs, (long)ip + tp->disp); + break; + + default: + BUG(); + } + + ret = 1; + +out_put: + put_desc(); + return ret; +} + +#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) +static struct text_poke_loc tp_vec[TP_VEC_MAX]; +static int tp_vec_nr; + +/** + * text_poke_bp_batch() -- update instructions on live kernel on SMP + * @tp: vector of instructions to patch + * @nr_entries: number of entries in the vector + * + * Modify multi-byte instruction by using int3 breakpoint on SMP. + * We completely avoid stop_machine() here, and achieve the + * synchronization using int3 breakpoint. + * + * The way it is done: + * - For each entry in the vector: + * - add a int3 trap to the address that will be patched + * - sync cores + * - For each entry in the vector: + * - update all but the first byte of the patched range + * - sync cores + * - For each entry in the vector: + * - replace the first byte (int3) by the first byte of + * replacing opcode + * - sync cores + */ +static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +{ + unsigned char int3 = INT3_INSN_OPCODE; + unsigned int i; + int do_sync; + + lockdep_assert_held(&text_mutex); + + bp_desc.vec = tp; + bp_desc.nr_entries = nr_entries; + + /* + * Corresponds to the implicit memory barrier in try_get_desc() to + * ensure reading a non-zero refcount provides up to date bp_desc data. + */ + atomic_set_release(&bp_desc.refs, 1); + + /* + * Corresponding read barrier in int3 notifier for making sure the + * nr_entries and handler are correctly ordered wrt. patching. + */ + smp_wmb(); + + /* + * First step: add a int3 trap to the address that will be patched. + */ + for (i = 0; i < nr_entries; i++) { + tp[i].old = *(u8 *)text_poke_addr(&tp[i]); + text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + } + + text_poke_sync(); + + /* + * Second step: update all but the first byte of the patched range. + */ + for (do_sync = 0, i = 0; i < nr_entries; i++) { + u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; + int len = tp[i].len; + + if (len - INT3_INSN_SIZE > 0) { + memcpy(old + INT3_INSN_SIZE, + text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); + text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + (const char *)tp[i].text + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); + do_sync++; + } + + /* + * Emit a perf event to record the text poke, primarily to + * support Intel PT decoding which must walk the executable code + * to reconstruct the trace. The flow up to here is: + * - write INT3 byte + * - IPI-SYNC + * - write instruction tail + * At this point the actual control flow will be through the + * INT3 and handler and not hit the old or new instruction. + * Intel PT outputs FUP/TIP packets for the INT3, so the flow + * can still be decoded. Subsequently: + * - emit RECORD_TEXT_POKE with the new instruction + * - IPI-SYNC + * - write first byte + * - IPI-SYNC + * So before the text poke event timestamp, the decoder will see + * either the old instruction flow or FUP/TIP of INT3. After the + * text poke event timestamp, the decoder will see either the + * new instruction flow or FUP/TIP of INT3. Thus decoders can + * use the timestamp as the point at which to modify the + * executable code. + * The old instruction is recorded so that the event can be + * processed forwards or backwards. + */ + perf_event_text_poke(text_poke_addr(&tp[i]), old, len, + tp[i].text, len); + } + + if (do_sync) { + /* + * According to Intel, this core syncing is very likely + * not necessary and we'd be safe even without it. But + * better safe than sorry (plus there's not only Intel). + */ + text_poke_sync(); + } + + /* + * Third step: replace the first byte (int3) by the first byte of + * replacing opcode. + */ + for (do_sync = 0, i = 0; i < nr_entries; i++) { + if (tp[i].text[0] == INT3_INSN_OPCODE) + continue; + + text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE); + do_sync++; + } + + if (do_sync) + text_poke_sync(); + + /* + * Remove and wait for refs to be zero. + */ + if (!atomic_dec_and_test(&bp_desc.refs)) + atomic_cond_read_acquire(&bp_desc.refs, !VAL); +} + +static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate) +{ + struct insn insn; + int ret, i; + + memcpy((void *)tp->text, opcode, len); + if (!emulate) + emulate = opcode; + + ret = insn_decode_kernel(&insn, emulate); + BUG_ON(ret < 0); + + tp->rel_addr = addr - (void *)_stext; + tp->len = len; + tp->opcode = insn.opcode.bytes[0]; + + switch (tp->opcode) { + case RET_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + /* + * Control flow instructions without implied execution of the + * next instruction can be padded with INT3. + */ + for (i = insn.length; i < len; i++) + BUG_ON(tp->text[i] != INT3_INSN_OPCODE); + break; + + default: + BUG_ON(len != insn.length); + }; + + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + case RET_INSN_OPCODE: + break; + + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + tp->disp = insn.immediate.value; + break; + + default: /* assume NOP */ + switch (len) { + case 2: /* NOP2 -- emulate as JMP8+0 */ + BUG_ON(memcmp(emulate, ideal_nops[len], len)); + tp->opcode = JMP8_INSN_OPCODE; + tp->disp = 0; + break; + + case 5: /* NOP5 -- emulate as JMP32+0 */ + BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len)); + tp->opcode = JMP32_INSN_OPCODE; + tp->disp = 0; + break; + + default: /* unknown instruction */ + BUG(); + } + break; + } +} + +/* + * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * early if needed. + */ +static bool tp_order_fail(void *addr) +{ + struct text_poke_loc *tp; + + if (!tp_vec_nr) + return false; + + if (!addr) /* force */ + return true; + + tp = &tp_vec[tp_vec_nr - 1]; + if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) + return true; + + return false; +} + +static void text_poke_flush(void *addr) +{ + if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { + text_poke_bp_batch(tp_vec, tp_vec_nr); + tp_vec_nr = 0; + } +} + +void text_poke_finish(void) +{ + text_poke_flush(NULL); +} + +void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +{ + struct text_poke_loc *tp; + + if (unlikely(system_state == SYSTEM_BOOTING)) { + text_poke_early(addr, opcode, len); + return; + } + + text_poke_flush(addr); + + tp = &tp_vec[tp_vec_nr++]; + text_poke_loc_init(tp, addr, opcode, len, emulate); +} + +/** + * text_poke_bp() -- update instructions on live kernel on SMP + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @handler: address to jump to when the temporary breakpoint is hit + * + * Update a single instruction with the vector in the stack, avoiding + * dynamically allocated memory. This function should be used when it is + * not possible to allocate memory. + */ +void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) +{ + struct text_poke_loc tp; + + if (unlikely(system_state == SYSTEM_BOOTING)) { + text_poke_early(addr, opcode, len); + return; + } + + text_poke_loc_init(&tp, addr, opcode, len, emulate); + text_poke_bp_batch(&tp, 1); +} diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c new file mode 100644 index 000000000..9ac696487 --- /dev/null +++ b/arch/x86/kernel/amd_gart_64.c @@ -0,0 +1,845 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Dynamic DMA mapping support for AMD Hammer. + * + * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. + * This allows to use PCI devices that only support 32bit addresses on systems + * with more than 4GB. + * + * See Documentation/core-api/dma-api-howto.rst for the interface specification. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned long iommu_bus_base; /* GART remapping area (physical) */ +static unsigned long iommu_size; /* size of remapping area bytes */ +static unsigned long iommu_pages; /* .. and in pages */ + +static u32 *iommu_gatt_base; /* Remapping table */ + +/* + * If this is disabled the IOMMU will use an optimized flushing strategy + * of only flushing when an mapping is reused. With it true the GART is + * flushed for every mapping. Problem is that doing the lazy flush seems + * to trigger bugs with some popular PCI cards, in particular 3ware (but + * has been also also seen with Qlogic at least). + */ +static int iommu_fullflush = 1; + +/* Allocation bitmap for the remapping area: */ +static DEFINE_SPINLOCK(iommu_bitmap_lock); +/* Guarded by iommu_bitmap_lock: */ +static unsigned long *iommu_gart_bitmap; + +static u32 gart_unmapped_entry; + +#define GPTE_VALID 1 +#define GPTE_COHERENT 2 +#define GPTE_ENCODE(x) \ + (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) +#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) + +#ifdef CONFIG_AGP +#define AGPEXTERN extern +#else +#define AGPEXTERN +#endif + +/* GART can only remap to physical addresses < 1TB */ +#define GART_MAX_PHYS_ADDR (1ULL << 40) + +/* backdoor interface to AGP driver */ +AGPEXTERN int agp_memory_reserved; +AGPEXTERN __u32 *agp_gatt_table; + +static unsigned long next_bit; /* protected by iommu_bitmap_lock */ +static bool need_flush; /* global flush state. set for each gart wrap */ + +static unsigned long alloc_iommu(struct device *dev, int size, + unsigned long align_mask) +{ + unsigned long offset, flags; + unsigned long boundary_size; + unsigned long base_index; + + base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), + PAGE_SIZE) >> PAGE_SHIFT; + boundary_size = dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT); + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, + size, base_index, boundary_size, align_mask); + if (offset == -1) { + need_flush = true; + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, + size, base_index, boundary_size, + align_mask); + } + if (offset != -1) { + next_bit = offset+size; + if (next_bit >= iommu_pages) { + next_bit = 0; + need_flush = true; + } + } + if (iommu_fullflush) + need_flush = true; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + + return offset; +} + +static void free_iommu(unsigned long offset, int size) +{ + unsigned long flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + bitmap_clear(iommu_gart_bitmap, offset, size); + if (offset >= next_bit) + next_bit = offset + size; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +/* + * Use global flush state to avoid races with multiple flushers. + */ +static void flush_gart(void) +{ + unsigned long flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + if (need_flush) { + amd_flush_garts(); + need_flush = false; + } + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +#ifdef CONFIG_IOMMU_LEAK +/* Debugging aid for drivers that don't free their IOMMU tables */ +static void dump_leak(void) +{ + static int dump; + + if (dump) + return; + dump = 1; + + show_stack(NULL, NULL, KERN_ERR); + debug_dma_dump_mappings(NULL); +} +#endif + +static void iommu_full(struct device *dev, size_t size, int dir) +{ + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * Return some non mapped prereserved space in the aperture and + * let the Northbridge deal with it. This will result in garbage + * in the IO operation. When the size exceeds the prereserved space + * memory corruption will occur or random memory will be DMAed + * out. Hopefully no network devices use single mappings that big. + */ + + dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size); +#ifdef CONFIG_IOMMU_LEAK + dump_leak(); +#endif +} + +static inline int +need_iommu(struct device *dev, unsigned long addr, size_t size) +{ + return force_iommu || !dma_capable(dev, addr, size, true); +} + +static inline int +nonforced_iommu(struct device *dev, unsigned long addr, size_t size) +{ + return !dma_capable(dev, addr, size, true); +} + +/* Map a single continuous physical area into the IOMMU. + * Caller needs to check if the iommu is needed and flush. + */ +static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, + size_t size, int dir, unsigned long align_mask) +{ + unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); + unsigned long iommu_page; + int i; + + if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) + return DMA_MAPPING_ERROR; + + iommu_page = alloc_iommu(dev, npages, align_mask); + if (iommu_page == -1) { + if (!nonforced_iommu(dev, phys_mem, size)) + return phys_mem; + if (panic_on_overflow) + panic("dma_map_area overflow %lu bytes\n", size); + iommu_full(dev, size, dir); + return DMA_MAPPING_ERROR; + } + + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); + phys_mem += PAGE_SIZE; + } + return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); +} + +/* Map a single area into the IOMMU */ +static dma_addr_t gart_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + unsigned long bus; + phys_addr_t paddr = page_to_phys(page) + offset; + + if (!need_iommu(dev, paddr, size)) + return paddr; + + bus = dma_map_area(dev, paddr, size, dir, 0); + flush_gart(); + + return bus; +} + +/* + * Free a DMA mapping. + */ +static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + unsigned long iommu_page; + int npages; + int i; + + if (WARN_ON_ONCE(dma_addr == DMA_MAPPING_ERROR)) + return; + + /* + * This driver will not always use a GART mapping, but might have + * created a direct mapping instead. If that is the case there is + * nothing to unmap here. + */ + if (dma_addr < iommu_bus_base || + dma_addr >= iommu_bus_base + iommu_size) + return; + + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; + npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; + } + free_iommu(iommu_page, npages); +} + +/* + * Wrapper for pci_unmap_single working with scatterlists. + */ +static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) { + if (!s->dma_length || !s->length) + break; + gart_unmap_page(dev, s->dma_address, s->dma_length, dir, 0); + } +} + +/* Fallback for dma_map_sg in case of overflow */ +static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, + int nents, int dir) +{ + struct scatterlist *s; + int i; + +#ifdef CONFIG_IOMMU_DEBUG + pr_debug("dma_map_sg overflow\n"); +#endif + + for_each_sg(sg, s, nents, i) { + unsigned long addr = sg_phys(s); + + if (nonforced_iommu(dev, addr, s->length)) { + addr = dma_map_area(dev, addr, s->length, dir, 0); + if (addr == DMA_MAPPING_ERROR) { + if (i > 0) + gart_unmap_sg(dev, sg, i, dir, 0); + nents = 0; + sg[0].dma_length = 0; + break; + } + } + s->dma_address = addr; + s->dma_length = s->length; + } + flush_gart(); + + return nents; +} + +/* Map multiple scatterlist entries continuous into the first. */ +static int __dma_map_cont(struct device *dev, struct scatterlist *start, + int nelems, struct scatterlist *sout, + unsigned long pages) +{ + unsigned long iommu_start = alloc_iommu(dev, pages, 0); + unsigned long iommu_page = iommu_start; + struct scatterlist *s; + int i; + + if (iommu_start == -1) + return -1; + + for_each_sg(start, s, nelems, i) { + unsigned long pages, addr; + unsigned long phys_addr = s->dma_address; + + BUG_ON(s != start && s->offset); + if (s == start) { + sout->dma_address = iommu_bus_base; + sout->dma_address += iommu_page*PAGE_SIZE + s->offset; + sout->dma_length = s->length; + } else { + sout->dma_length += s->length; + } + + addr = phys_addr; + pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); + while (pages--) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + addr += PAGE_SIZE; + iommu_page++; + } + } + BUG_ON(iommu_page - iommu_start != pages); + + return 0; +} + +static inline int +dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, + struct scatterlist *sout, unsigned long pages, int need) +{ + if (!need) { + BUG_ON(nelems != 1); + sout->dma_address = start->dma_address; + sout->dma_length = start->length; + return 0; + } + return __dma_map_cont(dev, start, nelems, sout, pages); +} + +/* + * DMA map all entries in a scatterlist. + * Merge chunks that have page aligned sizes into a continuous mapping. + */ +static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs) +{ + struct scatterlist *s, *ps, *start_sg, *sgmap; + int need = 0, nextneed, i, out, start; + unsigned long pages = 0; + unsigned int seg_size; + unsigned int max_seg_size; + + if (nents == 0) + return 0; + + out = 0; + start = 0; + start_sg = sg; + sgmap = sg; + seg_size = 0; + max_seg_size = dma_get_max_seg_size(dev); + ps = NULL; /* shut up gcc */ + + for_each_sg(sg, s, nents, i) { + dma_addr_t addr = sg_phys(s); + + s->dma_address = addr; + BUG_ON(s->length == 0); + + nextneed = need_iommu(dev, addr, s->length); + + /* Handle the previous not yet processed entries */ + if (i > start) { + /* + * Can only merge when the last chunk ends on a + * page boundary and the new one doesn't have an + * offset. + */ + if (!iommu_merge || !nextneed || !need || s->offset || + (s->length + seg_size > max_seg_size) || + (ps->offset + ps->length) % PAGE_SIZE) { + if (dma_map_cont(dev, start_sg, i - start, + sgmap, pages, need) < 0) + goto error; + out++; + + seg_size = 0; + sgmap = sg_next(sgmap); + pages = 0; + start = i; + start_sg = s; + } + } + + seg_size += s->length; + need = nextneed; + pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); + ps = s; + } + if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) + goto error; + out++; + flush_gart(); + if (out < nents) { + sgmap = sg_next(sgmap); + sgmap->dma_length = 0; + } + return out; + +error: + flush_gart(); + gart_unmap_sg(dev, sg, out, dir, 0); + + /* When it was forced or merged try again in a dumb way */ + if (force_iommu || iommu_merge) { + out = dma_map_sg_nonforce(dev, sg, nents, dir); + if (out > 0) + return out; + } + if (panic_on_overflow) + panic("dma_map_sg: overflow on %lu pages\n", pages); + + iommu_full(dev, pages << PAGE_SHIFT, dir); + for_each_sg(sg, s, nents, i) + s->dma_address = DMA_MAPPING_ERROR; + return 0; +} + +/* allocate and map a coherent mapping */ +static void * +gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, + gfp_t flag, unsigned long attrs) +{ + void *vaddr; + + vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs); + if (!vaddr || + !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24)) + return vaddr; + + *dma_addr = dma_map_area(dev, virt_to_phys(vaddr), size, + DMA_BIDIRECTIONAL, (1UL << get_order(size)) - 1); + flush_gart(); + if (unlikely(*dma_addr == DMA_MAPPING_ERROR)) + goto out_free; + return vaddr; +out_free: + dma_direct_free(dev, size, vaddr, *dma_addr, attrs); + return NULL; +} + +/* free a coherent mapping */ +static void +gart_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr, unsigned long attrs) +{ + gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0); + dma_direct_free(dev, size, vaddr, dma_addr, attrs); +} + +static int no_agp; + +static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) +{ + unsigned long a; + + if (!iommu_size) { + iommu_size = aper_size; + if (!no_agp) + iommu_size /= 2; + } + + a = aper + iommu_size; + iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; + + if (iommu_size < 64*1024*1024) { + pr_warn("PCI-DMA: Warning: Small IOMMU %luMB." + " Consider increasing the AGP aperture in BIOS\n", + iommu_size >> 20); + } + + return iommu_size; +} + +static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) +{ + unsigned aper_size = 0, aper_base_32, aper_order; + u64 aper_base; + + pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32); + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order); + aper_order = (aper_order >> 1) & 7; + + aper_base = aper_base_32 & 0x7fff; + aper_base <<= 25; + + aper_size = (32 * 1024 * 1024) << aper_order; + if (aper_base + aper_size > 0x100000000UL || !aper_size) + aper_base = 0; + + *size = aper_size; + return aper_base; +} + +static void enable_gart_translations(void) +{ + int i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; + + enable_gart_translation(dev, __pa(agp_gatt_table)); + } + + /* Flush the GART-TLB to remove stale entries */ + amd_flush_garts(); +} + +/* + * If fix_up_north_bridges is set, the north bridges have to be fixed up on + * resume in the same way as they are handled in gart_iommu_hole_init(). + */ +static bool fix_up_north_bridges; +static u32 aperture_order; +static u32 aperture_alloc; + +void set_up_gart_resume(u32 aper_order, u32 aper_alloc) +{ + fix_up_north_bridges = true; + aperture_order = aper_order; + aperture_alloc = aper_alloc; +} + +static void gart_fixup_northbridges(void) +{ + int i; + + if (!fix_up_north_bridges) + return; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + pr_info("PCI-DMA: Restoring GART aperture settings\n"); + + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; + + /* + * Don't enable translations just yet. That is the next + * step. Restore the pre-suspend aperture settings. + */ + gart_set_size_and_enable(dev, aperture_order); + pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); + } +} + +static void gart_resume(void) +{ + pr_info("PCI-DMA: Resuming GART IOMMU\n"); + + gart_fixup_northbridges(); + + enable_gart_translations(); +} + +static struct syscore_ops gart_syscore_ops = { + .resume = gart_resume, + +}; + +/* + * Private Northbridge GATT initialization in case we cannot use the + * AGP driver for some reason. + */ +static __init int init_amd_gatt(struct agp_kern_info *info) +{ + unsigned aper_size, gatt_size, new_aper_size; + unsigned aper_base, new_aper_base; + struct pci_dev *dev; + void *gatt; + int i; + + pr_info("PCI-DMA: Disabling AGP.\n"); + + aper_size = aper_base = info->aper_size = 0; + dev = NULL; + for (i = 0; i < amd_nb_num(); i++) { + dev = node_to_amd_nb(i)->misc; + new_aper_base = read_aperture(dev, &new_aper_size); + if (!new_aper_base) + goto nommu; + + if (!aper_base) { + aper_size = new_aper_size; + aper_base = new_aper_base; + } + if (aper_size != new_aper_size || aper_base != new_aper_base) + goto nommu; + } + if (!aper_base) + goto nommu; + + info->aper_base = aper_base; + info->aper_size = aper_size >> 20; + + gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); + gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(gatt_size)); + if (!gatt) + panic("Cannot allocate GATT table"); + if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) + panic("Could not set GART PTEs to uncacheable pages"); + + agp_gatt_table = gatt; + + register_syscore_ops(&gart_syscore_ops); + + flush_gart(); + + pr_info("PCI-DMA: aperture base @ %x size %u KB\n", + aper_base, aper_size>>10); + + return 0; + + nommu: + /* Should not happen anymore */ + pr_warn("PCI-DMA: More than 4GB of RAM and no IOMMU - falling back to iommu=soft.\n"); + return -1; +} + +static const struct dma_map_ops gart_dma_ops = { + .map_sg = gart_map_sg, + .unmap_sg = gart_unmap_sg, + .map_page = gart_map_page, + .unmap_page = gart_unmap_page, + .alloc = gart_alloc_coherent, + .free = gart_free_coherent, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, + .dma_supported = dma_direct_supported, + .get_required_mask = dma_direct_get_required_mask, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, +}; + +static void gart_iommu_shutdown(void) +{ + struct pci_dev *dev; + int i; + + /* don't shutdown it if there is AGP installed */ + if (!no_agp) + return; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + for (i = 0; i < amd_nb_num(); i++) { + u32 ctl; + + dev = node_to_amd_nb(i)->misc; + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); + + ctl &= ~GARTEN; + + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); + } +} + +int __init gart_iommu_init(void) +{ + struct agp_kern_info info; + unsigned long iommu_start; + unsigned long aper_base, aper_size; + unsigned long start_pfn, end_pfn; + unsigned long scratch; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + +#ifndef CONFIG_AGP_AMD64 + no_agp = 1; +#else + /* Makefile puts PCI initialization via subsys_initcall first. */ + /* Add other AMD AGP bridge drivers here */ + no_agp = no_agp || + (agp_amd64_init() < 0) || + (agp_copy_info(agp_bridge, &info) < 0); +#endif + + if (no_iommu || + (!force_iommu && max_pfn <= MAX_DMA32_PFN) || + !gart_iommu_aperture || + (no_agp && init_amd_gatt(&info) < 0)) { + if (max_pfn > MAX_DMA32_PFN) { + pr_warn("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warn("falling back to iommu=soft.\n"); + } + return 0; + } + + /* need to map that range */ + aper_size = info.aper_size << 20; + aper_base = info.aper_base; + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + + start_pfn = PFN_DOWN(aper_base); + if (!pfn_range_is_mapped(start_pfn, end_pfn)) + init_memory_mapping(start_pfn<> PAGE_SHIFT; + + iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(iommu_pages/8)); + if (!iommu_gart_bitmap) + panic("Cannot allocate iommu bitmap\n"); + + pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + iommu_size >> 20); + + agp_memory_reserved = iommu_size; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + + /* + * Unmap the IOMMU part of the GART. The alias of the page is + * always mapped with cache enabled and there is no full cache + * coherency across the GART remapping. The unmapping avoids + * automatic prefetches from the CPU allocating cache lines in + * there. All CPU accesses are done via the direct mapping to + * the backing memory. The GART address is only used by PCI + * devices. + */ + set_memory_np((unsigned long)__va(iommu_bus_base), + iommu_size >> PAGE_SHIFT); + /* + * Tricky. The GART table remaps the physical memory range, + * so the CPU wont notice potential aliases and if the memory + * is remapped to UC later on, we might surprise the PCI devices + * with a stray writeout of a cacheline. So play it sure and + * do an explicit, full-scale wbinvd() _after_ having marked all + * the pages as Not-Present: + */ + wbinvd(); + + /* + * Now all caches are flushed and we can safely enable + * GART hardware. Doing it early leaves the possibility + * of stale cache entries that can lead to GART PTE + * errors. + */ + enable_gart_translations(); + + /* + * Try to workaround a bug (thanks to BenH): + * Set unmapped entries to a scratch page instead of 0. + * Any prefetches that hit unmapped entries won't get an bus abort + * then. (P2P bridge may be prefetching on DMA reads). + */ + scratch = get_zeroed_page(GFP_KERNEL); + if (!scratch) + panic("Cannot allocate iommu scratch page"); + gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); + + flush_gart(); + dma_ops = &gart_dma_ops; + x86_platform.iommu_shutdown = gart_iommu_shutdown; + swiotlb = 0; + + return 0; +} + +void __init gart_parse_options(char *p) +{ + int arg; + + if (isdigit(*p) && get_option(&p, &arg)) + iommu_size = arg; + if (!strncmp(p, "fullflush", 9)) + iommu_fullflush = 1; + if (!strncmp(p, "nofullflush", 11)) + iommu_fullflush = 0; + if (!strncmp(p, "noagp", 5)) + no_agp = 1; + if (!strncmp(p, "noaperture", 10)) + fix_aperture = 0; + /* duplicated from pci-dma.c */ + if (!strncmp(p, "force", 5)) + gart_iommu_aperture_allowed = 1; + if (!strncmp(p, "allowed", 7)) + gart_iommu_aperture_allowed = 1; + if (!strncmp(p, "memaper", 7)) { + fallback_aper_force = 1; + p += 7; + if (*p == '=') { + ++p; + if (get_option(&p, &arg)) + fallback_aper_order = arg; + } + } +} +IOMMU_INIT_POST(gart_iommu_hole_init); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c new file mode 100644 index 000000000..18f6b7c4b --- /dev/null +++ b/arch/x86/kernel/amd_nb.c @@ -0,0 +1,541 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared support code for AMD K8 northbridges and derivates. + * Copyright 2006 Andi Kleen, SUSE Labs. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 +#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 +#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 +#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 +#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 +#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec +#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 +#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c +#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 +#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 + +/* Protect the PCI config register pairs used for SMN and DF indirect access. */ +static DEFINE_MUTEX(smn_mutex); + +static u32 *flush_words; + +static const struct pci_device_id amd_root_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, + {} +}; + +#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704 + +static const struct pci_device_id amd_nb_misc_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, + {} +}; + +static const struct pci_device_id amd_nb_link_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, + {} +}; + +static const struct pci_device_id hygon_root_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_ROOT) }, + {} +}; + +static const struct pci_device_id hygon_nb_misc_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, + {} +}; + +static const struct pci_device_id hygon_nb_link_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F4) }, + {} +}; + +const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { + { 0x00, 0x18, 0x20 }, + { 0xff, 0x00, 0x20 }, + { 0xfe, 0x00, 0x20 }, + { } +}; + +static struct amd_northbridge_info amd_northbridges; + +u16 amd_nb_num(void) +{ + return amd_northbridges.num; +} +EXPORT_SYMBOL_GPL(amd_nb_num); + +bool amd_nb_has_feature(unsigned int feature) +{ + return ((amd_northbridges.flags & feature) == feature); +} +EXPORT_SYMBOL_GPL(amd_nb_has_feature); + +struct amd_northbridge *node_to_amd_nb(int node) +{ + return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; +} +EXPORT_SYMBOL_GPL(node_to_amd_nb); + +static struct pci_dev *next_northbridge(struct pci_dev *dev, + const struct pci_device_id *ids) +{ + do { + dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); + if (!dev) + break; + } while (!pci_match_id(ids, dev)); + return dev; +} + +static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) +{ + struct pci_dev *root; + int err = -ENODEV; + + if (node >= amd_northbridges.num) + goto out; + + root = node_to_amd_nb(node)->root; + if (!root) + goto out; + + mutex_lock(&smn_mutex); + + err = pci_write_config_dword(root, 0x60, address); + if (err) { + pr_warn("Error programming SMN address 0x%x.\n", address); + goto out_unlock; + } + + err = (write ? pci_write_config_dword(root, 0x64, *value) + : pci_read_config_dword(root, 0x64, value)); + if (err) + pr_warn("Error %s SMN address 0x%x.\n", + (write ? "writing to" : "reading from"), address); + +out_unlock: + mutex_unlock(&smn_mutex); + +out: + return err; +} + +int amd_smn_read(u16 node, u32 address, u32 *value) +{ + return __amd_smn_rw(node, address, value, false); +} +EXPORT_SYMBOL_GPL(amd_smn_read); + +int amd_smn_write(u16 node, u32 address, u32 value) +{ + return __amd_smn_rw(node, address, &value, true); +} +EXPORT_SYMBOL_GPL(amd_smn_write); + +/* + * Data Fabric Indirect Access uses FICAA/FICAD. + * + * Fabric Indirect Configuration Access Address (FICAA): Constructed based + * on the device's Instance Id and the PCI function and register offset of + * the desired register. + * + * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO + * and FICAD HI registers but so far we only need the LO register. + */ +int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) +{ + struct pci_dev *F4; + u32 ficaa; + int err = -ENODEV; + + if (node >= amd_northbridges.num) + goto out; + + F4 = node_to_amd_nb(node)->link; + if (!F4) + goto out; + + ficaa = 1; + ficaa |= reg & 0x3FC; + ficaa |= (func & 0x7) << 11; + ficaa |= instance_id << 16; + + mutex_lock(&smn_mutex); + + err = pci_write_config_dword(F4, 0x5C, ficaa); + if (err) { + pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); + goto out_unlock; + } + + err = pci_read_config_dword(F4, 0x98, lo); + if (err) + pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); + +out_unlock: + mutex_unlock(&smn_mutex); + +out: + return err; +} +EXPORT_SYMBOL_GPL(amd_df_indirect_read); + +int amd_cache_northbridges(void) +{ + const struct pci_device_id *misc_ids = amd_nb_misc_ids; + const struct pci_device_id *link_ids = amd_nb_link_ids; + const struct pci_device_id *root_ids = amd_root_ids; + struct pci_dev *root, *misc, *link; + struct amd_northbridge *nb; + u16 roots_per_misc = 0; + u16 misc_count = 0; + u16 root_count = 0; + u16 i, j; + + if (amd_northbridges.num) + return 0; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + root_ids = hygon_root_ids; + misc_ids = hygon_nb_misc_ids; + link_ids = hygon_nb_link_ids; + } + + misc = NULL; + while ((misc = next_northbridge(misc, misc_ids)) != NULL) + misc_count++; + + if (!misc_count) + return -ENODEV; + + root = NULL; + while ((root = next_northbridge(root, root_ids)) != NULL) + root_count++; + + if (root_count) { + roots_per_misc = root_count / misc_count; + + /* + * There should be _exactly_ N roots for each DF/SMN + * interface. + */ + if (!roots_per_misc || (root_count % roots_per_misc)) { + pr_info("Unsupported AMD DF/PCI configuration found\n"); + return -ENODEV; + } + } + + nb = kcalloc(misc_count, sizeof(struct amd_northbridge), GFP_KERNEL); + if (!nb) + return -ENOMEM; + + amd_northbridges.nb = nb; + amd_northbridges.num = misc_count; + + link = misc = root = NULL; + for (i = 0; i < amd_northbridges.num; i++) { + node_to_amd_nb(i)->root = root = + next_northbridge(root, root_ids); + node_to_amd_nb(i)->misc = misc = + next_northbridge(misc, misc_ids); + node_to_amd_nb(i)->link = link = + next_northbridge(link, link_ids); + + /* + * If there are more PCI root devices than data fabric/ + * system management network interfaces, then the (N) + * PCI roots per DF/SMN interface are functionally the + * same (for DF/SMN access) and N-1 are redundant. N-1 + * PCI roots should be skipped per DF/SMN interface so + * the following DF/SMN interfaces get mapped to + * correct PCI roots. + */ + for (j = 1; j < roots_per_misc; j++) + root = next_northbridge(root, root_ids); + } + + if (amd_gart_present()) + amd_northbridges.flags |= AMD_NB_GART; + + /* + * Check for L3 cache presence. + */ + if (!cpuid_edx(0x80000006)) + return 0; + + /* + * Some CPU families support L3 Cache Index Disable. There are some + * limitations because of E382 and E388 on family 0x10. + */ + if (boot_cpu_data.x86 == 0x10 && + boot_cpu_data.x86_model >= 0x8 && + (boot_cpu_data.x86_model > 0x9 || + boot_cpu_data.x86_stepping >= 0x1)) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; + + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; + + /* L3 cache partitioning is supported on family 0x15 */ + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_PARTITIONING; + + return 0; +} +EXPORT_SYMBOL_GPL(amd_cache_northbridges); + +/* + * Ignores subdevice/subvendor but as far as I can figure out + * they're useless anyways + */ +bool __init early_is_amd_nb(u32 device) +{ + const struct pci_device_id *misc_ids = amd_nb_misc_ids; + const struct pci_device_id *id; + u32 vendor = device & 0xffff; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return false; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + misc_ids = hygon_nb_misc_ids; + + device >>= 16; + for (id = misc_ids; id->vendor; id++) + if (vendor == id->vendor && device == id->device) + return true; + return false; +} + +struct resource *amd_get_mmconfig_range(struct resource *res) +{ + u32 address; + u64 base, msr; + unsigned int segn_busn_bits; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return NULL; + + /* assume all cpus from fam10h have mmconfig */ + if (boot_cpu_data.x86 < 0x10) + return NULL; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, msr); + + /* mmconfig is not enabled */ + if (!(msr & FAM10H_MMIO_CONF_ENABLE)) + return NULL; + + base = msr & (FAM10H_MMIO_CONF_BASE_MASK<> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + res->flags = IORESOURCE_MEM; + res->start = base; + res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1; + return res; +} + +int amd_get_subcaches(int cpu) +{ + struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; + unsigned int mask; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return 0; + + pci_read_config_dword(link, 0x1d4, &mask); + + return (mask >> (4 * cpu_data(cpu).cpu_core_id)) & 0xf; +} + +int amd_set_subcaches(int cpu, unsigned long mask) +{ + static unsigned int reset, ban; + struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); + unsigned int reg; + int cuid; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) + return -EINVAL; + + /* if necessary, collect reset state of L3 partitioning and BAN mode */ + if (reset == 0) { + pci_read_config_dword(nb->link, 0x1d4, &reset); + pci_read_config_dword(nb->misc, 0x1b8, &ban); + ban &= 0x180000; + } + + /* deactivate BAN mode if any subcaches are to be disabled */ + if (mask != 0xf) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); + } + + cuid = cpu_data(cpu).cpu_core_id; + mask <<= 4 * cuid; + mask |= (0xf ^ (1 << cuid)) << 26; + + pci_write_config_dword(nb->link, 0x1d4, mask); + + /* reset BAN mode if L3 partitioning returned to reset state */ + pci_read_config_dword(nb->link, 0x1d4, ®); + if (reg == reset) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + reg &= ~0x180000; + pci_write_config_dword(nb->misc, 0x1b8, reg | ban); + } + + return 0; +} + +static void amd_cache_gart(void) +{ + u16 i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL); + if (!flush_words) { + amd_northbridges.flags &= ~AMD_NB_GART; + pr_notice("Cannot initialize GART flush words, GART support disabled\n"); + return; + } + + for (i = 0; i != amd_northbridges.num; i++) + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]); +} + +void amd_flush_garts(void) +{ + int flushed, i; + unsigned long flags; + static DEFINE_SPINLOCK(gart_lock); + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + /* + * Avoid races between AGP and IOMMU. In theory it's not needed + * but I'm not sure if the hardware won't lose flush requests + * when another is pending. This whole thing is so expensive anyways + * that it doesn't matter to serialize more. -AK + */ + spin_lock_irqsave(&gart_lock, flags); + flushed = 0; + for (i = 0; i < amd_northbridges.num; i++) { + pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, + flush_words[i] | 1); + flushed++; + } + for (i = 0; i < amd_northbridges.num; i++) { + u32 w; + /* Make sure the hardware actually executed the flush*/ + for (;;) { + pci_read_config_dword(node_to_amd_nb(i)->misc, + 0x9c, &w); + if (!(w & 1)) + break; + cpu_relax(); + } + } + spin_unlock_irqrestore(&gart_lock, flags); + if (!flushed) + pr_notice("nothing to flush?\n"); +} +EXPORT_SYMBOL_GPL(amd_flush_garts); + +static void __fix_erratum_688(void *info) +{ +#define MSR_AMD64_IC_CFG 0xC0011021 + + msr_set_bit(MSR_AMD64_IC_CFG, 3); + msr_set_bit(MSR_AMD64_IC_CFG, 14); +} + +/* Apply erratum 688 fix so machines without a BIOS fix work. */ +static __init void fix_erratum_688(void) +{ + struct pci_dev *F4; + u32 val; + + if (boot_cpu_data.x86 != 0x14) + return; + + if (!amd_northbridges.num) + return; + + F4 = node_to_amd_nb(0)->link; + if (!F4) + return; + + if (pci_read_config_dword(F4, 0x164, &val)) + return; + + if (val & BIT(2)) + return; + + on_each_cpu(__fix_erratum_688, NULL, 0); + + pr_info("x86/cpu/AMD: CPU erratum 688 worked around\n"); +} + +static __init int init_amd_nbs(void) +{ + amd_cache_northbridges(); + amd_cache_gart(); + + fix_erratum_688(); + + return 0; +} + +/* This has to go after the PCI subsystem */ +fs_initcall(init_amd_nbs); diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c new file mode 100644 index 000000000..263eeaddb --- /dev/null +++ b/arch/x86/kernel/apb_timer.c @@ -0,0 +1,347 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * apb_timer.c: Driver for Langwell APB timers + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * Note: + * Langwell is the south complex of Intel Moorestown MID platform. There are + * eight external timers in total that can be used by the operating system. + * The timer information, such as frequency and addresses, is provided to the + * OS via SFI tables. + * Timer interrupts are routed via FW/HW emulated IOAPIC independently via + * individual redirection table entries (RTE). + * Unlike HPET, there is no master counter, therefore one of the timers are + * used as clocksource. The overall allocation looks like: + * - timer 0 - NR_CPUs for per cpu timer + * - one timer for clocksource + * - one timer for watchdog driver. + * It is also worth notice that APB timer does not support true one-shot mode, + * free-running mode will be used here to emulate one-shot mode. + * APB timer can also be used as broadcast timer along with per cpu local APIC + * timer, but by default APB timer has higher rating than local APIC timers. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define APBT_CLOCKEVENT_RATING 110 +#define APBT_CLOCKSOURCE_RATING 250 + +#define APBT_CLOCKEVENT0_NUM (0) +#define APBT_CLOCKSOURCE_NUM (2) + +static phys_addr_t apbt_address; +static int apb_timer_block_enabled; +static void __iomem *apbt_virt_address; + +/* + * Common DW APB timer info + */ +static unsigned long apbt_freq; + +struct apbt_dev { + struct dw_apb_clock_event_device *timer; + unsigned int num; + int cpu; + unsigned int irq; + char name[10]; +}; + +static struct dw_apb_clocksource *clocksource_apbt; + +static inline void __iomem *adev_virt_addr(struct apbt_dev *adev) +{ + return apbt_virt_address + adev->num * APBTMRS_REG_SIZE; +} + +static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); + +#ifdef CONFIG_SMP +static unsigned int apbt_num_timers_used; +#endif + +static inline void apbt_set_mapping(void) +{ + struct sfi_timer_table_entry *mtmr; + int phy_cs_timer_id = 0; + + if (apbt_virt_address) { + pr_debug("APBT base already mapped\n"); + return; + } + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return; + } + apbt_address = (phys_addr_t)mtmr->phys_addr; + if (!apbt_address) { + printk(KERN_WARNING "No timer base from SFI, use default\n"); + apbt_address = APBT_DEFAULT_BASE; + } + apbt_virt_address = ioremap(apbt_address, APBT_MMAP_SIZE); + if (!apbt_virt_address) { + pr_debug("Failed mapping APBT phy address at %lu\n",\ + (unsigned long)apbt_address); + goto panic_noapbt; + } + apbt_freq = mtmr->freq_hz; + sfi_free_mtmr(mtmr); + + /* Now figure out the physical timer id for clocksource device */ + mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); + if (mtmr == NULL) + goto panic_noapbt; + + /* Now figure out the physical timer id */ + pr_debug("Use timer %d for clocksource\n", + (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE); + phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) / + APBTMRS_REG_SIZE; + + clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING, + "apbt0", apbt_virt_address + phy_cs_timer_id * + APBTMRS_REG_SIZE, apbt_freq); + return; + +panic_noapbt: + panic("Failed to setup APB system timer\n"); + +} + +static inline void apbt_clear_mapping(void) +{ + iounmap(apbt_virt_address); + apbt_virt_address = NULL; +} + +static int __init apbt_clockevent_register(void) +{ + struct sfi_timer_table_entry *mtmr; + struct apbt_dev *adev = this_cpu_ptr(&cpu_apbt_dev); + + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return -ENODEV; + } + + adev->num = smp_processor_id(); + adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0", + intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ? + APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING, + adev_virt_addr(adev), 0, apbt_freq); + /* Firmware does EOI handling for us. */ + adev->timer->eoi = NULL; + + if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) { + global_clock_event = &adev->timer->ced; + printk(KERN_DEBUG "%s clockevent registered as global\n", + global_clock_event->name); + } + + dw_apb_clockevent_register(adev->timer); + + sfi_free_mtmr(mtmr); + return 0; +} + +#ifdef CONFIG_SMP + +static void apbt_setup_irq(struct apbt_dev *adev) +{ + irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); +} + +/* Should be called with per cpu */ +void apbt_setup_secondary_clock(void) +{ + struct apbt_dev *adev; + int cpu; + + /* Don't register boot CPU clockevent */ + cpu = smp_processor_id(); + if (!cpu) + return; + + adev = this_cpu_ptr(&cpu_apbt_dev); + if (!adev->timer) { + adev->timer = dw_apb_clockevent_init(cpu, adev->name, + APBT_CLOCKEVENT_RATING, adev_virt_addr(adev), + adev->irq, apbt_freq); + adev->timer->eoi = NULL; + } else { + dw_apb_clockevent_resume(adev->timer); + } + + printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n", + cpu, adev->name, adev->cpu); + + apbt_setup_irq(adev); + dw_apb_clockevent_register(adev->timer); + + return; +} + +/* + * this notify handler process CPU hotplug events. in case of S0i3, nonboot + * cpus are disabled/enabled frequently, for performance reasons, we keep the + * per cpu timer irq registered so that we do need to do free_irq/request_irq. + * + * TODO: it might be more reliable to directly disable percpu clockevent device + * without the notifier chain. currently, cpu 0 may get interrupts from other + * cpu timers during the offline process due to the ordering of notification. + * the extra interrupt is harmless. + */ +static int apbt_cpu_dead(unsigned int cpu) +{ + struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); + + dw_apb_clockevent_pause(adev->timer); + if (system_state == SYSTEM_RUNNING) { + pr_debug("skipping APBT CPU %u offline\n", cpu); + } else { + pr_debug("APBT clockevent for cpu %u offline\n", cpu); + dw_apb_clockevent_stop(adev->timer); + } + return 0; +} + +static __init int apbt_late_init(void) +{ + if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT || + !apb_timer_block_enabled) + return 0; + return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "x86/apb:dead", NULL, + apbt_cpu_dead); +} +fs_initcall(apbt_late_init); +#else + +void apbt_setup_secondary_clock(void) {} + +#endif /* CONFIG_SMP */ + +static int apbt_clocksource_register(void) +{ + u64 start, now; + u64 t1; + + /* Start the counter, use timer 2 as source, timer 0/1 for event */ + dw_apb_clocksource_start(clocksource_apbt); + + /* Verify whether apbt counter works */ + t1 = dw_apb_clocksource_read(clocksource_apbt); + start = rdtsc(); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + now = rdtsc(); + } while ((now - start) < 200000UL); + + /* APBT is the only always on clocksource, it has to work! */ + if (t1 == dw_apb_clocksource_read(clocksource_apbt)) + panic("APBT counter not counting. APBT disabled\n"); + + dw_apb_clocksource_register(clocksource_apbt); + + return 0; +} + +/* + * Early setup the APBT timer, only use timer 0 for booting then switch to + * per CPU timer if possible. + * returns 1 if per cpu apbt is setup + * returns 0 if no per cpu apbt is chosen + * panic if set up failed, this is the only platform timer on Moorestown. + */ +void __init apbt_time_init(void) +{ +#ifdef CONFIG_SMP + int i; + struct sfi_timer_table_entry *p_mtmr; + struct apbt_dev *adev; +#endif + + if (apb_timer_block_enabled) + return; + apbt_set_mapping(); + if (!apbt_virt_address) + goto out_noapbt; + /* + * Read the frequency and check for a sane value, for ESL model + * we extend the possible clock range to allow time scaling. + */ + + if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { + pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq); + goto out_noapbt; + } + if (apbt_clocksource_register()) { + pr_debug("APBT has failed to register clocksource\n"); + goto out_noapbt; + } + if (!apbt_clockevent_register()) + apb_timer_block_enabled = 1; + else { + pr_debug("APBT has failed to register clockevent\n"); + goto out_noapbt; + } +#ifdef CONFIG_SMP + /* kernel cmdline disable apb timer, so we will use lapic timers */ + if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) { + printk(KERN_INFO "apbt: disabled per cpu timer\n"); + return; + } + pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); + if (num_possible_cpus() <= sfi_mtimer_num) + apbt_num_timers_used = num_possible_cpus(); + else + apbt_num_timers_used = 1; + pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); + + /* here we set up per CPU timer data structure */ + for (i = 0; i < apbt_num_timers_used; i++) { + adev = &per_cpu(cpu_apbt_dev, i); + adev->num = i; + adev->cpu = i; + p_mtmr = sfi_get_mtmr(i); + if (p_mtmr) + adev->irq = p_mtmr->irq; + else + printk(KERN_ERR "Failed to get timer for cpu %d\n", i); + snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i); + } +#endif + + return; + +out_noapbt: + apbt_clear_mapping(); + apb_timer_block_enabled = 0; + panic("failed to enable APB timer\n"); +} diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c new file mode 100644 index 000000000..294ed4392 --- /dev/null +++ b/arch/x86/kernel/aperture_64.c @@ -0,0 +1,556 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Firmware replacement code. + * + * Work around broken BIOSes that don't set an aperture, only set the + * aperture in the AGP bridge, or set too small aperture. + * + * If all fails map the aperture over some low memory. This is cheaper than + * doing bounce buffering. The memory is lost. This is done at early boot + * because only the bootmem allocator can allocate 32+MB. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + */ +#define pr_fmt(fmt) "AGP: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Using 512M as goal, in case kexec will load kernel_big + * that will do the on-position decompress, and could overlap with + * with the gart aperture that is used. + * Sequence: + * kernel_small + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kernel_small (gart area become e820_reserved) + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kerne_big (uncompressed size will be big than 64M or 128M) + * So don't use 512M below as gart iommu, leave the space for kernel + * code for safe. + */ +#define GART_MIN_ADDR (512ULL << 20) +#define GART_MAX_ADDR (1ULL << 32) + +int gart_iommu_aperture; +int gart_iommu_aperture_disabled __initdata; +int gart_iommu_aperture_allowed __initdata; + +int fallback_aper_order __initdata = 1; /* 64MB */ +int fallback_aper_force __initdata; + +int fix_aperture __initdata = 1; + +#if defined(CONFIG_PROC_VMCORE) || defined(CONFIG_PROC_KCORE) +/* + * If the first kernel maps the aperture over e820 RAM, the kdump kernel will + * use the same range because it will remain configured in the northbridge. + * Trying to dump this area via /proc/vmcore may crash the machine, so exclude + * it from vmcore. + */ +static unsigned long aperture_pfn_start, aperture_page_count; + +static int gart_mem_pfn_is_ram(unsigned long pfn) +{ + return likely((pfn < aperture_pfn_start) || + (pfn >= aperture_pfn_start + aperture_page_count)); +} + +static void __init exclude_from_core(u64 aper_base, u32 aper_order) +{ + aperture_pfn_start = aper_base >> PAGE_SHIFT; + aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT; +#ifdef CONFIG_PROC_VMCORE + WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram)); +#endif +#ifdef CONFIG_PROC_KCORE + WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram)); +#endif +} +#else +static void exclude_from_core(u64 aper_base, u32 aper_order) +{ +} +#endif + +/* This code runs before the PCI subsystem is initialized, so just + access the northbridge directly. */ + +static u32 __init allocate_aperture(void) +{ + u32 aper_size; + unsigned long addr; + + /* aper_size should <= 1G */ + if (fallback_aper_order > 5) + fallback_aper_order = 5; + aper_size = (32 * 1024 * 1024) << fallback_aper_order; + + /* + * Aperture has to be naturally aligned. This means a 2GB aperture + * won't have much chance of finding a place in the lower 4GB of + * memory. Unfortunately we cannot move it up because that would + * make the IOMMU useless. + */ + addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, + aper_size, aper_size); + if (!addr) { + pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); + return 0; + } + memblock_reserve(addr, aper_size); + pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); + register_nosave_region(addr >> PAGE_SHIFT, + (addr+aper_size) >> PAGE_SHIFT); + + return (u32)addr; +} + + +/* Find a PCI capability */ +static u32 __init find_cap(int bus, int slot, int func, int cap) +{ + int bytes; + u8 pos; + + if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) + return 0; + + pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + + pos &= ~3; + id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + pos = read_pci_config_byte(bus, slot, func, + pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +/* Read a standard AGPv3 bridge header */ +static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) +{ + u32 apsize; + u32 apsizereg; + int nbits; + u32 aper_low, aper_hi; + u64 aper; + u32 old_order; + + pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func); + apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14); + if (apsizereg == 0xffffffff) { + pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n", + bus, slot, func); + return 0; + } + + /* old_order could be the value from NB gart setting */ + old_order = *order; + + apsize = apsizereg & 0xfff; + /* Some BIOS use weird encodings not in the AGPv3 table. */ + if (apsize & 0xff) + apsize |= 0xf00; + nbits = hweight16(apsize); + *order = 7 - nbits; + if ((int)*order < 0) /* < 32MB */ + *order = 0; + + aper_low = read_pci_config(bus, slot, func, 0x10); + aper_hi = read_pci_config(bus, slot, func, 0x14); + aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); + + /* + * On some sick chips, APSIZE is 0. It means it wants 4G + * so let double check that order, and lets trust AMD NB settings: + */ + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n", + bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1, + 32 << old_order); + if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) { + pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n", + bus, slot, func, 32 << *order, apsizereg); + *order = old_order; + } + + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n", + bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1, + 32 << *order, apsizereg); + + if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20)) + return 0; + return (u32)aper; +} + +/* + * Look for an AGP bridge. Windows only expects the aperture in the + * AGP bridge and some BIOS forget to initialize the Northbridge too. + * Work around this here. + * + * Do an PCI bus scan by hand because we're running before the PCI + * subsystem. + * + * All AMD AGP bridges are AGPv3 compliant, so we can do this scan + * generically. It's probably overkill to always scan all slots because + * the AGP bridges should be always an own bus on the HT hierarchy, + * but do it here for future safety. + */ +static u32 __init search_agp_bridge(u32 *order, int *valid_agp) +{ + int bus, slot, func; + + /* Poor man's PCI discovery */ + for (bus = 0; bus < 256; bus++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class, cap; + u8 type; + class = read_pci_config(bus, slot, func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + switch (class >> 16) { + case PCI_CLASS_BRIDGE_HOST: + case PCI_CLASS_BRIDGE_OTHER: /* needed? */ + /* AGP bridge? */ + cap = find_cap(bus, slot, func, + PCI_CAP_ID_AGP); + if (!cap) + break; + *valid_agp = 1; + return read_agp(bus, slot, func, cap, + order); + } + + /* No multi-function device? */ + type = read_pci_config_byte(bus, slot, func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } + pr_info("No AGP bridge found\n"); + + return 0; +} + +static bool gart_fix_e820 __initdata = true; + +static int __init parse_gart_mem(char *p) +{ + return kstrtobool(p, &gart_fix_e820); +} +early_param("gart_fix_e820", parse_gart_mem); + +/* + * With kexec/kdump, if the first kernel doesn't shut down the GART and the + * second kernel allocates a different GART region, there might be two + * overlapping GART regions present: + * + * - the first still used by the GART initialized in the first kernel. + * - (sub-)set of it used as normal RAM by the second kernel. + * + * which leads to memory corruptions and a kernel panic eventually. + * + * This can also happen if the BIOS has forgotten to mark the GART region + * as reserved. + * + * Try to update the e820 map to mark that new region as reserved. + */ +void __init early_gart_iommu_check(void) +{ + u32 agp_aper_order = 0; + int i, fix, slot, valid_agp = 0; + u32 ctl; + u32 aper_size = 0, aper_order = 0, last_aper_order = 0; + u64 aper_base = 0, last_aper_base = 0; + int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0; + + if (!amd_gart_present()) + return; + + if (!early_pci_allowed()) + return; + + /* This is mostly duplicate of iommu_hole_init */ + search_agp_bridge(&agp_aper_order, &valid_agp); + + fix = 0; + for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) { + int bus; + int dev_base, dev_limit; + + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); + aper_enabled = ctl & GARTEN; + aper_order = (ctl >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; + aper_base <<= 25; + + if (last_valid) { + if ((aper_order != last_aper_order) || + (aper_base != last_aper_base) || + (aper_enabled != last_aper_enabled)) { + fix = 1; + break; + } + } + + last_aper_order = aper_order; + last_aper_base = aper_base; + last_aper_enabled = aper_enabled; + last_valid = 1; + } + } + + if (!fix && !aper_enabled) + return; + + if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL) + fix = 1; + + if (gart_fix_e820 && !fix && aper_enabled) { + if (e820__mapped_any(aper_base, aper_base + aper_size, + E820_TYPE_RAM)) { + /* reserve it, so we can reuse it in second kernel */ + pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n", + aper_base, aper_base + aper_size - 1); + e820__range_add(aper_base, aper_size, E820_TYPE_RESERVED); + e820__update_table_print(); + } + } + + if (valid_agp) + return; + + /* disable them all at first */ + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { + int bus; + int dev_base, dev_limit; + + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); + ctl &= ~GARTEN; + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + } + } + +} + +static int __initdata printed_gart_size_msg; + +int __init gart_iommu_hole_init(void) +{ + u32 agp_aper_base = 0, agp_aper_order = 0; + u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; + u64 aper_base, last_aper_base = 0; + int fix, slot, valid_agp = 0; + int i, node; + + if (!amd_gart_present()) + return -ENODEV; + + if (gart_iommu_aperture_disabled || !fix_aperture || + !early_pci_allowed()) + return -ENODEV; + + pr_info("Checking aperture...\n"); + + if (!fallback_aper_force) + agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + + fix = 0; + node = 0; + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { + int bus; + int dev_base, dev_limit; + u32 ctl; + + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + iommu_detected = 1; + gart_iommu_aperture = 1; + x86_init.iommu.iommu_init = gart_iommu_init; + + ctl = read_pci_config(bus, slot, 3, + AMD64_GARTAPERTURECTL); + + /* + * Before we do anything else disable the GART. It may + * still be enabled if we boot into a crash-kernel here. + * Reconfiguring the GART while it is enabled could have + * unknown side-effects. + */ + ctl &= ~GARTEN; + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + + aper_order = (ctl >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; + aper_base <<= 25; + + pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n", + node, aper_base, aper_base + aper_size - 1, + aper_size >> 20); + node++; + + if (!aperture_valid(aper_base, aper_size, 64<<20)) { + if (valid_agp && agp_aper_base && + agp_aper_base == aper_base && + agp_aper_order == aper_order) { + /* the same between two setting from NB and agp */ + if (!no_iommu && + max_pfn > MAX_DMA32_PFN && + !printed_gart_size_msg) { + pr_err("you are using iommu with agp, but GART size is less than 64MB\n"); + pr_err("please increase GART size in your BIOS setup\n"); + pr_err("if BIOS doesn't have that option, contact your HW vendor!\n"); + printed_gart_size_msg = 1; + } + } else { + fix = 1; + goto out; + } + } + + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base)) { + fix = 1; + goto out; + } + last_aper_order = aper_order; + last_aper_base = aper_base; + } + } + +out: + if (!fix && !fallback_aper_force) { + if (last_aper_base) { + /* + * If this is the kdump kernel, the first kernel + * may have allocated the range over its e820 RAM + * and fixed up the northbridge + */ + exclude_from_core(last_aper_base, last_aper_order); + + return 1; + } + return 0; + } + + if (!fallback_aper_force) { + aper_alloc = agp_aper_base; + aper_order = agp_aper_order; + } + + if (aper_alloc) { + /* Got the aperture from the AGP bridge */ + } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || + force_iommu || + valid_agp || + fallback_aper_force) { + pr_info("Your BIOS doesn't leave an aperture memory hole\n"); + pr_info("Please enable the IOMMU option in the BIOS setup\n"); + pr_info("This costs you %dMB of RAM\n", + 32 << fallback_aper_order); + + aper_order = fallback_aper_order; + aper_alloc = allocate_aperture(); + if (!aper_alloc) { + /* + * Could disable AGP and IOMMU here, but it's + * probably not worth it. But the later users + * cannot deal with bad apertures and turning + * on the aperture over memory causes very + * strange problems, so it's better to panic + * early. + */ + panic("Not enough memory for aperture"); + } + } else { + return 0; + } + + /* + * If this is the kdump kernel _and_ the first kernel did not + * configure the aperture in the northbridge, this range may + * overlap with the first kernel's memory. We can't access the + * range through vmcore even though it should be part of the dump. + */ + exclude_from_core(aper_alloc, aper_order); + + /* Fix up the north bridges */ + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { + int bus, dev_base, dev_limit; + + /* + * Don't enable translation yet but enable GART IO and CPU + * accesses and set DISTLBWALKPRB since GART table memory is UC. + */ + u32 ctl = aper_order << 1; + + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); + } + } + + set_up_gart_resume(aper_order, aper_alloc); + + return 1; +} diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile new file mode 100644 index 000000000..a6fcaf16c --- /dev/null +++ b/arch/x86/kernel/apic/Makefile @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for local APIC drivers and for the IO-APIC code +# + +# Leads to non-deterministic coverage that is not a function of syscall inputs. +# In particualr, smp_apic_timer_interrupt() is called in random places. +KCOV_INSTRUMENT := n + +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o +obj-y += hw_nmi.o + +obj-$(CONFIG_X86_IO_APIC) += io_apic.o +obj-$(CONFIG_PCI_MSI) += msi.o +obj-$(CONFIG_SMP) += ipi.o + +ifeq ($(CONFIG_X86_64),y) +# APIC probe will depend on the listing order here +obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o +obj-$(CONFIG_X86_UV) += x2apic_uv_x.o +obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o +obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o +obj-y += apic_flat_64.o +endif + +# APIC probe will depend on the listing order here +obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o + +# For 32bit, probe_32 need to be listed last +obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c new file mode 100644 index 000000000..25eb69f26 --- /dev/null +++ b/arch/x86/kernel/apic/apic.c @@ -0,0 +1,2896 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Local APIC handling, local APIC timers + * + * (c) 1999, 2000, 2009 Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively. + * Maciej W. Rozycki : Various updates and fixes. + * Mikael Pettersson : Power Management for UP-APIC. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int num_processors; + +unsigned disabled_cpus; + +/* Processor that is doing the boot up */ +unsigned int boot_cpu_physical_apicid __ro_after_init = -1U; +EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); + +u8 boot_cpu_apic_version __ro_after_init; + +/* + * The highest APIC ID seen during enumeration. + */ +static unsigned int max_physical_apicid; + +/* + * Bitmask of physically existing CPUs: + */ +physid_mask_t phys_cpu_present_map; + +/* + * Processor to be disabled specified by kernel parameter + * disable_cpu_apicid=, mostly used for the kdump 2nd kernel to + * avoid undefined behaviour caused by sending INIT from AP to BSP. + */ +static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID; + +/* + * This variable controls which CPUs receive external NMIs. By default, + * external NMIs are delivered only to the BSP. + */ +static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP; + +/* + * Map cpu index to physical APIC ID + */ +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); + +#ifdef CONFIG_X86_32 + +/* + * On x86_32, the mapping between cpu and logical apicid may vary + * depending on apic in use. The following early percpu variable is + * used for the mapping. This is where the behaviors of x86_64 and 32 + * actually diverge. Let's keep it ugly for now. + */ +DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); + +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase __ro_after_init; + +/* + * Handle interrupt mode configuration register (IMCR). + * This register controls whether the interrupt signals + * that reach the BSP come from the master PIC or from the + * local APIC. Before entering Symmetric I/O Mode, either + * the BIOS or the operating system must switch out of + * PIC Mode by changing the IMCR. + */ +static inline void imcr_pic_to_apic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go through APIC */ + outb(0x01, 0x23); +} + +static inline void imcr_apic_to_pic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go directly to BSP */ + outb(0x00, 0x23); +} +#endif + +/* + * Knob to control our willingness to enable the local APIC. + * + * +1=force-enable + */ +static int force_enable_local_apic __initdata; + +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + if (IS_ENABLED(CONFIG_X86_32) && !arg) + force_enable_local_apic = 1; + else if (arg && !strncmp(arg, "notscdeadline", 13)) + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return 0; +} +early_param("lapic", parse_lapic); + +#ifdef CONFIG_X86_64 +static int apic_calibrate_pmtmr __initdata; +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return 1; +} +__setup("apicpmtimer", setup_apicpmtimer); +#endif + +unsigned long mp_lapic_addr __ro_after_init; +int disable_apic __ro_after_init; +/* Disable local APIC timer from the kernel commandline or via dmi quirk */ +static int disable_apic_timer __initdata; +/* Local APIC timer works in C2 */ +int local_apic_timer_c2_ok __ro_after_init; +EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); + +/* + * Debug level, exported for io_apic.c + */ +int apic_verbosity __ro_after_init; + +int pic_mode __ro_after_init; + +/* Have we found an MP table */ +int smp_found_config __ro_after_init; + +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + +unsigned int lapic_timer_period = 0; + +static void apic_pm_activate(void); + +static unsigned long apic_phys __ro_after_init; + +/* + * Get the LAPIC version + */ +static inline int lapic_get_version(void) +{ + return GET_APIC_VERSION(apic_read(APIC_LVR)); +} + +/* + * Check, if the APIC is integrated or a separate chip + */ +static inline int lapic_is_integrated(void) +{ + return APIC_INTEGRATED(lapic_get_version()); +} + +/* + * Check, whether this is a modern or a first generation APIC + */ +static int modern_apic(void) +{ + /* AMD systems use old APIC versions, so check the CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf) + return 1; + + /* Hygon systems use modern APIC */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + return 1; + + return lapic_get_version() >= 0x14; +} + +/* + * right after this call apic become NOOP driven + * so apic->write/read doesn't do anything + */ +static void __init apic_disable(void) +{ + pr_info("APIC: switched to apic NOOP\n"); + apic = &apic_noop; +} + +void native_apic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +u32 native_safe_apic_wait_icr_idle(void) +{ + u32 send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + inc_irq_stat(icr_read_retry_count); + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + +void native_apic_icr_write(u32 low, u32 id) +{ + unsigned long flags; + + local_irq_save(flags); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR, low); + local_irq_restore(flags); +} + +u64 native_apic_icr_read(void) +{ + u32 icr1, icr2; + + icr2 = apic_read(APIC_ICR2); + icr1 = apic_read(APIC_ICR); + + return icr1 | ((u64)icr2 << 32); +} + +#ifdef CONFIG_X86_32 +/** + * get_physical_broadcast - Get number of physical broadcast IDs + */ +int get_physical_broadcast(void) +{ + return modern_apic() ? 0xff : 0xf; +} +#endif + +/** + * lapic_get_maxlvt - get the maximum number of local vector table entries + */ +int lapic_get_maxlvt(void) +{ + /* + * - we always have APIC integrated on 64bit mode + * - 82489DXs do not report # of LVT entries + */ + return lapic_is_integrated() ? GET_APIC_MAXLVT(apic_read(APIC_LVR)) : 2; +} + +/* + * Local APIC timer + */ + +/* Clock divisor */ +#define APIC_DIVISOR 16 +#define TSC_DIVISOR 8 + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) +{ + unsigned int lvtt_value, tmp_value; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; + + if (!lapic_is_integrated()) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + + if (!irqen) + lvtt_value |= APIC_LVT_MASKED; + + apic_write(APIC_LVTT, lvtt_value); + + if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) { + /* + * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode, + * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized. + * According to Intel, MFENCE can do the serialization here. + */ + asm volatile("mfence" : : : "memory"); + return; + } + + /* + * Divide PICLK by 16 + */ + tmp_value = apic_read(APIC_TDCR); + apic_write(APIC_TDCR, + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); + + if (!oneshot) + apic_write(APIC_TMICT, clocks / APIC_DIVISOR); +} + +/* + * Setup extended LVT, AMD specific + * + * Software should use the LVT offsets the BIOS provides. The offsets + * are determined by the subsystems using it like those for MCE + * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts + * are supported. Beginning with family 10h at least 4 offsets are + * available. + * + * Since the offsets must be consistent for all cores, we keep track + * of the LVT offsets in software and reserve the offset for the same + * vector also to be used on other cores. An offset is freed by + * setting the entry to APIC_EILVT_MASKED. + * + * If the BIOS is right, there should be no conflicts. Otherwise a + * "[Firmware Bug]: ..." error message is generated. However, if + * software does not properly determines the offsets, it is not + * necessarily a BIOS bug. + */ + +static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX]; + +static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new) +{ + return (old & APIC_EILVT_MASKED) + || (new == APIC_EILVT_MASKED) + || ((new & ~APIC_EILVT_MASKED) == old); +} + +static unsigned int reserve_eilvt_offset(int offset, unsigned int new) +{ + unsigned int rsvd, vector; + + if (offset >= APIC_EILVT_NR_MAX) + return ~0; + + rsvd = atomic_read(&eilvt_offsets[offset]); + do { + vector = rsvd & ~APIC_EILVT_MASKED; /* 0: unassigned */ + if (vector && !eilvt_entry_is_changeable(vector, new)) + /* may not change if vectors are different */ + return rsvd; + } while (!atomic_try_cmpxchg(&eilvt_offsets[offset], &rsvd, new)); + + rsvd = new & ~APIC_EILVT_MASKED; + if (rsvd && rsvd != vector) + pr_info("LVT offset %d assigned for vector 0x%02x\n", + offset, rsvd); + + return new; +} + +/* + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. Must be called with + * preemption disabled. + */ + +int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) +{ + unsigned long reg = APIC_EILVTn(offset); + unsigned int new, old, reserved; + + new = (mask << 16) | (msg_type << 8) | vector; + old = apic_read(reg); + reserved = reserve_eilvt_offset(offset, new); + + if (reserved != new) { + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on another cpu\n", + smp_processor_id(), reg, offset, new, reserved); + return -EINVAL; + } + + if (!eilvt_entry_is_changeable(old, new)) { + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on this cpu\n", + smp_processor_id(), reg, offset, new, old); + return -EBUSY; + } + + apic_write(reg, new); + + return 0; +} +EXPORT_SYMBOL_GPL(setup_APIC_eilvt); + +/* + * Program the next event, relative to now + */ +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + apic_write(APIC_TMICT, delta); + return 0; +} + +static int lapic_next_deadline(unsigned long delta, + struct clock_event_device *evt) +{ + u64 tsc; + + /* This MSR is special and need a special fence: */ + weak_wrmsr_fence(); + + tsc = rdtsc(); + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); + return 0; +} + +static int lapic_timer_shutdown(struct clock_event_device *evt) +{ + unsigned int v; + + /* Lapic used as dummy for broadcast ? */ + if (evt->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); + apic_write(APIC_TMICT, 0); + return 0; +} + +static inline int +lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot) +{ + /* Lapic used as dummy for broadcast ? */ + if (evt->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + __setup_APIC_LVTT(lapic_timer_period, oneshot, 1); + return 0; +} + +static int lapic_timer_set_periodic(struct clock_event_device *evt) +{ + return lapic_timer_set_periodic_oneshot(evt, false); +} + +static int lapic_timer_set_oneshot(struct clock_event_device *evt) +{ + return lapic_timer_set_periodic_oneshot(evt, true); +} + +/* + * Local APIC timer broadcast function + */ +static void lapic_timer_broadcast(const struct cpumask *mask) +{ +#ifdef CONFIG_SMP + apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR); +#endif +} + + +/* + * The local apic timer can be used for any function which is CPU local. + */ +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP + | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_state_shutdown = lapic_timer_shutdown, + .set_state_periodic = lapic_timer_set_periodic, + .set_state_oneshot = lapic_timer_set_oneshot, + .set_state_oneshot_stopped = lapic_timer_shutdown, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + +static const struct x86_cpu_id deadline_match[] __initconst = { + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */ + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */ + + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), + + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003), + + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0), + + X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL_G, 0x17), + + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL, 0x25), + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_G, 0x17), + + X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_L, 0xb2), + X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE, 0xb2), + + X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE_L, 0x52), + X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE, 0x52), + + {}, +}; + +static __init bool apic_validate_deadline_timer(void) +{ + const struct x86_cpu_id *m; + u32 rev; + + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return false; + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return true; + + m = x86_match_cpu(deadline_match); + if (!m) + return true; + + rev = (u32)m->driver_data; + + if (boot_cpu_data.microcode >= rev) + return true; + + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; " + "please update microcode to version: 0x%x (or later)\n", rev); + return false; +} + +/* + * Setup the local APIC timer for this CPU. Copy the initialized values + * of the boot CPU and register the clock event in the framework. + */ +static void setup_APIC_timer(void) +{ + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); + + if (this_cpu_has(X86_FEATURE_ARAT)) { + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; + /* Make LAPIC timer preferrable over percpu HPET */ + lapic_clockevent.rating = 150; + } + + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of(smp_processor_id()); + + if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + levt->name = "lapic-deadline"; + levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_DUMMY); + levt->set_next_event = lapic_next_deadline; + clockevents_config_and_register(levt, + tsc_khz * (1000 / TSC_DIVISOR), + 0xF, ~0UL); + } else + clockevents_register_device(levt); +} + +/* + * Install the updated TSC frequency from recalibration at the TSC + * deadline clockevent devices. + */ +static void __lapic_update_tsc_freq(void *info) +{ + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); + + if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return; + + clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR)); +} + +void lapic_update_tsc_freq(void) +{ + /* + * The clockevent device's ->mult and ->shift can both be + * changed. In order to avoid races, schedule the frequency + * update code on each CPU. + */ + on_each_cpu(__lapic_update_tsc_freq, NULL, 0); +} + +/* + * In this functions we calibrate APIC bus clocks to the external timer. + * + * We want to do the calibration only once since we want to have local timer + * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * frequency. + * + * This was previously done by reading the PIT/HPET and waiting for a wrap + * around to find out, that a tick has elapsed. I have a box, where the PIT + * readout is broken, so it never gets out of the wait loop again. This was + * also reported by others. + * + * Monitoring the jiffies value is inaccurate and the clockevents + * infrastructure allows us to do a simple substitution of the interrupt + * handler. + * + * The calibration routine also uses the pm_timer when possible, as the PIT + * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes + * back to normal later in the boot process). + */ + +#define LAPIC_CAL_LOOPS (HZ/10) + +static __initdata int lapic_cal_loops = -1; +static __initdata long lapic_cal_t1, lapic_cal_t2; +static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; +static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; + +/* + * Temporary interrupt handler and polled calibration function. + */ +static void __init lapic_cal_handler(struct clock_event_device *dev) +{ + unsigned long long tsc = 0; + long tapic = apic_read(APIC_TMCCT); + unsigned long pm = acpi_pm_read_early(); + + if (boot_cpu_has(X86_FEATURE_TSC)) + tsc = rdtsc(); + + switch (lapic_cal_loops++) { + case 0: + lapic_cal_t1 = tapic; + lapic_cal_tsc1 = tsc; + lapic_cal_pm1 = pm; + lapic_cal_j1 = jiffies; + break; + + case LAPIC_CAL_LOOPS: + lapic_cal_t2 = tapic; + lapic_cal_tsc2 = tsc; + if (pm < lapic_cal_pm1) + pm += ACPI_PM_OVRRUN; + lapic_cal_pm2 = pm; + lapic_cal_j2 = jiffies; + break; + } +} + +static int __init +calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) +{ + const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; + const long pm_thresh = pm_100ms / 100; + unsigned long mult; + u64 res; + +#ifndef CONFIG_X86_PM_TIMER + return -1; +#endif + + apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm); + + /* Check, if the PM timer is available */ + if (!deltapm) + return -1; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n"); + return 0; + } + + res = (((u64)deltapm) * mult) >> 22; + do_div(res, 1000000); + pr_warn("APIC calibration not consistent " + "with PM-Timer: %ldms instead of 100ms\n", (long)res); + + /* Correct the lapic counter value */ + res = (((u64)(*delta)) * pm_100ms); + do_div(res, deltapm); + pr_info("APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long)res, *delta); + *delta = (long)res; + + /* Correct the tsc counter value */ + if (boot_cpu_has(X86_FEATURE_TSC)) { + res = (((u64)(*deltatsc)) * pm_100ms); + do_div(res, deltapm); + apic_printk(APIC_VERBOSE, "TSC delta adjusted to " + "PM-Timer: %lu (%ld)\n", + (unsigned long)res, *deltatsc); + *deltatsc = (long)res; + } + + return 0; +} + +static int __init lapic_init_clockevent(void) +{ + if (!lapic_timer_period) + return -1; + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(lapic_timer_period/APIC_DIVISOR, + TICK_NSEC, lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; + + return 0; +} + +bool __init apic_needs_pit(void) +{ + /* + * If the frequencies are not known, PIT is required for both TSC + * and apic timer calibration. + */ + if (!tsc_khz || !cpu_khz) + return true; + + /* Is there an APIC at all or is it disabled? */ + if (!boot_cpu_has(X86_FEATURE_APIC) || disable_apic) + return true; + + /* + * If interrupt delivery mode is legacy PIC or virtual wire without + * configuration, the local APIC timer wont be set up. Make sure + * that the PIT is initialized. + */ + if (apic_intr_mode == APIC_PIC || + apic_intr_mode == APIC_VIRTUAL_WIRE_NO_CONFIG) + return true; + + /* Virt guests may lack ARAT, but still have DEADLINE */ + if (!boot_cpu_has(X86_FEATURE_ARAT)) + return true; + + /* Deadline timer is based on TSC so no further PIT action required */ + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return false; + + /* APIC timer disabled? */ + if (disable_apic_timer) + return true; + /* + * The APIC timer frequency is known already, no PIT calibration + * required. If unknown, let the PIT be initialized. + */ + return lapic_timer_period == 0; +} + +static int __init calibrate_APIC_clock(void) +{ + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); + u64 tsc_perj = 0, tsc_start = 0; + unsigned long jif_start; + unsigned long deltaj; + long delta, deltatsc; + int pm_referenced = 0; + + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return 0; + + /* + * Check if lapic timer has already been calibrated by platform + * specific routine, such as tsc calibration code. If so just fill + * in the clockevent structure and return. + */ + if (!lapic_init_clockevent()) { + apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", + lapic_timer_period); + /* + * Direct calibration methods must have an always running + * local APIC timer, no need for broadcast timer. + */ + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + return 0; + } + + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + /* + * There are platforms w/o global clockevent devices. Instead of + * making the calibration conditional on that, use a polling based + * approach everywhere. + */ + local_irq_disable(); + + /* + * Setup the APIC counter to maximum. There is no way the lapic + * can underflow in the 100ms detection time frame + */ + __setup_APIC_LVTT(0xffffffff, 0, 0); + + /* + * Methods to terminate the calibration loop: + * 1) Global clockevent if available (jiffies) + * 2) TSC if available and frequency is known + */ + jif_start = READ_ONCE(jiffies); + + if (tsc_khz) { + tsc_start = rdtsc(); + tsc_perj = div_u64((u64)tsc_khz * 1000, HZ); + } + + /* + * Enable interrupts so the tick can fire, if a global + * clockevent device is available + */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) { + /* Wait for a tick to elapse */ + while (1) { + if (tsc_khz) { + u64 tsc_now = rdtsc(); + if ((tsc_now - tsc_start) >= tsc_perj) { + tsc_start += tsc_perj; + break; + } + } else { + unsigned long jif_now = READ_ONCE(jiffies); + + if (time_after(jif_now, jif_start)) { + jif_start = jif_now; + break; + } + } + cpu_relax(); + } + + /* Invoke the calibration routine */ + local_irq_disable(); + lapic_cal_handler(NULL); + local_irq_enable(); + } + + local_irq_disable(); + + /* Build delta t1-t2 as apic timer counts down */ + delta = lapic_cal_t1 - lapic_cal_t2; + apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + + deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); + + /* we trust the PM based calibration if possible */ + pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, + &delta, &deltatsc); + + lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + lapic_init_clockevent(); + + apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); + apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", + lapic_timer_period); + + if (boot_cpu_has(X86_FEATURE_TSC)) { + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + } + + apic_printk(APIC_VERBOSE, "..... host bus clock speed is " + "%u.%04u MHz.\n", + lapic_timer_period / (1000000 / HZ), + lapic_timer_period % (1000000 / HZ)); + + /* + * Do a sanity check on the APIC calibration result + */ + if (lapic_timer_period < (1000000 / HZ)) { + local_irq_enable(); + pr_warn("APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + levt->features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* + * PM timer calibration failed or not turned on so lets try APIC + * timer based calibration, if a global clockevent device is + * available. + */ + if (!pm_referenced && global_clock_event) { + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + + /* + * Setup the apic timer manually + */ + levt->event_handler = lapic_cal_handler; + lapic_timer_set_periodic(levt); + lapic_cal_loops = -1; + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + /* Stop the lapic timer */ + local_irq_disable(); + lapic_timer_shutdown(levt); + + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + + /* Check, if the jiffies result is consistent */ + if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) + apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + else + levt->features |= CLOCK_EVT_FEAT_DUMMY; + } + local_irq_enable(); + + if (levt->features & CLOCK_EVT_FEAT_DUMMY) { + pr_warn("APIC timer disabled due to verification failure\n"); + return -1; + } + + return 0; +} + +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) +{ + /* + * The local apic timer can be disabled via the kernel + * commandline or from the CPU detection code. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. + */ + if (disable_apic_timer) { + pr_info("Disabling APIC timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; + setup_APIC_timer(); + } + return; + } + + if (calibrate_APIC_clock()) { + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* Setup the lapic or request the broadcast */ + setup_APIC_timer(); + amd_e400_c1e_apic_setup(); +} + +void setup_secondary_APIC_clock(void) +{ + setup_APIC_timer(); + amd_e400_c1e_apic_setup(); +} + +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) +{ + struct clock_event_device *evt = this_cpu_ptr(&lapic_events); + + /* + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. + * + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + pr_warn("Spurious LAPIC timer interrupt on cpu %d\n", + smp_processor_id()); + /* Switch it off */ + lapic_timer_shutdown(evt); + return; + } + + /* + * the NMI deadlock-detector uses this. + */ + inc_irq_stat(apic_timer_irqs); + + evt->event_handler(evt); +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + trace_local_timer_entry(LOCAL_TIMER_VECTOR); + local_apic_timer_interrupt(); + trace_local_timer_exit(LOCAL_TIMER_VECTOR); + + set_irq_regs(old_regs); +} + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * Local APIC start and shutdown + */ + +/** + * clear_local_APIC - shutdown the local APIC + * + * This is called, when a CPU is disabled and before rebooting, so the state of + * the local APIC has no dangling leftovers. Also used to cleanout any BIOS + * leftovers during boot. + */ +void clear_local_APIC(void) +{ + int maxlvt; + u32 v; + + /* APIC hasn't been mapped yet */ + if (!x2apic_mode && !apic_phys) + return; + + maxlvt = lapic_get_maxlvt(); + /* + * Masking an LVT entry can trigger a local APIC error + * if the vector is zero. Mask LVTERR first to prevent this. + */ + if (maxlvt >= 3) { + v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); + } + /* + * Careful: we have to set masks only first to deassert + * any level-triggered sources. + */ + v = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT1); + apic_write(APIC_LVT1, v | APIC_LVT_MASKED); + if (maxlvt >= 4) { + v = apic_read(APIC_LVTPC); + apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); + } + + /* lets not touch this if we didn't frob it */ +#ifdef CONFIG_X86_THERMAL_VECTOR + if (maxlvt >= 5) { + v = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); + } +#endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) { + v = apic_read(APIC_LVTCMCI); + if (!(v & APIC_LVT_MASKED)) + apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); + } +#endif + + /* + * Clean APIC state for other OSs: + */ + apic_write(APIC_LVTT, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT1, APIC_LVT_MASKED); + if (maxlvt >= 3) + apic_write(APIC_LVTERR, APIC_LVT_MASKED); + if (maxlvt >= 4) + apic_write(APIC_LVTPC, APIC_LVT_MASKED); + + /* Integrated APIC (!82489DX) ? */ + if (lapic_is_integrated()) { + if (maxlvt > 3) + /* Clear ESR due to Pentium errata 3AP and 11AP */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } +} + +/** + * apic_soft_disable - Clears and software disables the local APIC on hotplug + * + * Contrary to disable_local_APIC() this does not touch the enable bit in + * MSR_IA32_APICBASE. Clearing that bit on systems based on the 3 wire APIC + * bus would require a hardware reset as the APIC would lose track of bus + * arbitration. On systems with FSB delivery APICBASE could be disabled, + * but it has to be guaranteed that no interrupt is sent to the APIC while + * in that state and it's not clear from the SDM whether it still responds + * to INIT/SIPI messages. Stay on the safe side and use software disable. + */ +void apic_soft_disable(void) +{ + u32 value; + + clear_local_APIC(); + + /* Soft disable APIC (implies clearing of registers for 82489DX!). */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write(APIC_SPIV, value); +} + +/** + * disable_local_APIC - clear and disable the local APIC + */ +void disable_local_APIC(void) +{ + /* APIC hasn't been mapped yet */ + if (!x2apic_mode && !apic_phys) + return; + + apic_soft_disable(); + +#ifdef CONFIG_X86_32 + /* + * When LAPIC was disabled by the BIOS and enabled by the kernel, + * restore the disabled state. + */ + if (enabled_via_apicbase) { + unsigned int l, h; + + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_ENABLE; + wrmsr(MSR_IA32_APICBASE, l, h); + } +#endif +} + +/* + * If Linux enabled the LAPIC against the BIOS default disable it down before + * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and + * not power-off. Additionally clear all LVT entries before disable_local_APIC + * for the case where Linux didn't enable the LAPIC. + */ +void lapic_shutdown(void) +{ + unsigned long flags; + + if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config()) + return; + + local_irq_save(flags); + +#ifdef CONFIG_X86_32 + if (!enabled_via_apicbase) + clear_local_APIC(); + else +#endif + disable_local_APIC(); + + + local_irq_restore(flags); +} + +/** + * sync_Arb_IDs - synchronize APIC bus arbitration IDs + */ +void __init sync_Arb_IDs(void) +{ + /* + * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not + * needed on AMD. + */ + if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); + apic_write(APIC_ICR, APIC_DEST_ALLINC | + APIC_INT_LEVELTRIG | APIC_DM_INIT); +} + +enum apic_intr_mode_id apic_intr_mode __ro_after_init; + +static int __init __apic_intr_mode_select(void) +{ + /* Check kernel option */ + if (disable_apic) { + pr_info("APIC disabled via kernel command line\n"); + return APIC_PIC; + } + + /* Check BIOS */ +#ifdef CONFIG_X86_64 + /* On 64-bit, the APIC must be integrated, Check local APIC only */ + if (!boot_cpu_has(X86_FEATURE_APIC)) { + disable_apic = 1; + pr_info("APIC disabled by BIOS\n"); + return APIC_PIC; + } +#else + /* On 32-bit, the APIC may be integrated APIC or 82489DX */ + + /* Neither 82489DX nor integrated APIC ? */ + if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) { + disable_apic = 1; + return APIC_PIC; + } + + /* If the BIOS pretends there is an integrated APIC ? */ + if (!boot_cpu_has(X86_FEATURE_APIC) && + APIC_INTEGRATED(boot_cpu_apic_version)) { + disable_apic = 1; + pr_err(FW_BUG "Local APIC %d not detected, force emulation\n", + boot_cpu_physical_apicid); + return APIC_PIC; + } +#endif + + /* Check MP table or ACPI MADT configuration */ + if (!smp_found_config) { + disable_ioapic_support(); + if (!acpi_lapic) { + pr_info("APIC: ACPI MADT or MP tables are not detected\n"); + return APIC_VIRTUAL_WIRE_NO_CONFIG; + } + return APIC_VIRTUAL_WIRE; + } + +#ifdef CONFIG_SMP + /* If SMP should be disabled, then really disable it! */ + if (!setup_max_cpus) { + pr_info("APIC: SMP mode deactivated\n"); + return APIC_SYMMETRIC_IO_NO_ROUTING; + } + + if (read_apic_id() != boot_cpu_physical_apicid) { + panic("Boot APIC ID in local APIC unexpected (%d vs %d)", + read_apic_id(), boot_cpu_physical_apicid); + /* Or can we switch back to PIC here? */ + } +#endif + + return APIC_SYMMETRIC_IO; +} + +/* Select the interrupt delivery mode for the BSP */ +void __init apic_intr_mode_select(void) +{ + apic_intr_mode = __apic_intr_mode_select(); +} + +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) +{ + unsigned int value; + + /* + * Don't do the setup now if we have a SMP BIOS as the + * through-I/O-APIC virtual wire mode might be active. + */ + if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC)) + return; + + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + + /* + * Enable APIC. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + +#ifdef CONFIG_X86_32 + /* This bit is reserved on P4/Xeon and should be cleared */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) + value &= ~APIC_SPIV_FOCUS_DISABLED; + else +#endif + value |= APIC_SPIV_FOCUS_DISABLED; + value |= SPURIOUS_APIC_VECTOR; + apic_write(APIC_SPIV, value); + + /* + * Set up the virtual wire mode. + */ + apic_write(APIC_LVT0, APIC_DM_EXTINT); + value = APIC_DM_NMI; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + if (apic_extnmi == APIC_EXTNMI_NONE) + value |= APIC_LVT_MASKED; + apic_write(APIC_LVT1, value); +} + +static void __init apic_bsp_setup(bool upmode); + +/* Init the interrupt delivery mode for the BSP */ +void __init apic_intr_mode_init(void) +{ + bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT); + + switch (apic_intr_mode) { + case APIC_PIC: + pr_info("APIC: Keep in PIC mode(8259)\n"); + return; + case APIC_VIRTUAL_WIRE: + pr_info("APIC: Switch to virtual wire mode setup\n"); + default_setup_apic_routing(); + break; + case APIC_VIRTUAL_WIRE_NO_CONFIG: + pr_info("APIC: Switch to virtual wire mode setup with no configuration\n"); + upmode = true; + default_setup_apic_routing(); + break; + case APIC_SYMMETRIC_IO: + pr_info("APIC: Switch to symmetric I/O mode setup\n"); + default_setup_apic_routing(); + break; + case APIC_SYMMETRIC_IO_NO_ROUTING: + pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n"); + break; + } + + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); + + apic_bsp_setup(upmode); +} + +static void lapic_setup_esr(void) +{ + unsigned int oldvalue, value, maxlvt; + + if (!lapic_is_integrated()) { + pr_info("No ESR for 82489DX.\n"); + return; + } + + if (apic->disable_esr) { + /* + * Something untraceable is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + pr_info("Leaving ESR disabled.\n"); + return; + } + + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + /* enables sending errors */ + value = ERROR_APIC_VECTOR; + apic_write(APIC_LVTERR, value); + + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08x after: 0x%08x\n", + oldvalue, value); +} + +#define APIC_IR_REGS APIC_ISR_NR +#define APIC_IR_BITS (APIC_IR_REGS * 32) +#define APIC_IR_MAPSIZE (APIC_IR_BITS / BITS_PER_LONG) + +union apic_ir { + unsigned long map[APIC_IR_MAPSIZE]; + u32 regs[APIC_IR_REGS]; +}; + +static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) +{ + int i, bit; + + /* Read the IRRs */ + for (i = 0; i < APIC_IR_REGS; i++) + irr->regs[i] = apic_read(APIC_IRR + i * 0x10); + + /* Read the ISRs */ + for (i = 0; i < APIC_IR_REGS; i++) + isr->regs[i] = apic_read(APIC_ISR + i * 0x10); + + /* + * If the ISR map is not empty. ACK the APIC and run another round + * to verify whether a pending IRR has been unblocked and turned + * into a ISR. + */ + if (!bitmap_empty(isr->map, APIC_IR_BITS)) { + /* + * There can be multiple ISR bits set when a high priority + * interrupt preempted a lower priority one. Issue an ACK + * per set bit. + */ + for_each_set_bit(bit, isr->map, APIC_IR_BITS) + ack_APIC_irq(); + return true; + } + + return !bitmap_empty(irr->map, APIC_IR_BITS); +} + +/* + * After a crash, we no longer service the interrupts and a pending + * interrupt from previous kernel might still have ISR bit set. + * + * Most probably by now the CPU has serviced that pending interrupt and it + * might not have done the ack_APIC_irq() because it thought, interrupt + * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear + * the ISR bit and cpu thinks it has already serivced the interrupt. Hence + * a vector might get locked. It was noticed for timer irq (vector + * 0x31). Issue an extra EOI to clear ISR. + * + * If there are pending IRR bits they turn into ISR bits after a higher + * priority ISR bit has been acked. + */ +static void apic_pending_intr_clear(void) +{ + union apic_ir irr, isr; + unsigned int i; + + /* 512 loops are way oversized and give the APIC a chance to obey. */ + for (i = 0; i < 512; i++) { + if (!apic_check_and_ack(&irr, &isr)) + return; + } + /* Dump the IRR/ISR content if that failed */ + pr_warn("APIC: Stale IRR: %256pb ISR: %256pb\n", irr.map, isr.map); +} + +/** + * setup_local_APIC - setup the local APIC + * + * Used to setup local APIC while initializing BSP or bringing up APs. + * Always called with preemption disabled. + */ +static void setup_local_APIC(void) +{ + int cpu = smp_processor_id(); + unsigned int value; + + if (disable_apic) { + disable_ioapic_support(); + return; + } + + /* + * If this comes from kexec/kcrash the APIC might be enabled in + * SPIV. Soft disable it before doing further initialization. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write(APIC_SPIV, value); + +#ifdef CONFIG_X86_32 + /* Pound the ESR really hard over the head with a big hammer - mbligh */ + if (lapic_is_integrated() && apic->disable_esr) { + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + } +#endif + /* + * Double-check whether this APIC is really registered. + * This is meaningless in clustered apic mode, so we skip it. + */ + BUG_ON(!apic->apic_id_registered()); + + /* + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ + apic->init_apic_ldr(); + +#ifdef CONFIG_X86_32 + if (apic->dest_logical) { + int logical_apicid, ldr_apicid; + + /* + * APIC LDR is initialized. If logical_apicid mapping was + * initialized during get_smp_config(), make sure it matches + * the actual value. + */ + logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + if (logical_apicid != BAD_APICID) + WARN_ON(logical_apicid != ldr_apicid); + /* Always use the value from LDR. */ + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; + } +#endif + + /* + * Set Task Priority to 'accept all except vectors 0-31'. An APIC + * vector in the 16-31 range could be delivered if TPR == 0, but we + * would think it's an exception and terrible things will happen. We + * never change this later on. + */ + value = apic_read(APIC_TASKPRI); + value &= ~APIC_TPRI_MASK; + value |= 0x10; + apic_write(APIC_TASKPRI, value); + + /* Clear eventually stale ISR/IRR bits */ + apic_pending_intr_clear(); + + /* + * Now that we are all set up, enable the APIC + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + /* + * Enable APIC + */ + value |= APIC_SPIV_APIC_ENABLED; + +#ifdef CONFIG_X86_32 + /* + * Some unknown Intel IO/APIC (or APIC) errata is biting us with + * certain networking cards. If high frequency interrupts are + * happening on a particular IOAPIC pin, plus the IOAPIC routing + * entry is masked/unmasked at a high rate as well then sooner or + * later IOAPIC line gets 'stuck', no more interrupts are received + * from the device. If focus CPU is disabled then the hang goes + * away, oh well :-( + * + * [ This bug can be reproduced easily with a level-triggered + * PCI Ne2000 networking cards and PII/PIII processors, dual + * BX chipset. ] + */ + /* + * Actually disabling the focus CPU check just makes the hang less + * frequent as it makes the interrupt distributon model be more + * like LRU than MRU (the short-term load is more even across CPUs). + */ + + /* + * - enable focus processor (bit==0) + * - 64bit mode always use processor focus + * so no need to set it + */ + value &= ~APIC_SPIV_FOCUS_DISABLED; +#endif + + /* + * Set spurious IRQ vector + */ + value |= SPURIOUS_APIC_VECTOR; + apic_write(APIC_SPIV, value); + + perf_events_lapic_init(); + + /* + * Set up LVT0, LVT1: + * + * set up through-local-APIC on the boot CPU's LINT0. This is not + * strictly necessary in pure symmetric-IO mode, but sometimes + * we delegate interrupts to the 8259A. + */ + /* + * TODO: set up through-local-APIC from through-I/O-APIC? --macro + */ + value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; + if (!cpu && (pic_mode || !value || skip_ioapic_setup)) { + value = APIC_DM_EXTINT; + apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu); + } else { + value = APIC_DM_EXTINT | APIC_LVT_MASKED; + apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu); + } + apic_write(APIC_LVT0, value); + + /* + * Only the BSP sees the LINT1 NMI signal by default. This can be + * modified by apic_extnmi= boot option. + */ + if ((!cpu && apic_extnmi != APIC_EXTNMI_NONE) || + apic_extnmi == APIC_EXTNMI_ALL) + value = APIC_DM_NMI; + else + value = APIC_DM_NMI | APIC_LVT_MASKED; + + /* Is 82489DX ? */ + if (!lapic_is_integrated()) + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write(APIC_LVT1, value); + +#ifdef CONFIG_X86_MCE_INTEL + /* Recheck CMCI information after local APIC is up on CPU #0 */ + if (!cpu) + cmci_recheck(); +#endif +} + +static void end_local_APIC_setup(void) +{ + lapic_setup_esr(); + +#ifdef CONFIG_X86_32 + { + unsigned int value; + /* Disable the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, value); + } +#endif + + apic_pm_activate(); +} + +/* + * APIC setup function for application processors. Called from smpboot.c + */ +void apic_ap_setup(void) +{ + setup_local_APIC(); + end_local_APIC_setup(); +} + +#ifdef CONFIG_X86_X2APIC +int x2apic_mode; + +enum { + X2APIC_OFF, + X2APIC_ON, + X2APIC_DISABLED, +}; +static int x2apic_state; + +static void __x2apic_disable(void) +{ + u64 msr; + + if (!boot_cpu_has(X86_FEATURE_APIC)) + return; + + rdmsrl(MSR_IA32_APICBASE, msr); + if (!(msr & X2APIC_ENABLE)) + return; + /* Disable xapic and x2apic first and then reenable xapic mode */ + wrmsrl(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); + wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); + printk_once(KERN_INFO "x2apic disabled\n"); +} + +static void __x2apic_enable(void) +{ + u64 msr; + + rdmsrl(MSR_IA32_APICBASE, msr); + if (msr & X2APIC_ENABLE) + return; + wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); + printk_once(KERN_INFO "x2apic enabled\n"); +} + +static int __init setup_nox2apic(char *str) +{ + if (x2apic_enabled()) { + int apicid = native_apic_msr_read(APIC_ID); + + if (apicid >= 255) { + pr_warn("Apicid: %08x, cannot enforce nox2apic\n", + apicid); + return 0; + } + pr_warn("x2apic already enabled.\n"); + __x2apic_disable(); + } + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + x2apic_state = X2APIC_DISABLED; + x2apic_mode = 0; + return 0; +} +early_param("nox2apic", setup_nox2apic); + +/* Called from cpu_init() to enable x2apic on (secondary) cpus */ +void x2apic_setup(void) +{ + /* + * If x2apic is not in ON state, disable it if already enabled + * from BIOS. + */ + if (x2apic_state != X2APIC_ON) { + __x2apic_disable(); + return; + } + __x2apic_enable(); +} + +static __init void x2apic_disable(void) +{ + u32 x2apic_id, state = x2apic_state; + + x2apic_mode = 0; + x2apic_state = X2APIC_DISABLED; + + if (state != X2APIC_ON) + return; + + x2apic_id = read_apic_id(); + if (x2apic_id >= 255) + panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + + __x2apic_disable(); + register_lapic_address(mp_lapic_addr); +} + +static __init void x2apic_enable(void) +{ + if (x2apic_state != X2APIC_OFF) + return; + + x2apic_mode = 1; + x2apic_state = X2APIC_ON; + __x2apic_enable(); +} + +static __init void try_to_enable_x2apic(int remap_mode) +{ + if (x2apic_state == X2APIC_DISABLED) + return; + + if (remap_mode != IRQ_REMAP_X2APIC_MODE) { + /* + * Using X2APIC without IR is not architecturally supported + * on bare metal but may be supported in guests. + */ + if (!x86_init.hyper.x2apic_available()) { + pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); + x2apic_disable(); + return; + } + + /* + * Without IR, all CPUs can be addressed by IOAPIC/MSI only + * in physical mode, and CPUs with an APIC ID that cannnot + * be addressed must not be brought online. + */ + x2apic_set_max_apicid(255); + x2apic_phys = 1; + } + x2apic_enable(); +} + +void __init check_x2apic(void) +{ + if (x2apic_enabled()) { + pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n"); + x2apic_mode = 1; + x2apic_state = X2APIC_ON; + } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { + x2apic_state = X2APIC_DISABLED; + } +} +#else /* CONFIG_X86_X2APIC */ +static int __init validate_x2apic(void) +{ + if (!apic_is_x2apic_enabled()) + return 0; + /* + * Checkme: Can we simply turn off x2apic here instead of panic? + */ + panic("BIOS has enabled x2apic but kernel doesn't support x2apic, please disable x2apic in BIOS.\n"); +} +early_initcall(validate_x2apic); + +static inline void try_to_enable_x2apic(int remap_mode) { } +static inline void __x2apic_enable(void) { } +#endif /* !CONFIG_X86_X2APIC */ + +void __init enable_IR_x2apic(void) +{ + unsigned long flags; + int ret, ir_stat; + + if (skip_ioapic_setup) { + pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); + return; + } + + ir_stat = irq_remapping_prepare(); + if (ir_stat < 0 && !x2apic_supported()) + return; + + ret = save_ioapic_entries(); + if (ret) { + pr_info("Saving IO-APIC state failed: %d\n", ret); + return; + } + + local_irq_save(flags); + legacy_pic->mask_all(); + mask_ioapic_entries(); + + /* If irq_remapping_prepare() succeeded, try to enable it */ + if (ir_stat >= 0) + ir_stat = irq_remapping_enable(); + /* ir_stat contains the remap mode or an error code */ + try_to_enable_x2apic(ir_stat); + + if (ir_stat < 0) + restore_ioapic_entries(); + legacy_pic->restore_mask(); + local_irq_restore(flags); +} + +#ifdef CONFIG_X86_64 +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. + * On AMD64 we trust the BIOS - if it says no APIC it is likely + * not correctly set up (usually the APIC timer won't work etc.) + */ +static int __init detect_init_APIC(void) +{ + if (!boot_cpu_has(X86_FEATURE_APIC)) { + pr_info("No local APIC present\n"); + return -1; + } + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + return 0; +} +#else + +static int __init apic_verify(void) +{ + u32 features, h, l; + + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + pr_warn("Could not enable APIC!\n"); + return -1; + } + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* The BIOS may have set up the APIC at some other address */ + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + } + + pr_info("Found and enabled local APIC!\n"); + return 0; +} + +int __init apic_force_enable(unsigned long addr) +{ + u32 h, l; + + if (disable_apic) + return -1; + + /* + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. + */ + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + pr_info("Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | addr; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + } + return apic_verify(); +} + +/* + * Detect and initialize APIC + */ +static int __init detect_init_APIC(void) +{ + /* Disabled by kernel option? */ + if (disable_apic) + return -1; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || + (boot_cpu_data.x86 >= 15)) + break; + goto no_apic; + case X86_VENDOR_HYGON: + break; + case X86_VENDOR_INTEL: + if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || + (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC))) + break; + goto no_apic; + default: + goto no_apic; + } + + if (!boot_cpu_has(X86_FEATURE_APIC)) { + /* + * Over-ride BIOS and try to enable the local APIC only if + * "lapic" specified. + */ + if (!force_enable_local_apic) { + pr_info("Local APIC disabled by BIOS -- " + "you can enable it with \"lapic\"\n"); + return -1; + } + if (apic_force_enable(APIC_DEFAULT_PHYS_BASE)) + return -1; + } else { + if (apic_verify()) + return -1; + } + + apic_pm_activate(); + + return 0; + +no_apic: + pr_info("No local APIC present or hardware disabled\n"); + return -1; +} +#endif + +/** + * init_apic_mappings - initialize APIC mappings + */ +void __init init_apic_mappings(void) +{ + unsigned int new_apicid; + + if (apic_validate_deadline_timer()) + pr_info("TSC deadline timer available\n"); + + if (x2apic_mode) { + boot_cpu_physical_apicid = read_apic_id(); + return; + } + + /* If no local APIC can be found return early */ + if (!smp_found_config && detect_init_APIC()) { + /* lets NOP'ify apic operations */ + pr_info("APIC: disable apic facility\n"); + apic_disable(); + } else { + apic_phys = mp_lapic_addr; + + /* + * If the system has ACPI MADT tables or MP info, the LAPIC + * address is already registered. + */ + if (!acpi_lapic && !smp_found_config) + register_lapic_address(apic_phys); + } + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + new_apicid = read_apic_id(); + if (boot_cpu_physical_apicid != new_apicid) { + boot_cpu_physical_apicid = new_apicid; + /* + * yeah -- we lie about apic_version + * in case if apic was disabled via boot option + * but it's not a problem for SMP compiled kernel + * since apic_intr_mode_select is prepared for such + * a case and disable smp mode + */ + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); + } +} + +void __init register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + if (!x2apic_mode) { + set_fixmap_nocache(FIX_APIC_BASE, address); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, address); + } + if (boot_cpu_physical_apicid == -1U) { + boot_cpu_physical_apicid = read_apic_id(); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); + } +} + +/* + * Local APIC interrupts + */ + +/** + * spurious_interrupt - Catch all for interrupts raised on unused vectors + * @regs: Pointer to pt_regs on stack + * @vector: The vector number + * + * This is invoked from ASM entry code to catch all interrupts which + * trigger on an entry which is routed to the common_spurious idtentry + * point. + * + * Also called from sysvec_spurious_apic_interrupt(). + */ +DEFINE_IDTENTRY_IRQ(spurious_interrupt) +{ + u32 v; + + trace_spurious_apic_entry(vector); + + inc_irq_stat(irq_spurious_count); + + /* + * If this is a spurious interrupt then do not acknowledge + */ + if (vector == SPURIOUS_APIC_VECTOR) { + /* See SDM vol 3 */ + pr_info("Spurious APIC interrupt (vector 0xFF) on CPU#%d, should never happen.\n", + smp_processor_id()); + goto out; + } + + /* + * If it is a vectored one, verify it's set in the ISR. If set, + * acknowledge it. + */ + v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1)); + if (v & (1 << (vector & 0x1f))) { + pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n", + vector, smp_processor_id()); + ack_APIC_irq(); + } else { + pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n", + vector, smp_processor_id()); + } +out: + trace_spurious_apic_exit(vector); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt) +{ + __spurious_interrupt(regs, SPURIOUS_APIC_VECTOR); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt) +{ + static const char * const error_interrupt_reason[] = { + "Send CS error", /* APIC Error Bit 0 */ + "Receive CS error", /* APIC Error Bit 1 */ + "Send accept error", /* APIC Error Bit 2 */ + "Receive accept error", /* APIC Error Bit 3 */ + "Redirectable IPI", /* APIC Error Bit 4 */ + "Send illegal vector", /* APIC Error Bit 5 */ + "Received illegal vector", /* APIC Error Bit 6 */ + "Illegal register address", /* APIC Error Bit 7 */ + }; + u32 v, i = 0; + + trace_error_apic_entry(ERROR_APIC_VECTOR); + + /* First tickle the hardware, only then report what went on. -- REW */ + if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + v = apic_read(APIC_ESR); + ack_APIC_irq(); + atomic_inc(&irq_err_count); + + apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x", + smp_processor_id(), v); + + v &= 0xff; + while (v) { + if (v & 0x1) + apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); + i++; + v >>= 1; + } + + apic_printk(APIC_DEBUG, KERN_CONT "\n"); + + trace_error_apic_exit(ERROR_APIC_VECTOR); +} + +/** + * connect_bsp_APIC - attach the APIC to the interrupt system + */ +static void __init connect_bsp_APIC(void) +{ +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's + * local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, " + "enabling APIC mode.\n"); + imcr_pic_to_apic(); + } +#endif +} + +/** + * disconnect_bsp_APIC - detach the APIC from the interrupt system + * @virt_wire_setup: indicates, whether virtual wire mode is selected + * + * Virtual wire mode is necessary to deliver legacy interrupts even when the + * APIC is disabled. + */ +void disconnect_bsp_APIC(int virt_wire_setup) +{ + unsigned int value; + +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect only on + * certain older boards). Note that APIC interrupts, including + * IPIs, won't work beyond this point! The only exception are + * INIT IPIs. + */ + apic_printk(APIC_VERBOSE, "disabling APIC mode, " + "entering PIC mode.\n"); + imcr_apic_to_pic(); + return; + } +#endif + + /* Go back to Virtual Wire compatibility mode */ + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* + * For LVT0 make it edge triggered, active high, + * external and enabled + */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); + } + + /* + * For LVT1 make it edge triggered, active high, + * nmi and enabled + */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); +} + +/* + * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated + * contiguously, it equals to current allocated max logical CPU ID plus 1. + * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range, + * so the maximum of nr_logical_cpuids is nr_cpu_ids. + * + * NOTE: Reserve 0 for BSP. + */ +static int nr_logical_cpuids = 1; + +/* + * Used to store mapping between logical CPU IDs and APIC IDs. + */ +static int cpuid_to_apicid[] = { + [0 ... NR_CPUS - 1] = -1, +}; + +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == cpuid_to_apicid[cpu]; +} + +#ifdef CONFIG_SMP +/** + * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread + * @apicid: APIC ID to check + */ +bool apic_id_is_primary_thread(unsigned int apicid) +{ + u32 mask; + + if (smp_num_siblings == 1) + return true; + /* Isolate the SMT bit(s) in the APICID and check for 0 */ + mask = (1U << (fls(smp_num_siblings) - 1)) - 1; + return !(apicid & mask); +} +#endif + +/* + * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids + * and cpuid_to_apicid[] synchronized. + */ +static int allocate_logical_cpuid(int apicid) +{ + int i; + + /* + * cpuid <-> apicid mapping is persistent, so when a cpu is up, + * check if the kernel has allocated a cpuid for it. + */ + for (i = 0; i < nr_logical_cpuids; i++) { + if (cpuid_to_apicid[i] == apicid) + return i; + } + + /* Allocate a new cpuid. */ + if (nr_logical_cpuids >= nr_cpu_ids) { + WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. " + "Processor %d/0x%x and the rest are ignored.\n", + nr_cpu_ids, nr_logical_cpuids, apicid); + return -EINVAL; + } + + cpuid_to_apicid[nr_logical_cpuids] = apicid; + return nr_logical_cpuids++; +} + +int generic_processor_info(int apicid, int version) +{ + int cpu, max = nr_cpu_ids; + bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, + phys_cpu_present_map); + + /* + * boot_cpu_physical_apicid is designed to have the apicid + * returned by read_apic_id(), i.e, the apicid of the + * currently booting-up processor. However, on some platforms, + * it is temporarily modified by the apicid reported as BSP + * through MP table. Concretely: + * + * - arch/x86/kernel/mpparse.c: MP_processor_info() + * - arch/x86/mm/amdtopology.c: amd_numa_init() + * + * This function is executed with the modified + * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel + * parameter doesn't work to disable APs on kdump 2nd kernel. + * + * Since fixing handling of boot_cpu_physical_apicid requires + * another discussion and tests on each platform, we leave it + * for now and here we use read_apic_id() directly in this + * function, generic_processor_info(). + */ + if (disabled_cpu_apicid != BAD_APICID && + disabled_cpu_apicid != read_apic_id() && + disabled_cpu_apicid == apicid) { + int thiscpu = num_processors + disabled_cpus; + + pr_warn("APIC: Disabling requested cpu." + " Processor %d/0x%x ignored.\n", thiscpu, apicid); + + disabled_cpus++; + return -ENODEV; + } + + /* + * If boot cpu has not been detected yet, then only allow upto + * nr_cpu_ids - 1 processors and keep one slot free for boot cpu + */ + if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 && + apicid != boot_cpu_physical_apicid) { + int thiscpu = max + disabled_cpus - 1; + + pr_warn("APIC: NR_CPUS/possible_cpus limit of %i almost" + " reached. Keeping one slot for boot cpu." + " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + + disabled_cpus++; + return -ENODEV; + } + + if (num_processors >= nr_cpu_ids) { + int thiscpu = max + disabled_cpus; + + pr_warn("APIC: NR_CPUS/possible_cpus limit of %i reached. " + "Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + + disabled_cpus++; + return -EINVAL; + } + + if (apicid == boot_cpu_physical_apicid) { + /* + * x86_bios_cpu_apicid is required to have processors listed + * in same order as logical cpu numbers. Hence the first + * entry is BSP, and so on. + * boot_cpu_init() already hold bit 0 in cpu_present_mask + * for BSP. + */ + cpu = 0; + + /* Logical cpuid 0 is reserved for BSP. */ + cpuid_to_apicid[0] = apicid; + } else { + cpu = allocate_logical_cpuid(apicid); + if (cpu < 0) { + disabled_cpus++; + return -EINVAL; + } + } + + /* + * Validate version + */ + if (version == 0x0) { + pr_warn("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", + cpu, apicid); + version = 0x10; + } + + if (version != boot_cpu_apic_version) { + pr_warn("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", + boot_cpu_apic_version, cpu, version); + } + + if (apicid > max_physical_apicid) + max_physical_apicid = apicid; + +#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) + early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; + early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; +#endif +#ifdef CONFIG_X86_32 + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + apic->x86_32_early_logical_apicid(cpu); +#endif + set_cpu_possible(cpu, true); + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + num_processors++; + + return cpu; +} + +int hard_smp_processor_id(void) +{ + return read_apic_id(); +} + +/* + * Override the generic EOI implementation with an optimized version. + * Only called during early boot when only one CPU is active and with + * interrupts disabled, so we know this does not race with actual APIC driver + * use. + */ +void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + /* Should happen once for each apic */ + WARN_ON((*drv)->eoi_write == eoi_write); + (*drv)->native_eoi_write = (*drv)->eoi_write; + (*drv)->eoi_write = eoi_write; + } +} + +static void __init apic_bsp_up_setup(void) +{ +#ifdef CONFIG_X86_64 + apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid)); +#else + /* + * Hack: In case of kdump, after a crash, kernel might be booting + * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid + * might be zero if read from MP tables. Get it from LAPIC. + */ +# ifdef CONFIG_CRASH_DUMP + boot_cpu_physical_apicid = read_apic_id(); +# endif +#endif + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); +} + +/** + * apic_bsp_setup - Setup function for local apic and io-apic + * @upmode: Force UP mode (for APIC_init_uniprocessor) + */ +static void __init apic_bsp_setup(bool upmode) +{ + connect_bsp_APIC(); + if (upmode) + apic_bsp_up_setup(); + setup_local_APIC(); + + enable_IO_APIC(); + end_local_APIC_setup(); + irq_remap_enable_fault_handling(); + setup_IO_APIC(); + lapic_update_legacy_vectors(); +} + +#ifdef CONFIG_UP_LATE_INIT +void __init up_late_init(void) +{ + if (apic_intr_mode == APIC_PIC) + return; + + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); +} +#endif + +/* + * Power management + */ +#ifdef CONFIG_PM + +static struct { + /* + * 'active' is true if the local APIC was enabled by us and + * not the BIOS; this signifies that we are also responsible + * for disabling it before entering apm/acpi suspend + */ + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; + unsigned int apic_cmci; +} apic_pm_state; + +static int lapic_suspend(void) +{ + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); +#ifdef CONFIG_X86_THERMAL_VECTOR + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) + apic_pm_state.apic_cmci = apic_read(APIC_LVTCMCI); +#endif + + local_irq_save(flags); + + /* + * Mask IOAPIC before disabling the local APIC to prevent stale IRR + * entries on some implementations. + */ + mask_ioapic_entries(); + + disable_local_APIC(); + + irq_remapping_disable(); + + local_irq_restore(flags); + return 0; +} + +static void lapic_resume(void) +{ + unsigned int l, h; + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return; + + local_irq_save(flags); + + /* + * IO-APIC and PIC have their own resume routines. + * We just mask them here to make sure the interrupt + * subsystem is completely quiet while we enable x2apic + * and interrupt-remapping. + */ + mask_ioapic_entries(); + legacy_pic->mask_all(); + + if (x2apic_mode) { + __x2apic_enable(); + } else { + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + } + } + + maxlvt = lapic_get_maxlvt(); + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); +#ifdef CONFIG_X86_THERMAL_VECTOR + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) + apic_write(APIC_LVTCMCI, apic_pm_state.apic_cmci); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + + irq_remapping_reenable(x2apic_mode); + + local_irq_restore(flags); +} + +/* + * This device has no shutdown method - fully functioning local APICs + * are needed on every CPU up until machine_halt/restart/poweroff. + */ + +static struct syscore_ops lapic_syscore_ops = { + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static void apic_pm_activate(void) +{ + apic_pm_state.active = 1; +} + +static int __init init_lapic_sysfs(void) +{ + /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + if (boot_cpu_has(X86_FEATURE_APIC)) + register_syscore_ops(&lapic_syscore_ops); + + return 0; +} + +/* local apic needs to resume before other devices access its registers. */ +core_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ + +#ifdef CONFIG_X86_64 + +static int multi_checked; +static int multi; + +static int set_multi(const struct dmi_system_id *d) +{ + if (multi) + return 0; + pr_info("APIC: %s detected, Multi Chassis\n", d->ident); + multi = 1; + return 0; +} + +static const struct dmi_system_id multi_dmi_table[] = { + { + .callback = set_multi, + .ident = "IBM System Summit2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"), + }, + }, + {} +}; + +static void dmi_check_multi(void) +{ + if (multi_checked) + return; + + dmi_check_system(multi_dmi_table); + multi_checked = 1; +} + +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. + * Use DMI to check them + */ +int apic_is_clustered_box(void) +{ + dmi_check_multi(); + return multi; +} +#endif + +/* + * APIC command line parameters + */ +static int __init setup_disableapic(char *arg) +{ + disable_apic = 1; + setup_clear_cpu_cap(X86_FEATURE_APIC); + return 0; +} +early_param("disableapic", setup_disableapic); + +/* same as disableapic, for compatibility */ +static int __init setup_nolapic(char *arg) +{ + return setup_disableapic(arg); +} +early_param("nolapic", setup_nolapic); + +static int __init parse_lapic_timer_c2_ok(char *arg) +{ + local_apic_timer_c2_ok = 1; + return 0; +} +early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); + +static int __init parse_disable_apic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("noapictimer", parse_disable_apic_timer); + +static int __init parse_nolapic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("nolapic_timer", parse_nolapic_timer); + +static int __init apic_set_verbosity(char *arg) +{ + if (!arg) { +#ifdef CONFIG_X86_64 + skip_ioapic_setup = 0; + return 0; +#endif + return -EINVAL; + } + + if (strcmp("debug", arg) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", arg) == 0) + apic_verbosity = APIC_VERBOSE; +#ifdef CONFIG_X86_64 + else { + pr_warn("APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", arg); + return -EINVAL; + } +#endif + + return 0; +} +early_param("apic", apic_set_verbosity); + +static int __init lapic_insert_resource(void) +{ + if (!apic_phys) + return -1; + + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + + return 0; +} + +/* + * need call insert after e820__reserve_resources() + * that is using request_resource + */ +late_initcall(lapic_insert_resource); + +static int __init apic_set_disabled_cpu_apicid(char *arg) +{ + if (!arg || !get_option(&arg, &disabled_cpu_apicid)) + return -EINVAL; + + return 0; +} +early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); + +static int __init apic_set_extnmi(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strncmp("all", arg, 3)) + apic_extnmi = APIC_EXTNMI_ALL; + else if (!strncmp("none", arg, 4)) + apic_extnmi = APIC_EXTNMI_NONE; + else if (!strncmp("bsp", arg, 3)) + apic_extnmi = APIC_EXTNMI_BSP; + else { + pr_warn("Unknown external NMI delivery mode `%s' ignored\n", arg); + return -EINVAL; + } + + return 0; +} +early_param("apic_extnmi", apic_set_extnmi); diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c new file mode 100644 index 000000000..02b483947 --- /dev/null +++ b/arch/x86/kernel/apic/apic_common.c @@ -0,0 +1,46 @@ +/* + * Common functions shared between the various APIC flavours + * + * SPDX-License-Identifier: GPL-2.0 + */ +#include +#include + +u32 apic_default_calc_apicid(unsigned int cpu) +{ + return per_cpu(x86_cpu_to_apicid, cpu); +} + +u32 apic_flat_calc_apicid(unsigned int cpu) +{ + return 1U << cpu; +} + +bool default_check_apicid_used(physid_mask_t *map, int apicid) +{ + return physid_isset(apicid, *map); +} + +void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) +{ + *retmap = *phys_map; +} + +int default_cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) + return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); + else + return BAD_APICID; +} +EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid); + +int default_check_phys_apicid_present(int phys_apicid) +{ + return physid_isset(phys_apicid, phys_cpu_present_map); +} + +int default_apic_id_valid(u32 apicid) +{ + return (apicid < 255); +} diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c new file mode 100644 index 000000000..7862b152a --- /dev/null +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2004 James Cleverdon, IBM. + * + * Flat APIC subarch code. + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include +#include +#include + +#include +#include + +#include "local.h" + +static struct apic apic_physflat; +static struct apic apic_flat; + +struct apic *apic __ro_after_init = &apic_flat; +EXPORT_SYMBOL_GPL(apic); + +static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return 1; +} + +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ +void flat_init_apic_ldr(void) +{ + unsigned long val; + unsigned long num, id; + + num = smp_processor_id(); + id = 1UL << num; + apic_write(APIC_DFR, APIC_DFR_FLAT); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(id); + apic_write(APIC_LDR, val); +} + +static void _flat_send_IPI_mask(unsigned long mask, int vector) +{ + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + local_irq_restore(flags); +} + +static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + + _flat_send_IPI_mask(mask, vector); +} + +static void +flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + int cpu = smp_processor_id(); + + if (cpu < BITS_PER_LONG) + __clear_bit(cpu, &mask); + + _flat_send_IPI_mask(mask, vector); +} + +static unsigned int flat_get_apic_id(unsigned long x) +{ + return (x >> 24) & 0xFF; +} + +static u32 set_apic_id(unsigned int id) +{ + return (id & 0xFF) << 24; +} + +static unsigned int read_xapic_id(void) +{ + return flat_get_apic_id(apic_read(APIC_ID)); +} + +static int flat_apic_id_registered(void) +{ + return physid_isset(read_xapic_id(), phys_cpu_present_map); +} + +static int flat_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return initial_apic_id >> index_msb; +} + +static int flat_probe(void) +{ + return 1; +} + +static struct apic apic_flat __ro_after_init = { + .name = "flat", + .probe = flat_probe, + .acpi_madt_oem_check = flat_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, + .apic_id_registered = flat_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 1, /* logical */ + + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = NULL, + + .init_apic_ldr = flat_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .phys_pkg_id = flat_phys_pkg_id, + + .get_apic_id = flat_get_apic_id, + .set_apic_id = set_apic_id, + + .calc_dest_apicid = apic_flat_calc_apicid, + + .send_IPI = default_send_IPI_single, + .send_IPI_mask = flat_send_IPI_mask, + .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, + .send_IPI_allbutself = default_send_IPI_allbutself, + .send_IPI_all = default_send_IPI_all, + .send_IPI_self = default_send_IPI_self, + + .inquire_remote_apic = default_inquire_remote_apic, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = native_apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, +}; + +/* + * Physflat mode is used when there are more than 8 CPUs on a system. + * We cannot use logical delivery in this case because the mask + * overflows, so use physical mode. + */ +static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ +#ifdef CONFIG_ACPI + /* + * Quirk: some x86_64 machines can only use physical APIC mode + * regardless of how many processors are present (x86_64 ES7000 + * is an example). + */ + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { + printk(KERN_DEBUG "system APIC only can use physical flat"); + return 1; + } + + if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) { + printk(KERN_DEBUG "IBM Summit detected, will use apic physical"); + return 1; + } +#endif + + return 0; +} + +static void physflat_init_apic_ldr(void) +{ + /* + * LDR and DFR are not involved in physflat mode, rather: + * "In physical destination mode, the destination processor is + * specified by its local APIC ID [...]." (Intel SDM, 10.6.2.1) + */ +} + +static int physflat_probe(void) +{ + if (apic == &apic_physflat || num_possible_cpus() > 8 || + jailhouse_paravirt()) + return 1; + + return 0; +} + +static struct apic apic_physflat __ro_after_init = { + + .name = "physical flat", + .probe = physflat_probe, + .acpi_madt_oem_check = physflat_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, + .apic_id_registered = flat_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .disable_esr = 0, + .dest_logical = 0, + .check_apicid_used = NULL, + + .init_apic_ldr = physflat_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .phys_pkg_id = flat_phys_pkg_id, + + .get_apic_id = flat_get_apic_id, + .set_apic_id = set_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = default_send_IPI_single_phys, + .send_IPI_mask = default_send_IPI_mask_sequence_phys, + .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys, + .send_IPI_allbutself = default_send_IPI_allbutself, + .send_IPI_all = default_send_IPI_all, + .send_IPI_self = default_send_IPI_self, + + .inquire_remote_apic = default_inquire_remote_apic, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = native_apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, +}; + +/* + * We need to check for physflat first, so this order is important. + */ +apic_drivers(apic_physflat, apic_flat); diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c new file mode 100644 index 000000000..780c70296 --- /dev/null +++ b/arch/x86/kernel/apic/apic_noop.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NOOP APIC driver. + * + * Does almost nothing and should be substituted by a real apic driver via + * probe routine. + * + * Though in case if apic is disabled (for some reason) we try + * to not uglify the caller's code and allow to call (some) apic routines + * like self-ipi, etc... + */ +#include +#include + +#include + +static void noop_init_apic_ldr(void) { } +static void noop_send_IPI(int cpu, int vector) { } +static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_allbutself(int vector) { } +static void noop_send_IPI_all(int vector) { } +static void noop_send_IPI_self(int vector) { } +static void noop_apic_wait_icr_idle(void) { } +static void noop_apic_icr_write(u32 low, u32 id) { } + +static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) +{ + return -1; +} + +static u32 noop_safe_apic_wait_icr_idle(void) +{ + return 0; +} + +static u64 noop_apic_icr_read(void) +{ + return 0; +} + +static int noop_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return 0; +} + +static unsigned int noop_get_apic_id(unsigned long x) +{ + return 0; +} + +static int noop_probe(void) +{ + /* + * NOOP apic should not ever be + * enabled via probe routine + */ + return 0; +} + +static int noop_apic_id_registered(void) +{ + /* + * if we would be really "pedantic" + * we should pass read_apic_id() here + * but since NOOP suppose APIC ID = 0 + * lets save a few cycles + */ + return physid_isset(0, phys_cpu_present_map); +} + +static u32 noop_apic_read(u32 reg) +{ + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); + return 0; +} + +static void noop_apic_write(u32 reg, u32 v) +{ + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); +} + +#ifdef CONFIG_X86_32 +static int noop_x86_32_early_logical_apicid(int cpu) +{ + return BAD_APICID; +} +#endif + +struct apic apic_noop __ro_after_init = { + .name = "noop", + .probe = noop_probe, + .acpi_madt_oem_check = NULL, + + .apic_id_valid = default_apic_id_valid, + .apic_id_registered = noop_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + /* logical delivery broadcast to all CPUs: */ + .irq_dest_mode = 1, + + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = default_check_apicid_used, + + .init_apic_ldr = noop_init_apic_ldr, + + .ioapic_phys_id_map = default_ioapic_phys_id_map, + .setup_apic_routing = NULL, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = physid_set_mask_of_physid, + + .check_phys_apicid_present = default_check_phys_apicid_present, + + .phys_pkg_id = noop_phys_pkg_id, + + .get_apic_id = noop_get_apic_id, + .set_apic_id = NULL, + + .calc_dest_apicid = apic_flat_calc_apicid, + + .send_IPI = noop_send_IPI, + .send_IPI_mask = noop_send_IPI_mask, + .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, + .send_IPI_allbutself = noop_send_IPI_allbutself, + .send_IPI_all = noop_send_IPI_all, + .send_IPI_self = noop_send_IPI_self, + + .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, + + .inquire_remote_apic = NULL, + + .read = noop_apic_read, + .write = noop_apic_write, + .eoi_write = noop_apic_write, + .icr_read = noop_apic_icr_read, + .icr_write = noop_apic_icr_write, + .wait_icr_idle = noop_apic_wait_icr_idle, + .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, + +#ifdef CONFIG_X86_32 + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, +#endif +}; diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c new file mode 100644 index 000000000..35edd57f0 --- /dev/null +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -0,0 +1,338 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-Specific APIC Code + * + * Copyright (C) 2011 Numascale AS. All rights reserved. + * + * Send feedback to + * + */ +#include +#include +#include + +#include +#include + + +#include "local.h" + +u8 numachip_system __read_mostly; +static const struct apic apic_numachip1; +static const struct apic apic_numachip2; +static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly; + +static unsigned int numachip1_get_apic_id(unsigned long x) +{ + unsigned long value; + unsigned int id = (x >> 24) & 0xff; + + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { + rdmsrl(MSR_FAM10H_NODE_ID, value); + id |= (value << 2) & 0xff00; + } + + return id; +} + +static u32 numachip1_set_apic_id(unsigned int id) +{ + return (id & 0xff) << 24; +} + +static unsigned int numachip2_get_apic_id(unsigned long x) +{ + u64 mcfg; + + rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg); + return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); +} + +static u32 numachip2_set_apic_id(unsigned int id) +{ + return id << 24; +} + +static int numachip_apic_id_valid(u32 apicid) +{ + /* Trust what bootloader passes in MADT */ + return 1; +} + +static int numachip_apic_id_registered(void) +{ + return 1; +} + +static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return initial_apic_id >> index_msb; +} + +static void numachip1_apic_icr_write(int apicid, unsigned int val) +{ + write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val); +} + +static void numachip2_apic_icr_write(int apicid, unsigned int val) +{ + numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val); +} + +static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ + numachip_apic_icr_write(phys_apicid, APIC_DM_INIT); + numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP | + (start_rip >> 12)); + + return 0; +} + +static void numachip_send_IPI_one(int cpu, int vector) +{ + int local_apicid, apicid = per_cpu(x86_cpu_to_apicid, cpu); + unsigned int dmode; + + preempt_disable(); + local_apicid = __this_cpu_read(x86_cpu_to_apicid); + + /* Send via local APIC where non-local part matches */ + if (!((apicid ^ local_apicid) >> NUMACHIP_LAPIC_BITS)) { + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(apicid, vector, + APIC_DEST_PHYSICAL); + local_irq_restore(flags); + preempt_enable(); + return; + } + preempt_enable(); + + dmode = (vector == NMI_VECTOR) ? APIC_DM_NMI : APIC_DM_FIXED; + numachip_apic_icr_write(apicid, dmode | vector); +} + +static void numachip_send_IPI_mask(const struct cpumask *mask, int vector) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) + numachip_send_IPI_one(cpu, vector); +} + +static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (cpu != this_cpu) + numachip_send_IPI_one(cpu, vector); + } +} + +static void numachip_send_IPI_allbutself(int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != this_cpu) + numachip_send_IPI_one(cpu, vector); + } +} + +static void numachip_send_IPI_all(int vector) +{ + numachip_send_IPI_mask(cpu_online_mask, vector); +} + +static void numachip_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +static int __init numachip1_probe(void) +{ + return apic == &apic_numachip1; +} + +static int __init numachip2_probe(void) +{ + return apic == &apic_numachip2; +} + +static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ + u64 val; + u32 nodes = 1; + + this_cpu_write(cpu_llc_id, node); + + /* Account for nodes per socket in multi-core-module processors */ + if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { + rdmsrl(MSR_FAM10H_NODE_ID, val); + nodes = ((val >> 3) & 7) + 1; + } + + c->phys_proc_id = node / nodes; +} + +static int __init numachip_system_init(void) +{ + /* Map the LCSR area and set up the apic_icr_write function */ + switch (numachip_system) { + case 1: + init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + numachip_apic_icr_write = numachip1_apic_icr_write; + break; + case 2: + init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE); + numachip_apic_icr_write = numachip2_apic_icr_write; + break; + default: + return 0; + } + + x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + x86_init.pci.arch_init = pci_numachip_init; + + return 0; +} +early_initcall(numachip_system_init); + +static int numachip1_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if ((strncmp(oem_id, "NUMASC", 6) != 0) || + (strncmp(oem_table_id, "NCONNECT", 8) != 0)) + return 0; + + numachip_system = 1; + + return 1; +} + +static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if ((strncmp(oem_id, "NUMASC", 6) != 0) || + (strncmp(oem_table_id, "NCONECT2", 8) != 0)) + return 0; + + numachip_system = 2; + + return 1; +} + +/* APIC IPIs are queued */ +static void numachip_apic_wait_icr_idle(void) +{ +} + +/* APIC NMI IPIs are queued */ +static u32 numachip_safe_apic_wait_icr_idle(void) +{ + return 0; +} + +static const struct apic apic_numachip1 __refconst = { + .name = "NumaConnect system", + .probe = numachip1_probe, + .acpi_madt_oem_check = numachip1_acpi_madt_oem_check, + .apic_id_valid = numachip_apic_id_valid, + .apic_id_registered = numachip_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .disable_esr = 0, + .dest_logical = 0, + .check_apicid_used = NULL, + + .init_apic_ldr = flat_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .phys_pkg_id = numachip_phys_pkg_id, + + .get_apic_id = numachip1_get_apic_id, + .set_apic_id = numachip1_set_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = numachip_send_IPI_one, + .send_IPI_mask = numachip_send_IPI_mask, + .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, + .send_IPI_allbutself = numachip_send_IPI_allbutself, + .send_IPI_all = numachip_send_IPI_all, + .send_IPI_self = numachip_send_IPI_self, + + .wakeup_secondary_cpu = numachip_wakeup_secondary, + .inquire_remote_apic = NULL, /* REMRD not supported */ + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = numachip_apic_wait_icr_idle, + .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, +}; + +apic_driver(apic_numachip1); + +static const struct apic apic_numachip2 __refconst = { + .name = "NumaConnect2 system", + .probe = numachip2_probe, + .acpi_madt_oem_check = numachip2_acpi_madt_oem_check, + .apic_id_valid = numachip_apic_id_valid, + .apic_id_registered = numachip_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .disable_esr = 0, + .dest_logical = 0, + .check_apicid_used = NULL, + + .init_apic_ldr = flat_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .phys_pkg_id = numachip_phys_pkg_id, + + .get_apic_id = numachip2_get_apic_id, + .set_apic_id = numachip2_set_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = numachip_send_IPI_one, + .send_IPI_mask = numachip_send_IPI_mask, + .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, + .send_IPI_allbutself = numachip_send_IPI_allbutself, + .send_IPI_all = numachip_send_IPI_all, + .send_IPI_self = numachip_send_IPI_self, + + .wakeup_secondary_cpu = numachip_wakeup_secondary, + .inquire_remote_apic = NULL, /* REMRD not supported */ + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = numachip_apic_wait_icr_idle, + .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, +}; + +apic_driver(apic_numachip2); diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c new file mode 100644 index 000000000..98d015a44 --- /dev/null +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs. + * + * Drives the local APIC in "clustered mode". + */ +#include +#include +#include + +#include +#include + +#include "local.h" + +static unsigned bigsmp_get_apic_id(unsigned long x) +{ + return (x >> 24) & 0xFF; +} + +static int bigsmp_apic_id_registered(void) +{ + return 1; +} + +static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid) +{ + return false; +} + +static int bigsmp_early_logical_apicid(int cpu) +{ + /* on bigsmp, logical apicid is the same as physical */ + return early_per_cpu(x86_cpu_to_apicid, cpu); +} + +/* + * bigsmp enables physical destination mode + * and doesn't use LDR and DFR + */ +static void bigsmp_init_apic_ldr(void) +{ +} + +static void bigsmp_setup_apic_routing(void) +{ + printk(KERN_INFO + "Enabling APIC mode: Physflat. Using %d I/O APICs\n", + nr_ioapics); +} + +static int bigsmp_cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < nr_cpu_ids) + return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); + + return BAD_APICID; +} + +static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) +{ + /* For clustered we don't have a good way to do this yet - hack */ + physids_promote(0xFFL, retmap); +} + +static int bigsmp_check_phys_apicid_present(int phys_apicid) +{ + return 1; +} + +static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + +static void bigsmp_send_IPI_allbutself(int vector) +{ + default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); +} + +static void bigsmp_send_IPI_all(int vector) +{ + default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); +} + +static int dmi_bigsmp; /* can be set by dmi scanners */ + +static int hp_ht_bigsmp(const struct dmi_system_id *d) +{ + printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); + dmi_bigsmp = 1; + + return 0; +} + + +static const struct dmi_system_id bigsmp_dmi_table[] = { + { hp_ht_bigsmp, "HP ProLiant DL760 G2", + { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_BIOS_VERSION, "P44-"), + } + }, + + { hp_ht_bigsmp, "HP ProLiant DL740", + { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_BIOS_VERSION, "P47-"), + } + }, + { } /* NULL entry stops DMI scanning */ +}; + +static int probe_bigsmp(void) +{ + if (def_to_bigsmp) + dmi_bigsmp = 1; + else + dmi_check_system(bigsmp_dmi_table); + + return dmi_bigsmp; +} + +static struct apic apic_bigsmp __ro_after_init = { + + .name = "bigsmp", + .probe = probe_bigsmp, + .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, + .apic_id_registered = bigsmp_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + /* phys delivery to target CPU: */ + .irq_dest_mode = 0, + + .disable_esr = 1, + .dest_logical = 0, + .check_apicid_used = bigsmp_check_apicid_used, + + .init_apic_ldr = bigsmp_init_apic_ldr, + + .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, + .setup_apic_routing = bigsmp_setup_apic_routing, + .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, + .apicid_to_cpu_present = physid_set_mask_of_physid, + .check_phys_apicid_present = bigsmp_check_phys_apicid_present, + .phys_pkg_id = bigsmp_phys_pkg_id, + + .get_apic_id = bigsmp_get_apic_id, + .set_apic_id = NULL, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = default_send_IPI_single_phys, + .send_IPI_mask = default_send_IPI_mask_sequence_phys, + .send_IPI_mask_allbutself = NULL, + .send_IPI_allbutself = bigsmp_send_IPI_allbutself, + .send_IPI_all = bigsmp_send_IPI_all, + .send_IPI_self = default_send_IPI_self, + + .inquire_remote_apic = default_inquire_remote_apic, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = native_apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, +}; + +void __init generic_bigsmp_probe(void) +{ + unsigned int cpu; + + if (!probe_bigsmp()) + return; + + apic = &apic_bigsmp; + + for_each_possible_cpu(cpu) { + if (early_per_cpu(x86_cpu_to_logical_apicid, + cpu) == BAD_APICID) + continue; + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + bigsmp_early_logical_apicid(cpu); + } + + pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name); +} + +apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c new file mode 100644 index 000000000..34a992e27 --- /dev/null +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HW NMI watchdog support + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Arch specific calls to support NMI watchdog + * + * Bits copied from original nmi.c file + * + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF +u64 hw_nmi_get_sample_period(int watchdog_thresh) +{ + return (u64)(cpu_khz) * 1000 * watchdog_thresh; +} +#endif + +#ifdef arch_trigger_cpumask_backtrace +static void nmi_raise_cpu_backtrace(cpumask_t *mask) +{ + apic->send_IPI_mask(mask, NMI_VECTOR); +} + +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) +{ + nmi_trigger_cpumask_backtrace(mask, exclude_self, + nmi_raise_cpu_backtrace); +} + +static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) +{ + if (nmi_cpu_backtrace(regs)) + return NMI_HANDLED; + + return NMI_DONE; +} +NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler); + +static int __init register_nmi_cpu_backtrace_handler(void) +{ + register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler, + 0, "arch_bt"); + return 0; +} +early_initcall(register_nmi_cpu_backtrace_handler); +#endif diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c new file mode 100644 index 000000000..74794387b --- /dev/null +++ b/arch/x86/kernel/apic/io_apic.c @@ -0,0 +1,3088 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku and + * Hidemi Kishimoto , + * further tested and cleaned up by Zach Brown + * and Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + * + * Historical information which is worth to be preserved: + * + * - SiS APIC rmw bug: + * + * We used to have a workaround for a bug in SiS chips which + * required to rewrite the index register for a read-modify-write + * operation as the chip lost the index information which was + * setup for the read already. We cache the data now, so that + * workaround has been removed. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* time_after() */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define for_each_ioapic(idx) \ + for ((idx) = 0; (idx) < nr_ioapics; (idx)++) +#define for_each_ioapic_reverse(idx) \ + for ((idx) = nr_ioapics - 1; (idx) >= 0; (idx)--) +#define for_each_pin(idx, pin) \ + for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++) +#define for_each_ioapic_pin(idx, pin) \ + for_each_ioapic((idx)) \ + for_each_pin((idx), (pin)) +#define for_each_irq_pin(entry, head) \ + list_for_each_entry(entry, &head, list) + +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_MUTEX(ioapic_mutex); +static unsigned int ioapic_dynirq_base; +static int ioapic_initialized; + +struct irq_pin_list { + struct list_head list; + int apic, pin; +}; + +struct mp_chip_data { + struct list_head irq_2_pin; + struct IO_APIC_route_entry entry; + int trigger; + int polarity; + u32 count; + bool isa_irq; +}; + +struct mp_ioapic_gsi { + u32 gsi_base; + u32 gsi_end; +}; + +static struct ioapic { + /* + * # of IRQ routing registers + */ + int nr_registers; + /* + * Saved state during suspend/resume, or while enabling intr-remap. + */ + struct IO_APIC_route_entry *saved_registers; + /* I/O APIC config */ + struct mpc_ioapic mp_config; + /* IO APIC gsi routing info */ + struct mp_ioapic_gsi gsi_config; + struct ioapic_domain_cfg irqdomain_cfg; + struct irq_domain *irqdomain; + struct resource *iomem_res; +} ioapics[MAX_IO_APICS]; + +#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver + +int mpc_ioapic_id(int ioapic_idx) +{ + return ioapics[ioapic_idx].mp_config.apicid; +} + +unsigned int mpc_ioapic_addr(int ioapic_idx) +{ + return ioapics[ioapic_idx].mp_config.apicaddr; +} + +static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) +{ + return &ioapics[ioapic_idx].gsi_config; +} + +static inline int mp_ioapic_pin_count(int ioapic) +{ + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); + + return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; +} + +static inline u32 mp_pin_to_gsi(int ioapic, int pin) +{ + return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin; +} + +static inline bool mp_is_legacy_irq(int irq) +{ + return irq >= 0 && irq < nr_legacy_irqs(); +} + +static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) +{ + return ioapics[ioapic].irqdomain; +} + +int nr_ioapics; + +/* The one past the highest gsi number used */ +u32 gsi_top; + +/* MP IRQ source entries */ +struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* # of MP IRQ source entries */ +int mp_irq_entries; + +#ifdef CONFIG_EISA +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + +int skip_ioapic_setup; + +/** + * disable_ioapic_support() - disables ioapic support at runtime + */ +void disable_ioapic_support(void) +{ +#ifdef CONFIG_PCI + noioapicquirk = 1; + noioapicreroute = -1; +#endif + skip_ioapic_setup = 1; +} + +static int __init parse_noapic(char *str) +{ + /* disable IO-APIC */ + disable_ioapic_support(); + return 0; +} +early_param("noapic", parse_noapic); + +/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ +void mp_save_irq(struct mpc_intsrc *m) +{ + int i; + + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, + m->srcbusirq, m->dstapic, m->dstirq); + + for (i = 0; i < mp_irq_entries; i++) { + if (!memcmp(&mp_irqs[i], m, sizeof(*m))) + return; + } + + memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m)); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +static void alloc_ioapic_saved_registers(int idx) +{ + size_t size; + + if (ioapics[idx].saved_registers) + return; + + size = sizeof(struct IO_APIC_route_entry) * ioapics[idx].nr_registers; + ioapics[idx].saved_registers = kzalloc(size, GFP_KERNEL); + if (!ioapics[idx].saved_registers) + pr_err("IOAPIC %d: suspend/resume impossible!\n", idx); +} + +static void free_ioapic_saved_registers(int idx) +{ + kfree(ioapics[idx].saved_registers); + ioapics[idx].saved_registers = NULL; +} + +int __init arch_early_ioapic_init(void) +{ + int i; + + if (!nr_legacy_irqs()) + io_apic_irqs = ~0UL; + + for_each_ioapic(i) + alloc_ioapic_saved_registers(i); + + return 0; +} + +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; + unsigned int unused2[11]; + unsigned int eoi; +}; + +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mpc_ioapic_addr(idx) & ~PAGE_MASK); +} + +static inline void io_apic_eoi(unsigned int apic, unsigned int vector) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(vector, &io_apic->eoi); +} + +unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +} + +static void io_apic_write(unsigned int apic, unsigned int reg, + unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + + return eu.entry; +} + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + eu.entry = __ioapic_read_entry(apic, pin); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return eu.entry; +} + +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. + */ +static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + union entry_union eu = {{0, 0}}; + + eu.entry = e; + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! + */ +static void ioapic_mask_entry(int apic, int pin) +{ + unsigned long flags; + union entry_union eu = { .entry.mask = IOAPIC_MASKED }; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. + */ +static int __add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) +{ + struct irq_pin_list *entry; + + /* don't allow duplicates */ + for_each_irq_pin(entry, data->irq_2_pin) + if (entry->apic == apic && entry->pin == pin) + return 0; + + entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); + if (!entry) { + pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); + return -ENOMEM; + } + entry->apic = apic; + entry->pin = pin; + list_add_tail(&entry->list, &data->irq_2_pin); + + return 0; +} + +static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) +{ + struct irq_pin_list *tmp, *entry; + + list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) + if (entry->apic == apic && entry->pin == pin) { + list_del(&entry->list); + kfree(entry); + return; + } +} + +static void add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) +{ + if (__add_pin_to_irq_node(data, node, apic, pin)) + panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); +} + +/* + * Reroute an IRQ to a different pin. + */ +static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, data->irq_2_pin) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + /* every one is different, right? */ + return; + } + } + + /* old apic/pin didn't exist, so just add new ones */ + add_pin_to_irq_node(data, node, newapic, newpin); +} + +static void io_apic_modify_irq(struct mp_chip_data *data, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + union entry_union eu; + struct irq_pin_list *entry; + + eu.entry = data->entry; + eu.w1 &= mask_and; + eu.w1 |= mask_or; + data->entry = eu.entry; + + for_each_irq_pin(entry, data->irq_2_pin) { + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1); + if (final) + final(entry); + } +} + +static void io_apic_sync(struct irq_pin_list *entry) +{ + /* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ + struct io_apic __iomem *io_apic; + + io_apic = io_apic_base(entry->apic); + readl(&io_apic->data); +} + +static void mask_ioapic_irq(struct irq_data *irq_data) +{ + struct mp_chip_data *data = irq_data->chip_data; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void __unmask_ioapic(struct mp_chip_data *data) +{ + io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL); +} + +static void unmask_ioapic_irq(struct irq_data *irq_data) +{ + struct mp_chip_data *data = irq_data->chip_data; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + __unmask_ioapic(data); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * IO-APIC versions below 0x20 don't support EOI register. + * For the record, here is the information about various versions: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic + * version as 0x2. This is an error with documentation and these ICH chips + * use io-apic's of version 0x20. + * + * For IO-APIC's with EOI register, we use that to do an explicit EOI. + * Otherwise, we simulate the EOI message manually by changing the trigger + * mode to edge and then back to level, with RTE being masked during this. + */ +static void __eoi_ioapic_pin(int apic, int pin, int vector) +{ + if (mpc_ioapic_ver(apic) >= 0x20) { + io_apic_eoi(apic, vector); + } else { + struct IO_APIC_route_entry entry, entry1; + + entry = entry1 = __ioapic_read_entry(apic, pin); + + /* + * Mask the entry and change the trigger mode to edge. + */ + entry1.mask = IOAPIC_MASKED; + entry1.trigger = IOAPIC_EDGE; + + __ioapic_write_entry(apic, pin, entry1); + + /* + * Restore the previous level triggered entry. + */ + __ioapic_write_entry(apic, pin, entry); + } +} + +static void eoi_ioapic_pin(int vector, struct mp_chip_data *data) +{ + unsigned long flags; + struct irq_pin_list *entry; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, data->irq_2_pin) + __eoi_ioapic_pin(entry->apic, entry->pin, vector); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + entry = ioapic_read_entry(apic, pin); + if (entry.delivery_mode == dest_SMI) + return; + + /* + * Make sure the entry is masked and re-read the contents to check + * if it is a level triggered pin and if the remote-IRR is set. + */ + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; + ioapic_write_entry(apic, pin, entry); + entry = ioapic_read_entry(apic, pin); + } + + if (entry.irr) { + unsigned long flags; + + /* + * Make sure the trigger mode is set to level. Explicit EOI + * doesn't clear the remote-IRR if the trigger mode is not + * set to level. + */ + if (entry.trigger == IOAPIC_EDGE) { + entry.trigger = IOAPIC_LEVEL; + ioapic_write_entry(apic, pin, entry); + } + raw_spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_pin(apic, pin, entry.vector); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + } + + /* + * Clear the rest of the bits in the IO-APIC RTE except for the mask + * bit. + */ + ioapic_mask_entry(apic, pin); + entry = ioapic_read_entry(apic, pin); + if (entry.irr) + pr_err("Unable to reset IRR for apic: %d, pin :%d\n", + mpc_ioapic_id(apic), pin); +} + +void clear_IO_APIC (void) +{ + int apic, pin; + + for_each_ioapic_pin(apic, pin) + clear_IO_APIC_pin(apic, pin); +} + +#ifdef CONFIG_X86_32 +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries[MAX_PIRQS] = { + [0 ... MAX_PIRQS - 1] = -1 +}; + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + +/* + * Saves all the IO-APIC RTE's + */ +int save_ioapic_entries(void) +{ + int apic, pin; + int err = 0; + + for_each_ioapic(apic) { + if (!ioapics[apic].saved_registers) { + err = -ENOMEM; + continue; + } + + for_each_pin(apic, pin) + ioapics[apic].saved_registers[pin] = + ioapic_read_entry(apic, pin); + } + + return err; +} + +/* + * Mask all IO APIC entries. + */ +void mask_ioapic_entries(void) +{ + int apic, pin; + + for_each_ioapic(apic) { + if (!ioapics[apic].saved_registers) + continue; + + for_each_pin(apic, pin) { + struct IO_APIC_route_entry entry; + + entry = ioapics[apic].saved_registers[pin]; + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; + ioapic_write_entry(apic, pin, entry); + } + } + } +} + +/* + * Restore IO APIC entries which was saved in the ioapic structure. + */ +int restore_ioapic_entries(void) +{ + int apic, pin; + + for_each_ioapic(apic) { + if (!ioapics[apic].saved_registers) + continue; + + for_each_pin(apic, pin) + ioapic_write_entry(apic, pin, + ioapics[apic].saved_registers[pin]); + } + return 0; +} + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int ioapic_idx, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].irqtype == type && + (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) || + mp_irqs[i].dstapic == MP_APIC_ALL) && + mp_irqs[i].dstirq == pin) + return i; + + return -1; +} + +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].irqtype == type) && + (mp_irqs[i].srcbusirq == irq)) + + return mp_irqs[i].dstirq; + } + return -1; +} + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].irqtype == type) && + (mp_irqs[i].srcbusirq == irq)) + break; + } + + if (i < mp_irq_entries) { + int ioapic_idx; + + for_each_ioapic(ioapic_idx) + if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic) + return ioapic_idx; + } + + return -1; +} + +#ifdef CONFIG_EISA +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < nr_legacy_irqs()) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +#endif + +/* ISA interrupts are always active high edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (IOAPIC_EDGE) +#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH) + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) +#define default_EISA_polarity(idx) default_ISA_polarity(idx) + +/* PCI interrupts are always active low level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (IOAPIC_LEVEL) +#define default_PCI_polarity(idx) (IOAPIC_POL_LOW) + +static int irq_polarity(int idx) +{ + int bus = mp_irqs[idx].srcbus; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) { + case MP_IRQPOL_DEFAULT: + /* conforms to spec, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + return default_ISA_polarity(idx); + else + return default_PCI_polarity(idx); + case MP_IRQPOL_ACTIVE_HIGH: + return IOAPIC_POL_HIGH; + case MP_IRQPOL_RESERVED: + pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); + fallthrough; + case MP_IRQPOL_ACTIVE_LOW: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_POL_LOW; + } +} + +#ifdef CONFIG_EISA +static int eisa_irq_trigger(int idx, int bus, int trigger) +{ + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_PCI: + case MP_BUS_ISA: + return trigger; + case MP_BUS_EISA: + return default_EISA_trigger(idx); + } + pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); + return IOAPIC_LEVEL; +} +#else +static inline int eisa_irq_trigger(int idx, int bus, int trigger) +{ + return trigger; +} +#endif + +static int irq_trigger(int idx) +{ + int bus = mp_irqs[idx].srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) { + case MP_IRQTRIG_DEFAULT: + /* conforms to spec, ie. bus-type dependent trigger mode */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); + /* Take EISA into account */ + return eisa_irq_trigger(idx, bus, trigger); + case MP_IRQTRIG_EDGE: + return IOAPIC_EDGE; + case MP_IRQTRIG_RESERVED: + pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); + fallthrough; + case MP_IRQTRIG_LEVEL: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_LEVEL; + } +} + +void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, + int trigger, int polarity) +{ + init_irq_alloc_info(info, NULL); + info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info->ioapic.node = node; + info->ioapic.trigger = trigger; + info->ioapic.polarity = polarity; + info->ioapic.valid = 1; +} + +#ifndef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); +#endif + +static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, + struct irq_alloc_info *src, + u32 gsi, int ioapic_idx, int pin) +{ + int trigger, polarity; + + copy_irq_alloc_info(dst, src); + dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + dst->devid = mpc_ioapic_id(ioapic_idx); + dst->ioapic.pin = pin; + dst->ioapic.valid = 1; + if (src && src->ioapic.valid) { + dst->ioapic.node = src->ioapic.node; + dst->ioapic.trigger = src->ioapic.trigger; + dst->ioapic.polarity = src->ioapic.polarity; + } else { + dst->ioapic.node = NUMA_NO_NODE; + if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { + dst->ioapic.trigger = trigger; + dst->ioapic.polarity = polarity; + } else { + /* + * PCI interrupts are always active low level + * triggered. + */ + dst->ioapic.trigger = IOAPIC_LEVEL; + dst->ioapic.polarity = IOAPIC_POL_LOW; + } + } +} + +static int ioapic_alloc_attr_node(struct irq_alloc_info *info) +{ + return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE; +} + +static void mp_register_handler(unsigned int irq, unsigned long trigger) +{ + irq_flow_handler_t hdl; + bool fasteoi; + + if (trigger) { + irq_set_status_flags(irq, IRQ_LEVEL); + fasteoi = true; + } else { + irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } + + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge"); +} + +static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) +{ + struct mp_chip_data *data = irq_get_chip_data(irq); + + /* + * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger + * and polarity attirbutes. So allow the first user to reprogram the + * pin with real trigger and polarity attributes. + */ + if (irq < nr_legacy_irqs() && data->count == 1) { + if (info->ioapic.trigger != data->trigger) + mp_register_handler(irq, info->ioapic.trigger); + data->entry.trigger = data->trigger = info->ioapic.trigger; + data->entry.polarity = data->polarity = info->ioapic.polarity; + } + + return data->trigger == info->ioapic.trigger && + data->polarity == info->ioapic.polarity; +} + +static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, + struct irq_alloc_info *info) +{ + bool legacy = false; + int irq = -1; + int type = ioapics[ioapic].irqdomain_cfg.type; + + switch (type) { + case IOAPIC_DOMAIN_LEGACY: + /* + * Dynamically allocate IRQ number for non-ISA IRQs in the first + * 16 GSIs on some weird platforms. + */ + if (!ioapic_initialized || gsi >= nr_legacy_irqs()) + irq = gsi; + legacy = mp_is_legacy_irq(irq); + break; + case IOAPIC_DOMAIN_STRICT: + irq = gsi; + break; + case IOAPIC_DOMAIN_DYNAMIC: + break; + default: + WARN(1, "ioapic: unknown irqdomain type %d\n", type); + return -1; + } + + return __irq_domain_alloc_irqs(domain, irq, 1, + ioapic_alloc_attr_node(info), + info, legacy, NULL); +} + +/* + * Need special handling for ISA IRQs because there may be multiple IOAPIC pins + * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping + * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are + * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). + * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and + * some BIOSes may use MP Interrupt Source records to override IRQ numbers for + * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be + * multiple pins sharing the same legacy IRQ number when ACPI is disabled. + */ +static int alloc_isa_irq_from_domain(struct irq_domain *domain, + int irq, int ioapic, int pin, + struct irq_alloc_info *info) +{ + struct mp_chip_data *data; + struct irq_data *irq_data = irq_get_irq_data(irq); + int node = ioapic_alloc_attr_node(info); + + /* + * Legacy ISA IRQ has already been allocated, just add pin to + * the pin list assoicated with this IRQ and program the IOAPIC + * entry. The IOAPIC entry + */ + if (irq_data && irq_data->parent_data) { + if (!mp_check_pin_attr(irq, info)) + return -EBUSY; + if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, + info->ioapic.pin)) + return -ENOMEM; + } else { + info->flags |= X86_IRQ_ALLOC_LEGACY; + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, + NULL); + if (irq >= 0) { + irq_data = irq_domain_get_irq_data(domain, irq); + data = irq_data->chip_data; + data->isa_irq = true; + } + } + + return irq; +} + +static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, + unsigned int flags, struct irq_alloc_info *info) +{ + int irq; + bool legacy = false; + struct irq_alloc_info tmp; + struct mp_chip_data *data; + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + + if (!domain) + return -ENOSYS; + + if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { + irq = mp_irqs[idx].srcbusirq; + legacy = mp_is_legacy_irq(irq); + /* + * IRQ2 is unusable for historical reasons on systems which + * have a legacy PIC. See the comment vs. IRQ2 further down. + * + * If this gets removed at some point then the related code + * in lapic_assign_system_vectors() needs to be adjusted as + * well. + */ + if (legacy && irq == PIC_CASCADE_IR) + return -EINVAL; + } + + mutex_lock(&ioapic_mutex); + if (!(flags & IOAPIC_MAP_ALLOC)) { + if (!legacy) { + irq = irq_find_mapping(domain, pin); + if (irq == 0) + irq = -ENOENT; + } + } else { + ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin); + if (legacy) + irq = alloc_isa_irq_from_domain(domain, irq, + ioapic, pin, &tmp); + else if ((irq = irq_find_mapping(domain, pin)) == 0) + irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp); + else if (!mp_check_pin_attr(irq, &tmp)) + irq = -EBUSY; + if (irq >= 0) { + data = irq_get_chip_data(irq); + data->count++; + } + } + mutex_unlock(&ioapic_mutex); + + return irq; +} + +static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) +{ + u32 gsi = mp_pin_to_gsi(ioapic, pin); + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].dstirq != pin) + pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); + +#ifdef CONFIG_X86_32 + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + int irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + return irq; + } + } + } +#endif + + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL); +} + +int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) +{ + int ioapic, pin, idx; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -ENODEV; + + pin = mp_find_ioapic_pin(ioapic, gsi); + idx = find_irq_entry(ioapic, pin, mp_INT); + if ((flags & IOAPIC_MAP_CHECK) && idx < 0) + return -ENODEV; + + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info); +} + +void mp_unmap_irq(int irq) +{ + struct irq_data *irq_data = irq_get_irq_data(irq); + struct mp_chip_data *data; + + if (!irq_data || !irq_data->domain) + return; + + data = irq_data->chip_data; + if (!data || data->isa_irq) + return; + + mutex_lock(&ioapic_mutex); + if (--data->count == 0) + irq_domain_free_irqs(irq, 1); + mutex_unlock(&ioapic_mutex); +} + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int irq, i, best_ioapic = -1, best_idx = -1; + + apic_printk(APIC_DEBUG, + "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, + "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + int ioapic_idx, found = 0; + + if (bus != lbus || mp_irqs[i].irqtype != mp_INT || + slot != ((mp_irqs[i].srcbusirq >> 2) & 0x1f)) + continue; + + for_each_ioapic(ioapic_idx) + if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic || + mp_irqs[i].dstapic == MP_APIC_ALL) { + found = 1; + break; + } + if (!found) + continue; + + /* Skip ISA IRQs */ + irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq, 0); + if (irq > 0 && !IO_APIC_IRQ(irq)) + continue; + + if (pin == (mp_irqs[i].srcbusirq & 3)) { + best_idx = i; + best_ioapic = ioapic_idx; + goto out; + } + + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_idx < 0) { + best_idx = i; + best_ioapic = ioapic_idx; + } + } + if (best_idx < 0) + return -1; + +out: + return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, + IOAPIC_MAP_ALLOC); +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +static struct irq_chip ioapic_chip, ioapic_ir_chip; + +static void __init setup_IO_APIC_irqs(void) +{ + unsigned int ioapic, pin; + int idx; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for_each_ioapic_pin(ioapic, pin) { + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + apic_printk(APIC_VERBOSE, + KERN_DEBUG " apic %d pin %d not connected\n", + mpc_ioapic_id(ioapic), pin); + else + pin_2_irq(idx, ioapic, pin, + ioapic ? 0 : IOAPIC_MAP_ALLOC); + } +} + +void ioapic_zap_locks(void) +{ + raw_spin_lock_init(&ioapic_lock); +} + +static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) +{ + int i; + char buf[256]; + struct IO_APIC_route_entry entry; + struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry; + + printk(KERN_DEBUG "IOAPIC %d:\n", apic); + for (i = 0; i <= nr_entries; i++) { + entry = ioapic_read_entry(apic, i); + snprintf(buf, sizeof(buf), + " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", + i, + entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ", + entry.trigger == IOAPIC_LEVEL ? "level" : "edge ", + entry.polarity == IOAPIC_POL_LOW ? "low " : "high", + entry.vector, entry.irr, entry.delivery_status); + if (ir_entry->format) + printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", + buf, (ir_entry->index2 << 15) | ir_entry->index, + ir_entry->zero); + else + printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", + buf, + entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ? + "logical " : "physical", + entry.dest, entry.delivery_mode); + } +} + +static void __init print_IO_APIC(int ioapic_idx) +{ + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + reg_01.raw = io_apic_read(ioapic_idx, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(ioapic_idx, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(ioapic_idx, 3); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); + + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); + printk(KERN_DEBUG "....... : max redirection entries: %02X\n", + reg_01.bits.entries); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %02X\n", + reg_01.bits.version); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + } + + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); + io_apic_print_entries(ioapic_idx, reg_01.bits.entries); +} + +void __init print_IO_APICs(void) +{ + int ioapic_idx; + unsigned int irq; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for_each_ioapic(ioapic_idx) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mpc_ioapic_id(ioapic_idx), + ioapics[ioapic_idx].nr_registers); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for_each_ioapic(ioapic_idx) + print_IO_APIC(ioapic_idx); + + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for_each_active_irq(irq) { + struct irq_pin_list *entry; + struct irq_chip *chip; + struct mp_chip_data *data; + + chip = irq_get_chip(irq); + if (chip != &ioapic_chip && chip != &ioapic_ir_chip) + continue; + data = irq_get_chip_data(irq); + if (!data) + continue; + if (list_empty(&data->irq_2_pin)) + continue; + + printk(KERN_DEBUG "IRQ%d ", irq); + for_each_irq_pin(entry, data->irq_2_pin) + pr_cont("-> %d:%d", entry->apic, entry->pin); + pr_cont("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); +} + +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + +void __init enable_IO_APIC(void) +{ + int i8259_apic, i8259_pin; + int apic, pin; + + if (skip_ioapic_setup) + nr_ioapics = 0; + + if (!nr_legacy_irqs() || !nr_ioapics) + return; + + for_each_ioapic_pin(apic, pin) { + /* See if any of the pins is in ExtINT mode */ + struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin); + + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. + */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; + } + } + found_i8259: + /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); + /* Trust the MP table if nothing is setup in the hardware */ + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + ioapic_i8259.pin = i8259_pin; + ioapic_i8259.apic = i8259_apic; + } + /* Complain if the MP table and the hardware disagree */ + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + { + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +void native_restore_boot_irq_mode(void) +{ + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered. + */ + if (ioapic_i8259.pin != -1) { + struct IO_APIC_route_entry entry; + + memset(&entry, 0, sizeof(entry)); + entry.mask = IOAPIC_UNMASKED; + entry.trigger = IOAPIC_EDGE; + entry.polarity = IOAPIC_POL_HIGH; + entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry.delivery_mode = dest_ExtINT; + entry.dest = read_apic_id(); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); + } + + if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config()) + disconnect_bsp_APIC(ioapic_i8259.pin != -1); +} + +void restore_boot_irq_mode(void) +{ + if (!nr_legacy_irqs()) + return; + + x86_apic_ops.restore(); +} + +#ifdef CONFIG_X86_32 +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 + */ +void __init setup_ioapic_ids_from_mpc_nocheck(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int ioapic_idx; + int i; + unsigned char old_id; + unsigned long flags; + + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for_each_ioapic(ioapic_idx) { + /* Read the register 0 value */ + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mpc_ioapic_id(ioapic_idx); + + if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (apic->check_apicid_used(&phys_id_present_map, + mpc_ioapic_id(ioapic_idx))) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + ioapics[ioapic_idx].mp_config.apicid = i; + } else { + physid_mask_t tmp; + apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx), + &tmp); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mpc_ioapic_id(ioapic_idx)); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mpc_ioapic_id(ioapic_idx)) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].dstapic == old_id) + mp_irqs[i].dstapic + = mpc_ioapic_id(ioapic_idx); + + /* + * Update the ID register according to the right value + * from the MPC table if they are different. + */ + if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID) + continue; + + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mpc_ioapic_id(ioapic_idx)); + + reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic_idx, 0, reg_00.raw); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) + pr_cont("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} + +void __init setup_ioapic_ids_from_mpc(void) +{ + + if (acpi_ioapic) + return; + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(boot_cpu_apic_version)) + return; + setup_ioapic_ids_from_mpc_nocheck(); +} +#endif + +int no_timer_check __initdata; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + +static void __init delay_with_tsc(void) +{ + unsigned long long start, now; + unsigned long end = jiffies + 4; + + start = rdtsc(); + + /* + * We don't know the TSC frequency yet, but waiting for + * 40000000000/HZ TSC cycles is safe: + * 4 GHz == 10 jiffies + * 1 GHz == 40 jiffies + */ + do { + rep_nop(); + now = rdtsc(); + } while ((now - start) < 40000000000ULL / HZ && + time_before_eq(jiffies, end)); +} + +static void __init delay_without_tsc(void) +{ + unsigned long end = jiffies + 4; + int band = 1; + + /* + * We don't know any frequency yet, but waiting for + * 40940000000/HZ cycles is safe: + * 4 GHz == 10 jiffies + * 1 GHz == 40 jiffies + * 1 << 1 + 1 << 2 +...+ 1 << 11 = 4094 + */ + do { + __delay(((1U << band++) * 10000000UL) / HZ); + } while (band < 12 && time_before_eq(jiffies, end)); +} + +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + unsigned long flags; + + if (no_timer_check) + return 1; + + local_save_flags(flags); + local_irq_enable(); + + if (boot_cpu_has(X86_FEATURE_TSC)) + delay_with_tsc(); + else + delay_without_tsc(); + + local_irq_restore(flags); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + + /* jiffies wrap? */ + if (time_after(jiffies, t1 + 4)) + return 1; + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... + */ +static unsigned int startup_ioapic_irq(struct irq_data *data) +{ + int was_pending = 0, irq = data->irq; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + if (irq < nr_legacy_irqs()) { + legacy_pic->mask(irq); + if (legacy_pic->irq_pending(irq)) + was_pending = 1; + } + __unmask_ioapic(data->chip_data); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +atomic_t irq_mis_count; + +#ifdef CONFIG_GENERIC_PENDING_IRQ +static bool io_apic_level_ack_pending(struct mp_chip_data *data) +{ + struct irq_pin_list *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, data->irq_2_pin) { + unsigned int reg; + int pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (reg & IO_APIC_REDIR_REMOTE_IRR) { + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} + +static inline bool ioapic_prepare_move(struct irq_data *data) +{ + /* If we are moving the IRQ we need to mask it */ + if (unlikely(irqd_is_setaffinity_pending(data))) { + if (!irqd_irq_masked(data)) + mask_ioapic_irq(data); + return true; + } + return false; +} + +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) +{ + if (unlikely(moveit)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(data->chip_data)) + irq_move_masked_irq(data); + /* If the IRQ is masked in the core, leave it: */ + if (!irqd_irq_masked(data)) + unmask_ioapic_irq(data); + } +} +#else +static inline bool ioapic_prepare_move(struct irq_data *data) +{ + return false; +} +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) +{ +} +#endif + +static void ioapic_ack_level(struct irq_data *irq_data) +{ + struct irq_cfg *cfg = irqd_cfg(irq_data); + unsigned long v; + bool moveit; + int i; + + irq_complete_move(cfg); + moveit = ioapic_prepare_move(irq_data); + + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + * + * Also in the case when cpu goes offline, fixup_irqs() will forward + * any unhandled interrupt on the offlined cpu to the new cpu + * destination that is handling the corresponding interrupt. This + * interrupt forwarding is done via IPI's. Hence, in this case also + * level-triggered io-apic interrupt will be seen as an edge + * interrupt in the IRR. And we can't rely on the cpu's EOI + * to be broadcasted to the IO-APIC's which will clear the remoteIRR + * corresponding to the level-triggered interrupt. Hence on IO-APIC's + * supporting EOI register, we do an explicit EOI to clear the + * remote IRR and on IO-APIC's which don't have an EOI register, + * we use the above logic (mask+edge followed by unmask+level) from + * Manfred Spraul to clear the remote IRR. + */ + i = cfg->vector; + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly. + */ + ack_APIC_irq(); + + /* + * Tail end of clearing remote IRR bit (either by delivering the EOI + * message via io-apic EOI register write or simulating it using + * mask+edge followed by unnask+level logic) manually when the + * level triggered interrupt is seen as the edge triggered interrupt + * at the cpu. + */ + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + eoi_ioapic_pin(cfg->vector, irq_data->chip_data); + } + + ioapic_finish_move(irq_data, moveit); +} + +static void ioapic_ir_ack_level(struct irq_data *irq_data) +{ + struct mp_chip_data *data = irq_data->chip_data; + + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + apic_ack_irq(irq_data); + eoi_ioapic_pin(data->entry.vector, data); +} + +static void ioapic_configure_entry(struct irq_data *irqd) +{ + struct mp_chip_data *mpd = irqd->chip_data; + struct irq_cfg *cfg = irqd_cfg(irqd); + struct irq_pin_list *entry; + + /* + * Only update when the parent is the vector domain, don't touch it + * if the parent is the remapping domain. Check the installed + * ioapic chip to verify that. + */ + if (irqd->chip == &ioapic_chip) { + mpd->entry.dest = cfg->dest_apicid; + mpd->entry.vector = cfg->vector; + } + for_each_irq_pin(entry, mpd->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); +} + +static int ioapic_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) +{ + struct irq_data *parent = irq_data->parent_data; + unsigned long flags; + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + raw_spin_lock_irqsave(&ioapic_lock, flags); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) + ioapic_configure_entry(irq_data); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; +} + +/* + * Interrupt shutdown masks the ioapic pin, but the interrupt might already + * be in flight, but not yet serviced by the target CPU. That means + * __synchronize_hardirq() would return and claim that everything is calmed + * down. So free_irq() would proceed and deactivate the interrupt and free + * resources. + * + * Once the target CPU comes around to service it it will find a cleared + * vector and complain. While the spurious interrupt is harmless, the full + * release of resources might prevent the interrupt from being acknowledged + * which keeps the hardware in a weird state. + * + * Verify that the corresponding Remote-IRR bits are clear. + */ +static int ioapic_irq_get_chip_state(struct irq_data *irqd, + enum irqchip_irq_state which, + bool *state) +{ + struct mp_chip_data *mcd = irqd->chip_data; + struct IO_APIC_route_entry rentry; + struct irq_pin_list *p; + + if (which != IRQCHIP_STATE_ACTIVE) + return -EINVAL; + + *state = false; + raw_spin_lock(&ioapic_lock); + for_each_irq_pin(p, mcd->irq_2_pin) { + rentry = __ioapic_read_entry(p->apic, p->pin); + /* + * The remote IRR is only valid in level trigger mode. It's + * meaning is undefined for edge triggered interrupts and + * irrelevant because the IO-APIC treats them as fire and + * forget. + */ + if (rentry.irr && rentry.trigger) { + *state = true; + break; + } + } + raw_spin_unlock(&ioapic_lock); + return 0; +} + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ack_level, + .irq_set_affinity = ioapic_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_get_irqchip_state = ioapic_irq_get_chip_state, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, +}; + +static struct irq_chip ioapic_ir_chip __read_mostly = { + .name = "IR-IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ir_ack_level, + .irq_set_affinity = ioapic_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_get_irqchip_state = ioapic_irq_get_chip_state, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, +}; + +static inline void init_IO_APIC_traps(void) +{ + struct irq_cfg *cfg; + unsigned int irq; + + for_each_active_irq(irq) { + cfg = irq_cfg(irq); + if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < nr_legacy_irqs()) + legacy_pic->make_irq(irq); + else + /* Strange. Oh, well.. */ + irq_set_chip(irq, &no_irq_chip); + } + } +} + +/* + * The local APIC irq-chip implementation: + */ + +static void mask_lapic_irq(struct irq_data *data) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void unmask_lapic_irq(struct irq_data *data) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void ack_lapic_irq(struct irq_data *data) +{ + ack_APIC_irq(); +} + +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC", + .irq_mask = mask_lapic_irq, + .irq_unmask = unmask_lapic_irq, + .irq_ack = ack_lapic_irq, +}; + +static void lapic_register_intr(int irq) +{ + irq_clear_status_flags(irq, IRQ_LEVEL); + irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + "edge"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. --macro + */ +static inline void __init unlock_ExtINT_logic(void) +{ + int apic, pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + + pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } + apic = find_isa_irq_apic(8, mp_INT); + if (apic == -1) { + WARN_ON_ONCE(1); + return; + } + + entry0 = ioapic_read_entry(apic, pin); + clear_IO_APIC_pin(apic, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry1.mask = IOAPIC_UNMASKED; + entry1.dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = IOAPIC_EDGE; + entry1.vector = 0; + + ioapic_write_entry(apic, pin, entry1); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(apic, pin); + + ioapic_write_entry(apic, pin, entry0); +} + +static int disable_timer_pin_1 __initdata; +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", disable_timer_pin_setup); + +static int mp_alloc_timer_irq(int ioapic, int pin) +{ + int irq = -1; + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + + if (domain) { + struct irq_alloc_info info; + + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); + info.devid = mpc_ioapic_id(ioapic); + info.ioapic.pin = pin; + mutex_lock(&ioapic_mutex); + irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); + mutex_unlock(&ioapic_mutex); + } + + return irq; +} + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + * + * FIXME: really need to revamp this for all platforms. + */ +static inline void __init check_timer(void) +{ + struct irq_data *irq_data = irq_get_irq_data(0); + struct mp_chip_data *data = irq_data->chip_data; + struct irq_cfg *cfg = irqd_cfg(irq_data); + int node = cpu_to_node(0); + int apic1, pin1, apic2, pin2; + unsigned long flags; + int no_pin1 = 0; + + if (!global_clock_event) + return; + + local_irq_save(flags); + + /* + * get/set the timer IRQ vector: + */ + legacy_pic->mask(0); + + /* + * As IRQ0 is to be enabled in the 8259A, the virtual + * wire has to be disabled in the local APIC. Also + * timer interrupts need to be acknowledged manually in + * the 8259A for the i82489DX when using the NMI + * watchdog as that APIC treats NMIs as level-triggered. + * The AEOI mode will finish them in the 8259A + * automatically. + */ + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + legacy_pic->init(1); + + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); + + /* + * Some BIOS writers are clueless and report the ExtINTA + * I/O APIC input from the cascaded 8259A as the timer + * interrupt input. So just in case, if only one pin + * was found above, try it both directly and through the + * 8259A. + */ + if (pin1 == -1) { + panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC"); + pin1 = pin2; + apic1 = apic2; + no_pin1 = 1; + } else if (pin2 == -1) { + pin2 = pin1; + apic2 = apic1; + } + + if (pin1 != -1) { + /* Ok, does IRQ0 through the IOAPIC work? */ + if (no_pin1) { + mp_alloc_timer_irq(apic1, pin1); + } else { + /* + * for edge trigger, it's already unmasked, + * so only need to unmask if it is level-trigger + * do we really have level trigger timer? + */ + int idx; + idx = find_irq_entry(apic1, pin1, mp_INT); + if (idx != -1 && irq_trigger(idx)) + unmask_ioapic_irq(irq_get_irq_data(0)); + } + irq_domain_deactivate_irq(irq_data); + irq_domain_activate_irq(irq_data, false); + if (timer_irq_works()) { + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); + goto out; + } + panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC"); + local_irq_disable(); + clear_IO_APIC_pin(apic1, pin1); + if (!no_pin1) + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); + + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); + irq_domain_deactivate_irq(irq_data); + irq_domain_activate_irq(irq_data, false); + legacy_pic->unmask(0); + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); + goto out; + } + /* + * Cleanup, just in case ... + */ + local_irq_disable(); + legacy_pic->mask(0); + clear_IO_APIC_pin(apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); + } + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); + + lapic_register_intr(0); + apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ + legacy_pic->unmask(0); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + local_irq_disable(); + legacy_pic->mask(0); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); + + legacy_pic->init(0); + legacy_pic->make_irq(0); + apic_write(APIC_LVT0, APIC_DM_EXTINT); + legacy_pic->unmask(0); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + local_irq_disable(); + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + if (apic_is_x2apic_enabled()) + apic_printk(APIC_QUIET, KERN_INFO + "Perhaps problem with the pre-enabled x2apic mode\n" + "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option.\n"); +out: + local_irq_restore(flags); +} + +/* + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available + * to devices. However there may be an I/O APIC pin available for + * this interrupt regardless. The pin may be left unconnected, but + * typically it will be reused as an ExtINT cascade interrupt for + * the master 8259A. In the MPS case such a pin will normally be + * reported as an ExtINT interrupt in the MP table. With ACPI + * there is no provision for ExtINT interrupts, and in the absence + * of an override it would be treated as an ordinary ISA I/O APIC + * interrupt, that is edge-triggered and unmasked by default. We + * used to do this, but it caused problems on some systems because + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using + * the same ExtINT cascade interrupt to drive the local APIC of the + * bootstrap processor. Therefore we refrain from routing IRQ2 to + * the I/O APIC in all cases now. No actual device should request + * it anyway. --macro + */ +#define PIC_IRQS (1UL << PIC_CASCADE_IR) + +static int mp_irqdomain_create(int ioapic) +{ + struct irq_alloc_info info; + struct irq_domain *parent; + int hwirqs = mp_ioapic_pin_count(ioapic); + struct ioapic *ip = &ioapics[ioapic]; + struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); + struct fwnode_handle *fn; + char *name = "IO-APIC"; + + if (cfg->type == IOAPIC_DOMAIN_INVALID) + return 0; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT; + info.devid = mpc_ioapic_id(ioapic); + parent = irq_remapping_get_irq_domain(&info); + if (!parent) + parent = x86_vector_domain; + else + name = "IO-APIC-IR"; + + /* Handle device tree enumerated APICs proper */ + if (cfg->dev) { + fn = of_node_to_fwnode(cfg->dev); + } else { + fn = irq_domain_alloc_named_id_fwnode(name, ioapic); + if (!fn) + return -ENOMEM; + } + + ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, + (void *)(long)ioapic); + + if (!ip->irqdomain) { + /* Release fw handle if it was allocated above */ + if (!cfg->dev) + irq_domain_free_fwnode(fn); + return -ENOMEM; + } + + ip->irqdomain->parent = parent; + + if (cfg->type == IOAPIC_DOMAIN_LEGACY || + cfg->type == IOAPIC_DOMAIN_STRICT) + ioapic_dynirq_base = max(ioapic_dynirq_base, + gsi_cfg->gsi_end + 1); + + return 0; +} + +static void ioapic_destroy_irqdomain(int idx) +{ + struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg; + struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode; + + if (ioapics[idx].irqdomain) { + irq_domain_remove(ioapics[idx].irqdomain); + if (!cfg->dev) + irq_domain_free_fwnode(fn); + ioapics[idx].irqdomain = NULL; + } +} + +void __init setup_IO_APIC(void) +{ + int ioapic; + + if (skip_ioapic_setup || !nr_ioapics) + return; + + io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL; + + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + for_each_ioapic(ioapic) + BUG_ON(mp_irqdomain_create(ioapic)); + + /* + * Set up IO-APIC IRQ routing. + */ + x86_init.mpparse.setup_ioapic_ids(); + + sync_Arb_IDs(); + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + if (nr_legacy_irqs()) + check_timer(); + + ioapic_initialized = 1; +} + +static void resume_ioapic_id(int ioapic_idx) +{ + unsigned long flags; + union IO_APIC_reg_00 reg_00; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) { + reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); + io_apic_write(ioapic_idx, 0, reg_00.raw); + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void ioapic_resume(void) +{ + int ioapic_idx; + + for_each_ioapic_reverse(ioapic_idx) + resume_ioapic_id(ioapic_idx); + + restore_ioapic_entries(); +} + +static struct syscore_ops ioapic_syscore_ops = { + .suspend = save_ioapic_entries, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_ops(void) +{ + register_syscore_ops(&ioapic_syscore_ops); + + return 0; +} + +device_initcall(ioapic_init_ops); + +static int io_apic_get_redir_entries(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* The register returns the maximum index redir index + * supported, which is one less than the total number of redir + * entries. + */ + return reg_01.bits.entries + 1; +} + +unsigned int arch_dynirq_lower_bound(unsigned int from) +{ + unsigned int ret; + + /* + * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use + * gsi_top if ioapic_dynirq_base hasn't been initialized yet. + */ + ret = ioapic_dynirq_base ? : gsi_top; + + /* + * For DT enabled machines ioapic_dynirq_base is irrelevant and + * always 0. gsi_top can be 0 if there is no IO/APIC registered. + * 0 is an invalid interrupt number for dynamic allocations. Return + * @from instead. + */ + return ret ? : from; +} + +#ifdef CONFIG_X86_32 +static int io_apic_get_unique_id(int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (apic->check_apicid_used(&apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!apic->check_apicid_used(&apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + apic->apicid_to_cpu_present(apic_id, &tmp); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + pr_err("IOAPIC[%d]: Unable to change apic_id!\n", + ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} + +static u8 io_apic_unique_id(int idx, u8 id) +{ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(boot_cpu_apic_version)) + return io_apic_get_unique_id(idx, id); + else + return id; +} +#else +static u8 io_apic_unique_id(int idx, u8 id) +{ + union IO_APIC_reg_00 reg_00; + DECLARE_BITMAP(used, 256); + unsigned long flags; + u8 new_id; + int i; + + bitmap_zero(used, 256); + for_each_ioapic(i) + __set_bit(mpc_ioapic_id(i), used); + + /* Hand out the requested id if available */ + if (!test_bit(id, used)) + return id; + + /* + * Read the current id from the ioapic and keep it if + * available. + */ + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + new_id = reg_00.bits.ID; + if (!test_bit(new_id, used)) { + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Using reg apic_id %d instead of %d\n", + idx, new_id, id); + return new_id; + } + + /* + * Get the next free id and write it to the ioapic. + */ + new_id = find_first_zero_bit(used, 256); + reg_00.bits.ID = new_id; + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(idx, 0, reg_00.raw); + reg_00.raw = io_apic_read(idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + /* Sanity check */ + BUG_ON(reg_00.bits.ID != new_id); + + return new_id; +} +#endif + +static int io_apic_get_version(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} + +int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) +{ + int ioapic, pin, idx; + + if (skip_ioapic_setup) + return -1; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -1; + + pin = mp_find_ioapic_pin(ioapic, gsi); + if (pin < 0) + return -1; + + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + return -1; + + *trigger = irq_trigger(idx); + *polarity = irq_polarity(idx); + return 0; +} + +/* + * This function updates target affinity of IOAPIC interrupts to include + * the CPUs which came online during SMP bringup. + */ +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(void) +{ + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics == 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = memblock_alloc(n, SMP_CACHE_BYTES); + if (!mem) + panic("%s: Failed to allocate %lu bytes\n", __func__, n); + res = (void *)mem; + + mem += sizeof(struct resource) * nr_ioapics; + + for_each_ioapic(i) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + ioapics[i].iomem_res = &res[i]; + } + + ioapic_resources = res; + + return res; +} + +void __init io_apic_init_mappings(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + struct resource *ioapic_res; + int i; + + ioapic_res = ioapic_setup_resources(); + for_each_ioapic(i) { + if (smp_found_config) { + ioapic_phys = mpc_ioapic_addr(i); +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } +#endif + } else { +#ifdef CONFIG_X86_32 +fake_ioapic_page: +#endif + ioapic_phys = (unsigned long)memblock_alloc(PAGE_SIZE, + PAGE_SIZE); + if (!ioapic_phys) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), + ioapic_phys); + idx++; + + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; + ioapic_res++; + } +} + +void __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + if (nr_ioapics > 0) + printk(KERN_ERR + "IO APIC resources couldn't be allocated.\n"); + return; + } + + for_each_ioapic(i) { + insert_resource(&iomem_resource, r); + r++; + } +} + +int mp_find_ioapic(u32 gsi) +{ + int i; + + if (nr_ioapics == 0) + return -1; + + /* Find the IOAPIC that manages this GSI. */ + for_each_ioapic(i) { + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i); + if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + return -1; +} + +int mp_find_ioapic_pin(int ioapic, u32 gsi) +{ + struct mp_ioapic_gsi *gsi_cfg; + + if (WARN_ON(ioapic < 0)) + return -1; + + gsi_cfg = mp_ioapic_gsi_routing(ioapic); + if (WARN_ON(gsi > gsi_cfg->gsi_end)) + return -1; + + return gsi - gsi_cfg->gsi_base; +} + +static int bad_ioapic_register(int idx) +{ + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + + reg_00.raw = io_apic_read(idx, 0); + reg_01.raw = io_apic_read(idx, 1); + reg_02.raw = io_apic_read(idx, 2); + + if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) { + pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n", + mpc_ioapic_addr(idx)); + return 1; + } + + return 0; +} + +static int find_free_ioapic_entry(void) +{ + int idx; + + for (idx = 0; idx < MAX_IO_APICS; idx++) + if (ioapics[idx].nr_registers == 0) + return idx; + + return MAX_IO_APICS; +} + +/** + * mp_register_ioapic - Register an IOAPIC device + * @id: hardware IOAPIC ID + * @address: physical address of IOAPIC register area + * @gsi_base: base of GSI associated with the IOAPIC + * @cfg: configuration information for the IOAPIC + */ +int mp_register_ioapic(int id, u32 address, u32 gsi_base, + struct ioapic_domain_cfg *cfg) +{ + bool hotplug = !!ioapic_initialized; + struct mp_ioapic_gsi *gsi_cfg; + int idx, ioapic, entries; + u32 gsi_end; + + if (!address) { + pr_warn("Bogus (zero) I/O APIC address found, skipping!\n"); + return -EINVAL; + } + for_each_ioapic(ioapic) + if (ioapics[ioapic].mp_config.apicaddr == address) { + pr_warn("address 0x%x conflicts with IOAPIC%d\n", + address, ioapic); + return -EEXIST; + } + + idx = find_free_ioapic_entry(); + if (idx >= MAX_IO_APICS) { + pr_warn("Max # of I/O APICs (%d) exceeded (found %d), skipping\n", + MAX_IO_APICS, idx); + return -ENOSPC; + } + + ioapics[idx].mp_config.type = MP_IOAPIC; + ioapics[idx].mp_config.flags = MPC_APIC_USABLE; + ioapics[idx].mp_config.apicaddr = address; + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + if (bad_ioapic_register(idx)) { + clear_fixmap(FIX_IO_APIC_BASE_0 + idx); + return -ENODEV; + } + + ioapics[idx].mp_config.apicid = io_apic_unique_id(idx, id); + ioapics[idx].mp_config.apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ + entries = io_apic_get_redir_entries(idx); + gsi_end = gsi_base + entries - 1; + for_each_ioapic(ioapic) { + gsi_cfg = mp_ioapic_gsi_routing(ioapic); + if ((gsi_base >= gsi_cfg->gsi_base && + gsi_base <= gsi_cfg->gsi_end) || + (gsi_end >= gsi_cfg->gsi_base && + gsi_end <= gsi_cfg->gsi_end)) { + pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n", + gsi_base, gsi_end, + gsi_cfg->gsi_base, gsi_cfg->gsi_end); + clear_fixmap(FIX_IO_APIC_BASE_0 + idx); + return -ENOSPC; + } + } + gsi_cfg = mp_ioapic_gsi_routing(idx); + gsi_cfg->gsi_base = gsi_base; + gsi_cfg->gsi_end = gsi_end; + + ioapics[idx].irqdomain = NULL; + ioapics[idx].irqdomain_cfg = *cfg; + + /* + * If mp_register_ioapic() is called during early boot stage when + * walking ACPI/SFI/DT tables, it's too early to create irqdomain, + * we are still using bootmem allocator. So delay it to setup_IO_APIC(). + */ + if (hotplug) { + if (mp_irqdomain_create(idx)) { + clear_fixmap(FIX_IO_APIC_BASE_0 + idx); + return -ENOMEM; + } + alloc_ioapic_saved_registers(idx); + } + + if (gsi_cfg->gsi_end >= gsi_top) + gsi_top = gsi_cfg->gsi_end + 1; + if (nr_ioapics <= idx) + nr_ioapics = idx + 1; + + /* Set nr_registers to mark entry present */ + ioapics[idx].nr_registers = entries; + + pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n", + idx, mpc_ioapic_id(idx), + mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + gsi_cfg->gsi_base, gsi_cfg->gsi_end); + + return 0; +} + +int mp_unregister_ioapic(u32 gsi_base) +{ + int ioapic, pin; + int found = 0; + + for_each_ioapic(ioapic) + if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) { + found = 1; + break; + } + if (!found) { + pr_warn("can't find IOAPIC for GSI %d\n", gsi_base); + return -ENODEV; + } + + for_each_pin(ioapic, pin) { + u32 gsi = mp_pin_to_gsi(ioapic, pin); + int irq = mp_map_gsi_to_irq(gsi, 0, NULL); + struct mp_chip_data *data; + + if (irq >= 0) { + data = irq_get_chip_data(irq); + if (data && data->count) { + pr_warn("pin%d on IOAPIC%d is still in use.\n", + pin, ioapic); + return -EBUSY; + } + } + } + + /* Mark entry not present */ + ioapics[ioapic].nr_registers = 0; + ioapic_destroy_irqdomain(ioapic); + free_ioapic_saved_registers(ioapic); + if (ioapics[ioapic].iomem_res) + release_resource(ioapics[ioapic].iomem_res); + clear_fixmap(FIX_IO_APIC_BASE_0 + ioapic); + memset(&ioapics[ioapic], 0, sizeof(ioapics[ioapic])); + + return 0; +} + +int mp_ioapic_registered(u32 gsi_base) +{ + int ioapic; + + for_each_ioapic(ioapic) + if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) + return 1; + + return 0; +} + +static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, + struct irq_alloc_info *info) +{ + if (info && info->ioapic.valid) { + data->trigger = info->ioapic.trigger; + data->polarity = info->ioapic.polarity; + } else if (acpi_get_override_irq(gsi, &data->trigger, + &data->polarity) < 0) { + /* PCI interrupts are always active low level triggered. */ + data->trigger = IOAPIC_LEVEL; + data->polarity = IOAPIC_POL_LOW; + } +} + +static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, + struct IO_APIC_route_entry *entry) +{ + memset(entry, 0, sizeof(*entry)); + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->dest = cfg->dest_apicid; + entry->vector = cfg->vector; + entry->trigger = data->trigger; + entry->polarity = data->polarity; + /* + * Mask level triggered irqs. Edge triggered irqs are masked + * by the irq core code in case they fire. + */ + if (data->trigger == IOAPIC_LEVEL) + entry->mask = IOAPIC_MASKED; + else + entry->mask = IOAPIC_UNMASKED; +} + +int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + int ret, ioapic, pin; + struct irq_cfg *cfg; + struct irq_data *irq_data; + struct mp_chip_data *data; + struct irq_alloc_info *info = arg; + unsigned long flags; + + if (!info || nr_irqs > 1) + return -EINVAL; + irq_data = irq_domain_get_irq_data(domain, virq); + if (!irq_data) + return -EINVAL; + + ioapic = mp_irqdomain_ioapic_idx(domain); + pin = info->ioapic.pin; + if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) + return -EEXIST; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + info->ioapic.entry = &data->entry; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(data); + return ret; + } + + INIT_LIST_HEAD(&data->irq_2_pin); + irq_data->hwirq = info->ioapic.pin; + irq_data->chip = (domain->parent == x86_vector_domain) ? + &ioapic_chip : &ioapic_ir_chip; + irq_data->chip_data = data; + mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); + + cfg = irqd_cfg(irq_data); + add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); + + local_irq_save(flags); + if (info->ioapic.entry) + mp_setup_entry(cfg, data, info->ioapic.entry); + mp_register_handler(virq, data->trigger); + if (virq < nr_legacy_irqs()) + legacy_pic->mask(virq); + local_irq_restore(flags); + + apic_printk(APIC_VERBOSE, KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", + ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, + virq, data->trigger, data->polarity, cfg->dest_apicid); + + return 0; +} + +void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *irq_data; + struct mp_chip_data *data; + + BUG_ON(nr_irqs != 1); + irq_data = irq_domain_get_irq_data(domain, virq); + if (irq_data && irq_data->chip_data) { + data = irq_data->chip_data; + __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); + WARN_ON(!list_empty(&data->irq_2_pin)); + kfree(irq_data->chip_data); + } + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +int mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool reserve) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + ioapic_configure_entry(irq_data); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return 0; +} + +void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + /* It won't be called for IRQ with multiple IOAPIC pins associated */ + ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); +} + +int mp_irqdomain_ioapic_idx(struct irq_domain *domain) +{ + return (int)(long)domain->host_data; +} + +const struct irq_domain_ops mp_ioapic_irqdomain_ops = { + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, +}; diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c new file mode 100644 index 000000000..387154e39 --- /dev/null +++ b/arch/x86/kernel/apic/ipi.c @@ -0,0 +1,331 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#include "local.h" + +DEFINE_STATIC_KEY_FALSE(apic_use_ipi_shorthand); + +#ifdef CONFIG_SMP +static int apic_ipi_shorthand_off __ro_after_init; + +static __init int apic_ipi_shorthand(char *str) +{ + get_option(&str, &apic_ipi_shorthand_off); + return 1; +} +__setup("no_ipi_broadcast=", apic_ipi_shorthand); + +static int __init print_ipi_mode(void) +{ + pr_info("IPI shorthand broadcast: %s\n", + apic_ipi_shorthand_off ? "disabled" : "enabled"); + return 0; +} +late_initcall(print_ipi_mode); + +void apic_smt_update(void) +{ + /* + * Do not switch to broadcast mode if: + * - Disabled on the command line + * - Only a single CPU is online + * - Not all present CPUs have been at least booted once + * + * The latter is important as the local APIC might be in some + * random state and a broadcast might cause havoc. That's + * especially true for NMI broadcasting. + */ + if (apic_ipi_shorthand_off || num_online_cpus() == 1 || + !cpumask_equal(cpu_present_mask, &cpus_booted_once_mask)) { + static_branch_disable(&apic_use_ipi_shorthand); + } else { + static_branch_enable(&apic_use_ipi_shorthand); + } +} + +void apic_send_IPI_allbutself(unsigned int vector) +{ + if (num_online_cpus() < 2) + return; + + if (static_branch_likely(&apic_use_ipi_shorthand)) + apic->send_IPI_allbutself(vector); + else + apic->send_IPI_mask_allbutself(cpu_online_mask, vector); +} + +/* + * Send a 'reschedule' IPI to another CPU. It goes straight through and + * wastes no time serializing anything. Worst case is that we lose a + * reschedule ... + */ +void native_smp_send_reschedule(int cpu) +{ + if (unlikely(cpu_is_offline(cpu))) { + WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); + return; + } + apic->send_IPI(cpu, RESCHEDULE_VECTOR); +} + +void native_send_call_func_single_ipi(int cpu) +{ + apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); +} + +void native_send_call_func_ipi(const struct cpumask *mask) +{ + if (static_branch_likely(&apic_use_ipi_shorthand)) { + unsigned int cpu = smp_processor_id(); + + if (!cpumask_or_equal(mask, cpumask_of(cpu), cpu_online_mask)) + goto sendmask; + + if (cpumask_test_cpu(cpu, mask)) + apic->send_IPI_all(CALL_FUNCTION_VECTOR); + else if (num_online_cpus() > 1) + apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR); + return; + } + +sendmask: + apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); +} + +#endif /* CONFIG_SMP */ + +static inline int __prepare_ICR2(unsigned int mask) +{ + return SET_APIC_DEST_FIELD(mask); +} + +static inline void __xapic_wait_icr_idle(void) +{ + while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +void __default_send_IPI_shortcut(unsigned int shortcut, int vector) +{ + /* + * Subtle. In the case of the 'never do double writes' workaround + * we have to lock out interrupts to be safe. As we don't care + * of the value read we use an atomic rmw access to avoid costly + * cli/sti. Otherwise we use an even cheaper single atomic write + * to the APIC. + */ + unsigned int cfg; + + /* + * Wait for idle. + */ + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + __xapic_wait_icr_idle(); + + /* + * No need to touch the target chip field. Also the destination + * mode is ignored when a shorthand is used. + */ + cfg = __prepare_ICR(shortcut, vector, 0); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) +{ + unsigned long cfg; + + /* + * Wait for idle. + */ + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + __xapic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(mask); + native_apic_mem_write(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + +void default_send_IPI_single_phys(int cpu, int vector) +{ + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, cpu), + vector, APIC_DEST_PHYSICAL); + local_irq_restore(flags); +} + +void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) +{ + unsigned long query_cpu; + unsigned long flags; + + /* + * Hack. The clustered APIC addressing mode doesn't allow us to send + * to an arbitrary mask, so I do a unicast to each CPU instead. + * - mbligh + */ + local_irq_save(flags); + for_each_cpu(query_cpu, mask) { + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, + query_cpu), vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} + +void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, + int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int query_cpu; + unsigned long flags; + + /* See Hack comment above */ + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) { + if (query_cpu == this_cpu) + continue; + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, + query_cpu), vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} + +/* + * Helper function for APICs which insist on cpumasks + */ +void default_send_IPI_single(int cpu, int vector) +{ + apic->send_IPI_mask(cpumask_of(cpu), vector); +} + +void default_send_IPI_allbutself(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector); +} + +void default_send_IPI_all(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector); +} + +void default_send_IPI_self(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_SELF, vector); +} + +#ifdef CONFIG_X86_32 + +void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, + int vector) +{ + unsigned long flags; + unsigned int query_cpu; + + /* + * Hack. The clustered APIC addressing mode doesn't allow us to send + * to an arbitrary mask, so I do a unicasts to each CPU instead. This + * should be modified to do 1 message per cluster ID - mbligh + */ + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) + __default_send_IPI_dest_field( + early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, apic->dest_logical); + local_irq_restore(flags); +} + +void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, + int vector) +{ + unsigned long flags; + unsigned int query_cpu; + unsigned int this_cpu = smp_processor_id(); + + /* See Hack comment above */ + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) { + if (query_cpu == this_cpu) + continue; + __default_send_IPI_dest_field( + early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, apic->dest_logical); + } + local_irq_restore(flags); +} + +/* + * This is only used on smaller machines. + */ +void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + unsigned long flags; + + if (!mask) + return; + + local_irq_save(flags); + WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); + __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + local_irq_restore(flags); +} + +/* must come after the send_IPI functions above for inlining */ +static int convert_apicid_to_cpu(int apic_id) +{ + int i; + + for_each_possible_cpu(i) { + if (per_cpu(x86_cpu_to_apicid, i) == apic_id) + return i; + } + return -1; +} + +int safe_smp_processor_id(void) +{ + int apicid, cpuid; + + if (!boot_cpu_has(X86_FEATURE_APIC)) + return 0; + + apicid = hard_smp_processor_id(); + if (apicid == BAD_APICID) + return 0; + + cpuid = convert_apicid_to_cpu(apicid); + + return cpuid >= 0 ? cpuid : 0; +} +#endif diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h new file mode 100644 index 000000000..a997d8495 --- /dev/null +++ b/arch/x86/kernel/apic/local.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Historical copyright notices: + * + * Copyright 2004 James Cleverdon, IBM. + * (c) 1995 Alan Cox, Building #3 + * (c) 1998-99, 2000 Ingo Molnar + * (c) 2002,2003 Andi Kleen, SuSE Labs. + */ + +#include + +#include +#include + +/* APIC flat 64 */ +void flat_init_apic_ldr(void); + +/* X2APIC */ +int x2apic_apic_id_valid(u32 apicid); +int x2apic_apic_id_registered(void); +void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); +unsigned int x2apic_get_apic_id(unsigned long id); +u32 x2apic_set_apic_id(unsigned int id); +int x2apic_phys_pkg_id(int initial_apicid, int index_msb); +void x2apic_send_IPI_self(int vector); +void __x2apic_send_IPI_shorthand(int vector, u32 which); + +/* IPI */ + +DECLARE_STATIC_KEY_FALSE(apic_use_ipi_shorthand); + +static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector, + unsigned int dest) +{ + unsigned int icr = shortcut | dest; + + switch (vector) { + default: + icr |= APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr |= APIC_DM_NMI; + break; + } + return icr; +} + +void __default_send_IPI_shortcut(unsigned int shortcut, int vector); + +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest); + +void default_send_IPI_single(int cpu, int vector); +void default_send_IPI_single_phys(int cpu, int vector); +void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector); +void default_send_IPI_allbutself(int vector); +void default_send_IPI_all(int vector); +void default_send_IPI_self(int vector); + +#ifdef CONFIG_X86_32 +void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); +void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); +#endif diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c new file mode 100644 index 000000000..6bd98a20f --- /dev/null +++ b/arch/x86/kernel/apic/msi.c @@ -0,0 +1,474 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Support of MSI, HPET and DMAR interrupts. + * + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo + * Moved from arch/x86/kernel/apic/io_apic.c. + * Jiang Liu + * Convert to hierarchical irqdomain + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct irq_domain *x86_pci_msi_default_domain __ro_after_init; + +static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) +{ + msg->address_hi = MSI_ADDR_BASE_HI; + + if (x2apic_enabled()) + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); + + msg->address_lo = + MSI_ADDR_BASE_LO | + ((apic->irq_dest_mode == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL : + MSI_ADDR_DEST_MODE_LOGICAL) | + MSI_ADDR_REDIRECTION_CPU | + MSI_ADDR_DEST_ID(cfg->dest_apicid); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + MSI_DATA_DELIVERY_FIXED | + MSI_DATA_VECTOR(cfg->vector); +} + +void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg); +} + +static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) +{ + struct msi_msg msg[2] = { [1] = { }, }; + + __irq_msi_compose_msg(cfg, msg); + irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); +} + +static int +msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) +{ + struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd); + struct irq_data *parent = irqd->parent_data; + unsigned int cpu; + int ret; + + /* Save the current configuration */ + cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd)); + old_cfg = *cfg; + + /* Allocate a new target vector */ + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) + return ret; + + /* + * For non-maskable and non-remapped MSI interrupts the migration + * to a different destination CPU and a different vector has to be + * done careful to handle the possible stray interrupt which can be + * caused by the non-atomic update of the address/data pair. + * + * Direct update is possible when: + * - The MSI is maskable (remapped MSI does not use this code path)). + * The quirk bit is not set in this case. + * - The new vector is the same as the old vector + * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) + * - The interrupt is not yet started up + * - The new destination CPU is the same as the old destination CPU + */ + if (!irqd_msi_nomask_quirk(irqd) || + cfg->vector == old_cfg.vector || + old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || + !irqd_is_started(irqd) || + cfg->dest_apicid == old_cfg.dest_apicid) { + irq_msi_update_msg(irqd, cfg); + return ret; + } + + /* + * Paranoia: Validate that the interrupt target is the local + * CPU. + */ + if (WARN_ON_ONCE(cpu != smp_processor_id())) { + irq_msi_update_msg(irqd, cfg); + return ret; + } + + /* + * Redirect the interrupt to the new vector on the current CPU + * first. This might cause a spurious interrupt on this vector if + * the device raises an interrupt right between this update and the + * update to the final destination CPU. + * + * If the vector is in use then the installed device handler will + * denote it as spurious which is no harm as this is a rare event + * and interrupt handlers have to cope with spurious interrupts + * anyway. If the vector is unused, then it is marked so it won't + * trigger the 'No irq handler for vector' warning in + * common_interrupt(). + * + * This requires to hold vector lock to prevent concurrent updates to + * the affected vector. + */ + lock_vector_lock(); + + /* + * Mark the new target vector on the local CPU if it is currently + * unused. Reuse the VECTOR_RETRIGGERED state which is also used in + * the CPU hotplug path for a similar purpose. This cannot be + * undone here as the current CPU has interrupts disabled and + * cannot handle the interrupt before the whole set_affinity() + * section is done. In the CPU unplug case, the current CPU is + * about to vanish and will not handle any interrupts anymore. The + * vector is cleaned up when the CPU comes online again. + */ + if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector]))) + this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED); + + /* Redirect it to the new vector on the local CPU temporarily */ + old_cfg.vector = cfg->vector; + irq_msi_update_msg(irqd, &old_cfg); + + /* Now transition it to the target CPU */ + irq_msi_update_msg(irqd, cfg); + + /* + * All interrupts after this point are now targeted at the new + * vector/CPU. + * + * Drop vector lock before testing whether the temporary assignment + * to the local CPU was hit by an interrupt raised in the device, + * because the retrigger function acquires vector lock again. + */ + unlock_vector_lock(); + + /* + * Check whether the transition raced with a device interrupt and + * is pending in the local APICs IRR. It is safe to do this outside + * of vector lock as the irq_desc::lock of this interrupt is still + * held and interrupts are disabled: The check is not accessing the + * underlying vector store. It's just checking the local APIC's + * IRR. + */ + if (lapic_vector_set_in_irr(cfg->vector)) + irq_data_get_irq_chip(irqd)->irq_retrigger(irqd); + + return ret; +} + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip pci_msi_controller = { + .name = "PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, + .irq_ack = irq_chip_ack_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_set_affinity = msi_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, +}; + +int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, + msi_alloc_info_t *arg) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct msi_desc *desc = first_pci_msi_entry(pdev); + + init_irq_alloc_info(arg, NULL); + if (desc->msi_attrib.is_msix) { + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; + } else { + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; + arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + } + + return 0; +} +EXPORT_SYMBOL_GPL(pci_msi_prepare); + +static struct msi_domain_ops pci_msi_domain_ops = { + .msi_prepare = pci_msi_prepare, +}; + +static struct msi_domain_info pci_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; + +struct irq_domain * __init native_create_pci_msi_domain(void) +{ + struct fwnode_handle *fn; + struct irq_domain *d; + + if (disable_apic) + return NULL; + + fn = irq_domain_alloc_named_fwnode("PCI-MSI"); + if (!fn) + return NULL; + + d = pci_msi_create_irq_domain(fn, &pci_msi_domain_info, + x86_vector_domain); + if (!d) { + irq_domain_free_fwnode(fn); + pr_warn("Failed to initialize PCI-MSI irqdomain.\n"); + } else { + d->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; + } + return d; +} + +void __init x86_create_pci_msi_domain(void) +{ + x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain(); +} + +#ifdef CONFIG_IRQ_REMAP +static struct irq_chip pci_msi_ir_controller = { + .name = "IR-PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, + .irq_ack = irq_chip_ack_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, +}; + +static struct msi_domain_info pci_msi_ir_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_ir_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; + +struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, + const char *name, int id) +{ + struct fwnode_handle *fn; + struct irq_domain *d; + + fn = irq_domain_alloc_named_id_fwnode(name, id); + if (!fn) + return NULL; + d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent); + if (!d) + irq_domain_free_fwnode(fn); + return d; +} +#endif + +#ifdef CONFIG_DMAR_TABLE +static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + dmar_msi_write(data->irq, msg); +} + +static struct irq_chip dmar_msi_controller = { + .name = "DMAR-MSI", + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_write_msi_msg = dmar_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, +}; + +static int dmar_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_domain_set_info(domain, virq, arg->devid, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); + + return 0; +} + +static struct msi_domain_ops dmar_msi_domain_ops = { + .msi_init = dmar_msi_init, +}; + +static struct msi_domain_info dmar_msi_domain_info = { + .ops = &dmar_msi_domain_ops, + .chip = &dmar_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, +}; + +static struct irq_domain *dmar_get_irq_domain(void) +{ + static struct irq_domain *dmar_domain; + static DEFINE_MUTEX(dmar_lock); + struct fwnode_handle *fn; + + mutex_lock(&dmar_lock); + if (dmar_domain) + goto out; + + fn = irq_domain_alloc_named_fwnode("DMAR-MSI"); + if (fn) { + dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info, + x86_vector_domain); + if (!dmar_domain) + irq_domain_free_fwnode(fn); + } +out: + mutex_unlock(&dmar_lock); + return dmar_domain; +} + +int dmar_alloc_hwirq(int id, int node, void *arg) +{ + struct irq_domain *domain = dmar_get_irq_domain(); + struct irq_alloc_info info; + + if (!domain) + return -1; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_DMAR; + info.devid = id; + info.hwirq = id; + info.data = arg; + + return irq_domain_alloc_irqs(domain, 1, node, &info); +} + +void dmar_free_hwirq(int irq) +{ + irq_domain_free_irqs(irq, 1); +} +#endif + +/* + * MSI message composition + */ +#ifdef CONFIG_HPET_TIMER +static inline int hpet_dev_id(struct irq_domain *domain) +{ + struct msi_domain_info *info = msi_get_domain_info(domain); + + return (int)(long)info->data; +} + +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + hpet_msi_write(irq_data_get_irq_handler_data(data), msg); +} + +static struct irq_chip hpet_msi_controller __ro_after_init = { + .name = "HPET-MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_write_msi_msg = hpet_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP, +}; + +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); + + return 0; +} + +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) +{ + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); +} + +static struct msi_domain_ops hpet_msi_domain_ops = { + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + .chip = &hpet_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, +}; + +struct irq_domain *hpet_create_irq_domain(int hpet_id) +{ + struct msi_domain_info *domain_info; + struct irq_domain *parent, *d; + struct irq_alloc_info info; + struct fwnode_handle *fn; + + if (x86_vector_domain == NULL) + return NULL; + + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = (void *)(long)hpet_id; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; + info.devid = hpet_id; + parent = irq_remapping_get_irq_domain(&info); + if (parent == NULL) + parent = x86_vector_domain; + else + hpet_msi_controller.name = "IR-HPET-MSI"; + + fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, + hpet_id); + if (!fn) { + kfree(domain_info); + return NULL; + } + + d = msi_create_irq_domain(fn, domain_info, parent); + if (!d) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + } + return d; +} + +int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, + int dev_num) +{ + struct irq_alloc_info info; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.data = hc; + info.devid = hpet_dev_id(domain); + info.hwirq = dev_num; + + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); +} +#endif diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c new file mode 100644 index 000000000..67b6f7c04 --- /dev/null +++ b/arch/x86/kernel/apic/probe_32.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Default generic APIC driver. This handles up to 8 CPUs. + * + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Generic x86 APIC driver probe layer. + */ +#include +#include +#include + +#include +#include +#include + +#include "local.h" + +static int default_x86_32_early_logical_apicid(int cpu) +{ + return 1 << cpu; +} + +static void setup_apic_flat_routing(void) +{ +#ifdef CONFIG_X86_IO_APIC + printk(KERN_INFO + "Enabling APIC mode: Flat. Using %d I/O APICs\n", + nr_ioapics); +#endif +} + +static int default_apic_id_registered(void) +{ + return physid_isset(read_apic_id(), phys_cpu_present_map); +} + +/* + * Set up the logical destination ID. Intel recommends to set DFR, LDR and + * TPR before enabling an APIC. See e.g. "AP-388 82489DX User's Manual" + * (Intel document number 292116). + */ +static void default_init_apic_ldr(void) +{ + unsigned long val; + + apic_write(APIC_DFR, APIC_DFR_VALUE); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); + apic_write(APIC_LDR, val); +} + +static int default_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + +/* should be called last. */ +static int probe_default(void) +{ + return 1; +} + +static struct apic apic_default __ro_after_init = { + + .name = "default", + .probe = probe_default, + .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, + .apic_id_registered = default_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + /* logical delivery broadcast to all CPUs: */ + .irq_dest_mode = 1, + + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = default_check_apicid_used, + + .init_apic_ldr = default_init_apic_ldr, + + .ioapic_phys_id_map = default_ioapic_phys_id_map, + .setup_apic_routing = setup_apic_flat_routing, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = physid_set_mask_of_physid, + .check_phys_apicid_present = default_check_phys_apicid_present, + .phys_pkg_id = default_phys_pkg_id, + + .get_apic_id = default_get_apic_id, + .set_apic_id = NULL, + + .calc_dest_apicid = apic_flat_calc_apicid, + + .send_IPI = default_send_IPI_single, + .send_IPI_mask = default_send_IPI_mask_logical, + .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, + .send_IPI_allbutself = default_send_IPI_allbutself, + .send_IPI_all = default_send_IPI_all, + .send_IPI_self = default_send_IPI_self, + + .inquire_remote_apic = default_inquire_remote_apic, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = native_apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, +}; + +apic_driver(apic_default); + +struct apic *apic __ro_after_init = &apic_default; +EXPORT_SYMBOL_GPL(apic); + +static int cmdline_apic __initdata; +static int __init parse_apic(char *arg) +{ + struct apic **drv; + + if (!arg) + return -EINVAL; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!strcmp((*drv)->name, arg)) { + apic = *drv; + cmdline_apic = 1; + return 0; + } + } + + /* Parsed again by __setup for debug/verbose */ + return 0; +} +early_param("apic", parse_apic); + +void __init default_setup_apic_routing(void) +{ + int version = boot_cpu_apic_version; + + if (num_possible_cpus() > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* P4 and above */ + fallthrough; + case X86_VENDOR_HYGON: + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + +#ifdef CONFIG_X86_BIGSMP + /* + * This is used to switch to bigsmp mode when + * - There is no apic= option specified by the user + * - generic_apic_probe() has chosen apic_default as the sub_arch + * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support + */ + + if (!cmdline_apic && apic == &apic_default) + generic_bigsmp_probe(); +#endif + + if (apic->setup_apic_routing) + apic->setup_apic_routing(); +} + +void __init generic_apic_probe(void) +{ + if (!cmdline_apic) { + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->probe()) { + apic = *drv; + break; + } + } + /* Not visible without early console */ + if (drv == __apicdrivers_end) + panic("Didn't find an APIC driver"); + } + printk(KERN_INFO "Using APIC driver %s\n", apic->name); +} + +/* This function can switch the APIC even after the initial ->probe() */ +int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!(*drv)->acpi_madt_oem_check) + continue; + if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) + continue; + + if (!cmdline_apic) { + apic = *drv; + printk(KERN_INFO "Switched to APIC driver `%s'.\n", + apic->name); + } + return 1; + } + return 0; +} diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c new file mode 100644 index 000000000..c46720f18 --- /dev/null +++ b/arch/x86/kernel/apic/probe_64.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2004 James Cleverdon, IBM. + * + * Generic APIC sub-arch probe layer. + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include +#include + +#include "local.h" + +/* + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. + */ +void __init default_setup_apic_routing(void) +{ + struct apic **drv; + + enable_IR_x2apic(); + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->probe && (*drv)->probe()) { + if (apic != *drv) { + apic = *drv; + pr_info("Switched APIC routing to %s.\n", + apic->name); + } + break; + } + } +} + +int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) { + if (apic != *drv) { + apic = *drv; + pr_info("Setting APIC routing to %s.\n", + apic->name); + } + return 1; + } + } + return 0; +} diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c new file mode 100644 index 000000000..bd557e9f5 --- /dev/null +++ b/arch/x86/kernel/apic/vector.c @@ -0,0 +1,1278 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Local APIC related interfaces to support IOAPIC, MSI, etc. + * + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo + * Moved from arch/x86/kernel/apic/io_apic.c. + * Jiang Liu + * Enable support of hierarchical irqdomains + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct apic_chip_data { + struct irq_cfg hw_irq_cfg; + unsigned int vector; + unsigned int prev_vector; + unsigned int cpu; + unsigned int prev_cpu; + unsigned int irq; + struct hlist_node clist; + unsigned int move_in_progress : 1, + is_managed : 1, + can_reserve : 1, + has_reserved : 1; +}; + +struct irq_domain *x86_vector_domain; +EXPORT_SYMBOL_GPL(x86_vector_domain); +static DEFINE_RAW_SPINLOCK(vector_lock); +static cpumask_var_t vector_searchmask; +static struct irq_chip lapic_controller; +static struct irq_matrix *vector_matrix; +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct hlist_head, cleanup_list); +#endif + +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + raw_spin_lock(&vector_lock); +} + +void unlock_vector_lock(void) +{ + raw_spin_unlock(&vector_lock); +} + +void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask) +{ + memset(info, 0, sizeof(*info)); + info->mask = mask; +} + +void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) +{ + if (src) + *dst = *src; + else + memset(dst, 0, sizeof(*dst)); +} + +static struct apic_chip_data *apic_chip_data(struct irq_data *irqd) +{ + if (!irqd) + return NULL; + + while (irqd->parent_data) + irqd = irqd->parent_data; + + return irqd->chip_data; +} + +struct irq_cfg *irqd_cfg(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + + return apicd ? &apicd->hw_irq_cfg : NULL; +} +EXPORT_SYMBOL_GPL(irqd_cfg); + +struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irqd_cfg(irq_get_irq_data(irq)); +} + +static struct apic_chip_data *alloc_apic_chip_data(int node) +{ + struct apic_chip_data *apicd; + + apicd = kzalloc_node(sizeof(*apicd), GFP_KERNEL, node); + if (apicd) + INIT_HLIST_NODE(&apicd->clist); + return apicd; +} + +static void free_apic_chip_data(struct apic_chip_data *apicd) +{ + kfree(apicd); +} + +static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector, + unsigned int cpu) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + + lockdep_assert_held(&vector_lock); + + apicd->hw_irq_cfg.vector = vector; + apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu); + irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); + trace_vector_config(irqd->irq, vector, cpu, + apicd->hw_irq_cfg.dest_apicid); +} + +static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, + unsigned int newcpu) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + struct irq_desc *desc = irq_data_to_desc(irqd); + bool managed = irqd_affinity_is_managed(irqd); + + lockdep_assert_held(&vector_lock); + + trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector, + apicd->cpu); + + /* + * If there is no vector associated or if the associated vector is + * the shutdown vector, which is associated to make PCI/MSI + * shutdown mode work, then there is nothing to release. Clear out + * prev_vector for this and the offlined target case. + */ + apicd->prev_vector = 0; + if (!apicd->vector || apicd->vector == MANAGED_IRQ_SHUTDOWN_VECTOR) + goto setnew; + /* + * If the target CPU of the previous vector is online, then mark + * the vector as move in progress and store it for cleanup when the + * first interrupt on the new vector arrives. If the target CPU is + * offline then the regular release mechanism via the cleanup + * vector is not possible and the vector can be immediately freed + * in the underlying matrix allocator. + */ + if (cpu_online(apicd->cpu)) { + apicd->move_in_progress = true; + apicd->prev_vector = apicd->vector; + apicd->prev_cpu = apicd->cpu; + WARN_ON_ONCE(apicd->cpu == newcpu); + } else { + irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, + managed); + } + +setnew: + apicd->vector = newvec; + apicd->cpu = newcpu; + BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec])); + per_cpu(vector_irq, newcpu)[newvec] = desc; +} + +static void vector_assign_managed_shutdown(struct irq_data *irqd) +{ + unsigned int cpu = cpumask_first(cpu_online_mask); + + apic_update_irq_cfg(irqd, MANAGED_IRQ_SHUTDOWN_VECTOR, cpu); +} + +static int reserve_managed_vector(struct irq_data *irqd) +{ + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&vector_lock, flags); + apicd->is_managed = true; + ret = irq_matrix_reserve_managed(vector_matrix, affmsk); + raw_spin_unlock_irqrestore(&vector_lock, flags); + trace_vector_reserve_managed(irqd->irq, ret); + return ret; +} + +static void reserve_irq_vector_locked(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + + irq_matrix_reserve(vector_matrix); + apicd->can_reserve = true; + apicd->has_reserved = true; + irqd_set_can_reserve(irqd); + trace_vector_reserve(irqd->irq, 0); + vector_assign_managed_shutdown(irqd); +} + +static int reserve_irq_vector(struct irq_data *irqd) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&vector_lock, flags); + reserve_irq_vector_locked(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return 0; +} + +static int +assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + bool resvd = apicd->has_reserved; + unsigned int cpu = apicd->cpu; + int vector = apicd->vector; + + lockdep_assert_held(&vector_lock); + + /* + * If the current target CPU is online and in the new requested + * affinity mask, there is no point in moving the interrupt from + * one CPU to another. + */ + if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest)) + return 0; + + /* + * Careful here. @apicd might either have move_in_progress set or + * be enqueued for cleanup. Assigning a new vector would either + * leave a stale vector on some CPU around or in case of a pending + * cleanup corrupt the hlist. + */ + if (apicd->move_in_progress || !hlist_unhashed(&apicd->clist)) + return -EBUSY; + + vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu); + trace_vector_alloc(irqd->irq, vector, resvd, vector); + if (vector < 0) + return vector; + apic_update_vector(irqd, vector, cpu); + apic_update_irq_cfg(irqd, vector, cpu); + + return 0; +} + +static int assign_irq_vector(struct irq_data *irqd, const struct cpumask *dest) +{ + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&vector_lock, flags); + cpumask_and(vector_searchmask, dest, cpu_online_mask); + ret = assign_vector_locked(irqd, vector_searchmask); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return ret; +} + +static int assign_irq_vector_any_locked(struct irq_data *irqd) +{ + /* Get the affinity mask - either irq_default_affinity or (user) set */ + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + int node = irq_data_get_node(irqd); + + if (node != NUMA_NO_NODE) { + /* Try the intersection of @affmsk and node mask */ + cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk); + if (!assign_vector_locked(irqd, vector_searchmask)) + return 0; + } + + /* Try the full affinity mask */ + cpumask_and(vector_searchmask, affmsk, cpu_online_mask); + if (!assign_vector_locked(irqd, vector_searchmask)) + return 0; + + if (node != NUMA_NO_NODE) { + /* Try the node mask */ + if (!assign_vector_locked(irqd, cpumask_of_node(node))) + return 0; + } + + /* Try the full online mask */ + return assign_vector_locked(irqd, cpu_online_mask); +} + +static int +assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info) +{ + if (irqd_affinity_is_managed(irqd)) + return reserve_managed_vector(irqd); + if (info->mask) + return assign_irq_vector(irqd, info->mask); + /* + * Make only a global reservation with no guarantee. A real vector + * is associated at activation time. + */ + return reserve_irq_vector(irqd); +} + +static int +assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) +{ + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + int vector, cpu; + + cpumask_and(vector_searchmask, dest, affmsk); + + /* set_affinity might call here for nothing */ + if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask)) + return 0; + vector = irq_matrix_alloc_managed(vector_matrix, vector_searchmask, + &cpu); + trace_vector_alloc_managed(irqd->irq, vector, vector); + if (vector < 0) + return vector; + apic_update_vector(irqd, vector, cpu); + apic_update_irq_cfg(irqd, vector, cpu); + return 0; +} + +static void clear_irq_vector(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + bool managed = irqd_affinity_is_managed(irqd); + unsigned int vector = apicd->vector; + + lockdep_assert_held(&vector_lock); + + if (!vector) + return; + + trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector, + apicd->prev_cpu); + + per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN; + irq_matrix_free(vector_matrix, apicd->cpu, vector, managed); + apicd->vector = 0; + + /* Clean up move in progress */ + vector = apicd->prev_vector; + if (!vector) + return; + + per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN; + irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed); + apicd->prev_vector = 0; + apicd->move_in_progress = 0; + hlist_del_init(&apicd->clist); +} + +static void x86_vector_deactivate(struct irq_domain *dom, struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + + trace_vector_deactivate(irqd->irq, apicd->is_managed, + apicd->can_reserve, false); + + /* Regular fixed assigned interrupt */ + if (!apicd->is_managed && !apicd->can_reserve) + return; + /* If the interrupt has a global reservation, nothing to do */ + if (apicd->has_reserved) + return; + + raw_spin_lock_irqsave(&vector_lock, flags); + clear_irq_vector(irqd); + if (apicd->can_reserve) + reserve_irq_vector_locked(irqd); + else + vector_assign_managed_shutdown(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); +} + +static int activate_reserved(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + int ret; + + ret = assign_irq_vector_any_locked(irqd); + if (!ret) { + apicd->has_reserved = false; + /* + * Core might have disabled reservation mode after + * allocating the irq descriptor. Ideally this should + * happen before allocation time, but that would require + * completely convoluted ways of transporting that + * information. + */ + if (!irqd_can_reserve(irqd)) + apicd->can_reserve = false; + } + + /* + * Check to ensure that the effective affinity mask is a subset + * the user supplied affinity mask, and warn the user if it is not + */ + if (!cpumask_subset(irq_data_get_effective_affinity_mask(irqd), + irq_data_get_affinity_mask(irqd))) { + pr_warn("irq %u: Affinity broken due to vector space exhaustion.\n", + irqd->irq); + } + + return ret; +} + +static int activate_managed(struct irq_data *irqd) +{ + const struct cpumask *dest = irq_data_get_affinity_mask(irqd); + int ret; + + cpumask_and(vector_searchmask, dest, cpu_online_mask); + if (WARN_ON_ONCE(cpumask_empty(vector_searchmask))) { + /* Something in the core code broke! Survive gracefully */ + pr_err("Managed startup for irq %u, but no CPU\n", irqd->irq); + return -EINVAL; + } + + ret = assign_managed_vector(irqd, vector_searchmask); + /* + * This should not happen. The vector reservation got buggered. Handle + * it gracefully. + */ + if (WARN_ON_ONCE(ret < 0)) { + pr_err("Managed startup irq %u, no vector available\n", + irqd->irq); + } + return ret; +} + +static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, + bool reserve) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + int ret = 0; + + trace_vector_activate(irqd->irq, apicd->is_managed, + apicd->can_reserve, reserve); + + raw_spin_lock_irqsave(&vector_lock, flags); + if (!apicd->can_reserve && !apicd->is_managed) + assign_irq_vector_any_locked(irqd); + else if (reserve || irqd_is_managed_and_shutdown(irqd)) + vector_assign_managed_shutdown(irqd); + else if (apicd->is_managed) + ret = activate_managed(irqd); + else if (apicd->has_reserved) + ret = activate_reserved(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return ret; +} + +static void vector_free_reserved_and_managed(struct irq_data *irqd) +{ + const struct cpumask *dest = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + + trace_vector_teardown(irqd->irq, apicd->is_managed, + apicd->has_reserved); + + if (apicd->has_reserved) + irq_matrix_remove_reserved(vector_matrix); + if (apicd->is_managed) + irq_matrix_remove_managed(vector_matrix, dest); +} + +static void x86_vector_free_irqs(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct apic_chip_data *apicd; + struct irq_data *irqd; + unsigned long flags; + int i; + + for (i = 0; i < nr_irqs; i++) { + irqd = irq_domain_get_irq_data(x86_vector_domain, virq + i); + if (irqd && irqd->chip_data) { + raw_spin_lock_irqsave(&vector_lock, flags); + clear_irq_vector(irqd); + vector_free_reserved_and_managed(irqd); + apicd = irqd->chip_data; + irq_domain_reset_irq_data(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); + free_apic_chip_data(apicd); + } + } +} + +static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd, + struct apic_chip_data *apicd) +{ + unsigned long flags; + bool realloc = false; + + apicd->vector = ISA_IRQ_VECTOR(virq); + apicd->cpu = 0; + + raw_spin_lock_irqsave(&vector_lock, flags); + /* + * If the interrupt is activated, then it must stay at this vector + * position. That's usually the timer interrupt (0). + */ + if (irqd_is_activated(irqd)) { + trace_vector_setup(virq, true, 0); + apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); + } else { + /* Release the vector */ + apicd->can_reserve = true; + irqd_set_can_reserve(irqd); + clear_irq_vector(irqd); + realloc = true; + } + raw_spin_unlock_irqrestore(&vector_lock, flags); + return realloc; +} + +static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + struct apic_chip_data *apicd; + struct irq_data *irqd; + int i, err, node; + + if (disable_apic) + return -ENXIO; + + /* Currently vector allocator can't guarantee contiguous allocations */ + if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) + return -ENOSYS; + + for (i = 0; i < nr_irqs; i++) { + irqd = irq_domain_get_irq_data(domain, virq + i); + BUG_ON(!irqd); + node = irq_data_get_node(irqd); + WARN_ON_ONCE(irqd->chip_data); + apicd = alloc_apic_chip_data(node); + if (!apicd) { + err = -ENOMEM; + goto error; + } + + apicd->irq = virq + i; + irqd->chip = &lapic_controller; + irqd->chip_data = apicd; + irqd->hwirq = virq + i; + irqd_set_single_target(irqd); + /* + * Prevent that any of these interrupts is invoked in + * non interrupt context via e.g. generic_handle_irq() + * as that can corrupt the affinity move state. + */ + irqd_set_handle_enforce_irqctx(irqd); + + /* Don't invoke affinity setter on deactivated interrupts */ + irqd_set_affinity_on_activate(irqd); + + /* + * Legacy vectors are already assigned when the IOAPIC + * takes them over. They stay on the same vector. This is + * required for check_timer() to work correctly as it might + * switch back to legacy mode. Only update the hardware + * config. + */ + if (info->flags & X86_IRQ_ALLOC_LEGACY) { + if (!vector_configure_legacy(virq + i, irqd, apicd)) + continue; + } + + err = assign_irq_vector_policy(irqd, info); + trace_vector_setup(virq + i, false, err); + if (err) { + irqd->chip_data = NULL; + free_apic_chip_data(apicd); + goto error; + } + } + + return 0; + +error: + x86_vector_free_irqs(domain, virq, i); + return err; +} + +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind) +{ + struct apic_chip_data apicd; + unsigned long flags; + int irq; + + if (!irqd) { + irq_matrix_debug_show(m, vector_matrix, ind); + return; + } + + irq = irqd->irq; + if (irq < nr_legacy_irqs() && !test_bit(irq, &io_apic_irqs)) { + seq_printf(m, "%*sVector: %5d\n", ind, "", ISA_IRQ_VECTOR(irq)); + seq_printf(m, "%*sTarget: Legacy PIC all CPUs\n", ind, ""); + return; + } + + if (!irqd->chip_data) { + seq_printf(m, "%*sVector: Not assigned\n", ind, ""); + return; + } + + raw_spin_lock_irqsave(&vector_lock, flags); + memcpy(&apicd, irqd->chip_data, sizeof(apicd)); + raw_spin_unlock_irqrestore(&vector_lock, flags); + + seq_printf(m, "%*sVector: %5u\n", ind, "", apicd.vector); + seq_printf(m, "%*sTarget: %5u\n", ind, "", apicd.cpu); + if (apicd.prev_vector) { + seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", apicd.prev_vector); + seq_printf(m, "%*sPrevious target: %5u\n", ind, "", apicd.prev_cpu); + } + seq_printf(m, "%*smove_in_progress: %u\n", ind, "", apicd.move_in_progress ? 1 : 0); + seq_printf(m, "%*sis_managed: %u\n", ind, "", apicd.is_managed ? 1 : 0); + seq_printf(m, "%*scan_reserve: %u\n", ind, "", apicd.can_reserve ? 1 : 0); + seq_printf(m, "%*shas_reserved: %u\n", ind, "", apicd.has_reserved ? 1 : 0); + seq_printf(m, "%*scleanup_pending: %u\n", ind, "", !hlist_unhashed(&apicd.clist)); +} +#endif + +static const struct irq_domain_ops x86_vector_domain_ops = { + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, + .activate = x86_vector_activate, + .deactivate = x86_vector_deactivate, +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + .debug_show = x86_vector_debug_show, +#endif +}; + +int __init arch_probe_nr_irqs(void) +{ + int nr; + + if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) + nr_irqs = NR_VECTORS * nr_cpu_ids; + + nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; +#if defined(CONFIG_PCI_MSI) + /* + * for MSI and HT dyn irq + */ + if (gsi_top <= NR_IRQS_LEGACY) + nr += 8 * nr_cpu_ids; + else + nr += gsi_top * 16; +#endif + if (nr < nr_irqs) + nr_irqs = nr; + + /* + * We don't know if PIC is present at this point so we need to do + * probe() to get the right number of legacy IRQs. + */ + return legacy_pic->probe(); +} + +void lapic_assign_legacy_vector(unsigned int irq, bool replace) +{ + /* + * Use assign system here so it wont get accounted as allocated + * and moveable in the cpu hotplug check and it prevents managed + * irq reservation from touching it. + */ + irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace); +} + +void __init lapic_update_legacy_vectors(void) +{ + unsigned int i; + + if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0) + return; + + /* + * If the IO/APIC is disabled via config, kernel command line or + * lack of enumeration then all legacy interrupts are routed + * through the PIC. Make sure that they are marked as legacy + * vectors. PIC_CASCADE_IRQ has already been marked in + * lapic_assign_system_vectors(). + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + if (i != PIC_CASCADE_IR) + lapic_assign_legacy_vector(i, true); + } +} + +void __init lapic_assign_system_vectors(void) +{ + unsigned int i, vector = 0; + + for_each_set_bit_from(vector, system_vectors, NR_VECTORS) + irq_matrix_assign_system(vector_matrix, vector, false); + + if (nr_legacy_irqs() > 1) + lapic_assign_legacy_vector(PIC_CASCADE_IR, false); + + /* System vectors are reserved, online it */ + irq_matrix_online(vector_matrix); + + /* Mark the preallocated legacy interrupts */ + for (i = 0; i < nr_legacy_irqs(); i++) { + if (i != PIC_CASCADE_IR) + irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i)); + } +} + +int __init arch_early_irq_init(void) +{ + struct fwnode_handle *fn; + + fn = irq_domain_alloc_named_fwnode("VECTOR"); + BUG_ON(!fn); + x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, + NULL); + BUG_ON(x86_vector_domain == NULL); + irq_set_default_host(x86_vector_domain); + + BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); + + /* + * Allocate the vector matrix allocator data structure and limit the + * search area. + */ + vector_matrix = irq_alloc_matrix(NR_VECTORS, FIRST_EXTERNAL_VECTOR, + FIRST_SYSTEM_VECTOR); + BUG_ON(!vector_matrix); + + return arch_early_ioapic_init(); +} + +#ifdef CONFIG_SMP + +static struct irq_desc *__setup_vector_irq(int vector) +{ + int isairq = vector - ISA_IRQ_VECTOR(0); + + /* Check whether the irq is in the legacy space */ + if (isairq < 0 || isairq >= nr_legacy_irqs()) + return VECTOR_UNUSED; + /* Check whether the irq is handled by the IOAPIC */ + if (test_bit(isairq, &io_apic_irqs)) + return VECTOR_UNUSED; + return irq_to_desc(isairq); +} + +/* Online the local APIC infrastructure and initialize the vectors */ +void lapic_online(void) +{ + unsigned int vector; + + lockdep_assert_held(&vector_lock); + + /* Online the vector matrix array for this CPU */ + irq_matrix_online(vector_matrix); + + /* + * The interrupt affinity logic never targets interrupts to offline + * CPUs. The exception are the legacy PIC interrupts. In general + * they are only targeted to CPU0, but depending on the platform + * they can be distributed to any online CPU in hardware. The + * kernel has no influence on that. So all active legacy vectors + * must be installed on all CPUs. All non legacy interrupts can be + * cleared. + */ + for (vector = 0; vector < NR_VECTORS; vector++) + this_cpu_write(vector_irq[vector], __setup_vector_irq(vector)); +} + +void lapic_offline(void) +{ + lock_vector_lock(); + irq_matrix_offline(vector_matrix); + unlock_vector_lock(); +} + +static int apic_set_affinity(struct irq_data *irqd, + const struct cpumask *dest, bool force) +{ + int err; + + if (WARN_ON_ONCE(!irqd_is_activated(irqd))) + return -EIO; + + raw_spin_lock(&vector_lock); + cpumask_and(vector_searchmask, dest, cpu_online_mask); + if (irqd_affinity_is_managed(irqd)) + err = assign_managed_vector(irqd, vector_searchmask); + else + err = assign_vector_locked(irqd, vector_searchmask); + raw_spin_unlock(&vector_lock); + return err ? err : IRQ_SET_MASK_OK; +} + +#else +# define apic_set_affinity NULL +#endif + +static int apic_retrigger_irq(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + + raw_spin_lock_irqsave(&vector_lock, flags); + apic->send_IPI(apicd->cpu, apicd->vector); + raw_spin_unlock_irqrestore(&vector_lock, flags); + + return 1; +} + +void apic_ack_irq(struct irq_data *irqd) +{ + irq_move_irq(irqd); + ack_APIC_irq(); +} + +void apic_ack_edge(struct irq_data *irqd) +{ + irq_complete_move(irqd_cfg(irqd)); + apic_ack_irq(irqd); +} + +static struct irq_chip lapic_controller = { + .name = "APIC", + .irq_ack = apic_ack_edge, + .irq_set_affinity = apic_set_affinity, + .irq_compose_msi_msg = x86_vector_msi_compose_msg, + .irq_retrigger = apic_retrigger_irq, +}; + +#ifdef CONFIG_SMP + +static void free_moved_vector(struct apic_chip_data *apicd) +{ + unsigned int vector = apicd->prev_vector; + unsigned int cpu = apicd->prev_cpu; + bool managed = apicd->is_managed; + + /* + * Managed interrupts are usually not migrated away + * from an online CPU, but CPU isolation 'managed_irq' + * can make that happen. + * 1) Activation does not take the isolation into account + * to keep the code simple + * 2) Migration away from an isolated CPU can happen when + * a non-isolated CPU which is in the calculated + * affinity mask comes online. + */ + trace_vector_free_moved(apicd->irq, cpu, vector, managed); + irq_matrix_free(vector_matrix, cpu, vector, managed); + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; + hlist_del_init(&apicd->clist); + apicd->prev_vector = 0; + apicd->move_in_progress = 0; +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup) +{ + struct hlist_head *clhead = this_cpu_ptr(&cleanup_list); + struct apic_chip_data *apicd; + struct hlist_node *tmp; + + ack_APIC_irq(); + /* Prevent vectors vanishing under us */ + raw_spin_lock(&vector_lock); + + hlist_for_each_entry_safe(apicd, tmp, clhead, clist) { + unsigned int irr, vector = apicd->prev_vector; + + /* + * Paranoia: Check if the vector that needs to be cleaned + * up is registered at the APICs IRR. If so, then this is + * not the best time to clean it up. Clean it up in the + * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR + * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest + * priority external vector, so on return from this + * interrupt the device interrupt will happen first. + */ + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1U << (vector % 32))) { + apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + continue; + } + free_moved_vector(apicd); + } + + raw_spin_unlock(&vector_lock); +} + +static void __send_cleanup_vector(struct apic_chip_data *apicd) +{ + unsigned int cpu; + + raw_spin_lock(&vector_lock); + apicd->move_in_progress = 0; + cpu = apicd->prev_cpu; + if (cpu_online(cpu)) { + hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu)); + apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR); + } else { + apicd->prev_vector = 0; + } + raw_spin_unlock(&vector_lock); +} + +void send_cleanup_vector(struct irq_cfg *cfg) +{ + struct apic_chip_data *apicd; + + apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); + if (apicd->move_in_progress) + __send_cleanup_vector(apicd); +} + +void irq_complete_move(struct irq_cfg *cfg) +{ + struct apic_chip_data *apicd; + + apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); + if (likely(!apicd->move_in_progress)) + return; + + /* + * If the interrupt arrived on the new target CPU, cleanup the + * vector on the old target CPU. A vector check is not required + * because an interrupt can never move from one vector to another + * on the same CPU. + */ + if (apicd->cpu == smp_processor_id()) + __send_cleanup_vector(apicd); +} + +/* + * Called from fixup_irqs() with @desc->lock held and interrupts disabled. + */ +void irq_force_complete_move(struct irq_desc *desc) +{ + struct apic_chip_data *apicd; + struct irq_data *irqd; + unsigned int vector; + + /* + * The function is called for all descriptors regardless of which + * irqdomain they belong to. For example if an IRQ is provided by + * an irq_chip as part of a GPIO driver, the chip data for that + * descriptor is specific to the irq_chip in question. + * + * Check first that the chip_data is what we expect + * (apic_chip_data) before touching it any further. + */ + irqd = irq_domain_get_irq_data(x86_vector_domain, + irq_desc_get_irq(desc)); + if (!irqd) + return; + + raw_spin_lock(&vector_lock); + apicd = apic_chip_data(irqd); + if (!apicd) + goto unlock; + + /* + * If prev_vector is empty, no action required. + */ + vector = apicd->prev_vector; + if (!vector) + goto unlock; + + /* + * This is tricky. If the cleanup of the old vector has not been + * done yet, then the following setaffinity call will fail with + * -EBUSY. This can leave the interrupt in a stale state. + * + * All CPUs are stuck in stop machine with interrupts disabled so + * calling __irq_complete_move() would be completely pointless. + * + * 1) The interrupt is in move_in_progress state. That means that we + * have not seen an interrupt since the io_apic was reprogrammed to + * the new vector. + * + * 2) The interrupt has fired on the new vector, but the cleanup IPIs + * have not been processed yet. + */ + if (apicd->move_in_progress) { + /* + * In theory there is a race: + * + * set_ioapic(new_vector) <-- Interrupt is raised before update + * is effective, i.e. it's raised on + * the old vector. + * + * So if the target cpu cannot handle that interrupt before + * the old vector is cleaned up, we get a spurious interrupt + * and in the worst case the ioapic irq line becomes stale. + * + * But in case of cpu hotplug this should be a non issue + * because if the affinity update happens right before all + * cpus rendevouz in stop machine, there is no way that the + * interrupt can be blocked on the target cpu because all cpus + * loops first with interrupts enabled in stop machine, so the + * old vector is not yet cleaned up when the interrupt fires. + * + * So the only way to run into this issue is if the delivery + * of the interrupt on the apic/system bus would be delayed + * beyond the point where the target cpu disables interrupts + * in stop machine. I doubt that it can happen, but at least + * there is a theroretical chance. Virtualization might be + * able to expose this, but AFAICT the IOAPIC emulation is not + * as stupid as the real hardware. + * + * Anyway, there is nothing we can do about that at this point + * w/o refactoring the whole fixup_irq() business completely. + * We print at least the irq number and the old vector number, + * so we have the necessary information when a problem in that + * area arises. + */ + pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", + irqd->irq, vector); + } + free_moved_vector(apicd); +unlock: + raw_spin_unlock(&vector_lock); +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Note, this is not accurate accounting, but at least good enough to + * prevent that the actual interrupt move will run out of vectors. + */ +int lapic_can_unplug_cpu(void) +{ + unsigned int rsvd, avl, tomove, cpu = smp_processor_id(); + int ret = 0; + + raw_spin_lock(&vector_lock); + tomove = irq_matrix_allocated(vector_matrix); + avl = irq_matrix_available(vector_matrix, true); + if (avl < tomove) { + pr_warn("CPU %u has %u vectors, %u available. Cannot disable CPU\n", + cpu, tomove, avl); + ret = -ENOSPC; + goto out; + } + rsvd = irq_matrix_reserved(vector_matrix); + if (avl < rsvd) { + pr_warn("Reserved vectors %u > available %u. IRQ request may fail\n", + rsvd, avl); + } +out: + raw_spin_unlock(&vector_lock); + return ret; +} +#endif /* HOTPLUG_CPU */ +#endif /* SMP */ + +static void __init print_APIC_field(int base) +{ + int i; + + printk(KERN_DEBUG); + + for (i = 0; i < 8; i++) + pr_cont("%08x", apic_read(base + i*0x10)); + + pr_cont("\n"); +} + +static void __init print_local_APIC(void *dummy) +{ + unsigned int i, v, ver, maxlvt; + u64 icr; + + pr_debug("printing local APIC contents on CPU#%d/%d:\n", + smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); + pr_info("... APIC ID: %08x (%01x)\n", v, read_apic_id()); + v = apic_read(APIC_LVR); + pr_info("... APIC VERSION: %08x\n", v); + ver = GET_APIC_VERSION(v); + maxlvt = lapic_get_maxlvt(); + + v = apic_read(APIC_TASKPRI); + pr_debug("... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); + + /* !82489DX */ + if (APIC_INTEGRATED(ver)) { + if (!APIC_XAPIC(ver)) { + v = apic_read(APIC_ARBPRI); + pr_debug("... APIC ARBPRI: %08x (%02x)\n", + v, v & APIC_ARBPRI_MASK); + } + v = apic_read(APIC_PROCPRI); + pr_debug("... APIC PROCPRI: %08x\n", v); + } + + /* + * Remote read supported only in the 82489DX and local APIC for + * Pentium processors. + */ + if (!APIC_INTEGRATED(ver) || maxlvt == 3) { + v = apic_read(APIC_RRR); + pr_debug("... APIC RRR: %08x\n", v); + } + + v = apic_read(APIC_LDR); + pr_debug("... APIC LDR: %08x\n", v); + if (!x2apic_enabled()) { + v = apic_read(APIC_DFR); + pr_debug("... APIC DFR: %08x\n", v); + } + v = apic_read(APIC_SPIV); + pr_debug("... APIC SPIV: %08x\n", v); + + pr_debug("... APIC ISR field:\n"); + print_APIC_field(APIC_ISR); + pr_debug("... APIC TMR field:\n"); + print_APIC_field(APIC_TMR); + pr_debug("... APIC IRR field:\n"); + print_APIC_field(APIC_IRR); + + /* !82489DX */ + if (APIC_INTEGRATED(ver)) { + /* Due to the Pentium erratum 3AP. */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + + v = apic_read(APIC_ESR); + pr_debug("... APIC ESR: %08x\n", v); + } + + icr = apic_icr_read(); + pr_debug("... APIC ICR: %08x\n", (u32)icr); + pr_debug("... APIC ICR2: %08x\n", (u32)(icr >> 32)); + + v = apic_read(APIC_LVTT); + pr_debug("... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { + /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + pr_debug("... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + pr_debug("... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + pr_debug("... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { + /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + pr_debug("... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + pr_debug("... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + pr_debug("... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + pr_debug("... APIC TDCR: %08x\n", v); + + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + v = apic_read(APIC_EFEAT); + maxlvt = (v >> 16) & 0xff; + pr_debug("... APIC EFEAT: %08x\n", v); + v = apic_read(APIC_ECTRL); + pr_debug("... APIC ECTRL: %08x\n", v); + for (i = 0; i < maxlvt; i++) { + v = apic_read(APIC_EILVTn(i)); + pr_debug("... APIC EILVT%d: %08x\n", i, v); + } + } + pr_cont("\n"); +} + +static void __init print_local_APICs(int maxcpu) +{ + int cpu; + + if (!maxcpu) + return; + + preempt_disable(); + for_each_online_cpu(cpu) { + if (cpu >= maxcpu) + break; + smp_call_function_single(cpu, print_local_APIC, NULL, 1); + } + preempt_enable(); +} + +static void __init print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (!nr_legacy_irqs()) + return; + + pr_debug("\nprinting PIC contents\n"); + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + pr_debug("... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + pr_debug("... PIC IRR: %04x\n", v); + + outb(0x0b, 0xa0); + outb(0x0b, 0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a, 0xa0); + outb(0x0a, 0x20); + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + + pr_debug("... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + pr_debug("... PIC ELCR: %04x\n", v); +} + +static int show_lapic __initdata = 1; +static __init int setup_show_lapic(char *arg) +{ + int num = -1; + + if (strcmp(arg, "all") == 0) { + show_lapic = CONFIG_NR_CPUS; + } else { + get_option(&arg, &num); + if (num >= 0) + show_lapic = num; + } + + return 1; +} +__setup("show_lapic=", setup_show_lapic); + +static int __init print_ICs(void) +{ + if (apic_verbosity == APIC_QUIET) + return 0; + + print_PIC(); + + /* don't print out if apic is not there */ + if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config()) + return 0; + + print_local_APICs(show_lapic); + print_IO_APICs(); + + return 0; +} + +late_initcall(print_ICs); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c new file mode 100644 index 000000000..7eec3c154 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +#include + +#include "local.h" + +struct cluster_mask { + unsigned int clusterid; + int node; + struct cpumask mask; +}; + +static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); +static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks); +static struct cluster_mask *cluster_hotplug_mask; + +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled(); +} + +static void x2apic_send_IPI(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); + + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); +} + +static void +__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) +{ + unsigned int cpu, clustercpu; + struct cpumask *tmpmsk; + unsigned long flags; + u32 dest; + + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); + local_irq_save(flags); + + tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask); + cpumask_copy(tmpmsk, mask); + /* If IPI should not be sent to self, clear current CPU */ + if (apic_dest != APIC_DEST_ALLINC) + __cpumask_clear_cpu(smp_processor_id(), tmpmsk); + + /* Collapse cpus in a cluster so a single IPI per cluster is sent */ + for_each_cpu(cpu, tmpmsk) { + struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu); + + dest = 0; + for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask) + dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu); + + if (!dest) + continue; + + __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); + /* Remove cluster CPUs from tmpmask */ + cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask); + } + + local_irq_restore(flags); +} + +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC); +} + +static void +x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); +} + +static void x2apic_send_IPI_allbutself(int vector) +{ + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); +} + +static void x2apic_send_IPI_all(int vector) +{ + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); +} + +static u32 x2apic_calc_apicid(unsigned int cpu) +{ + return per_cpu(x86_cpu_to_logical_apicid, cpu); +} + +static void init_x2apic_ldr(void) +{ + struct cluster_mask *cmsk = this_cpu_read(cluster_masks); + u32 cluster, apicid = apic_read(APIC_LDR); + unsigned int cpu; + + this_cpu_write(x86_cpu_to_logical_apicid, apicid); + + if (cmsk) + goto update; + + cluster = apicid >> 16; + for_each_online_cpu(cpu) { + cmsk = per_cpu(cluster_masks, cpu); + /* Matching cluster found. Link and update it. */ + if (cmsk && cmsk->clusterid == cluster) + goto update; + } + cmsk = cluster_hotplug_mask; + cmsk->clusterid = cluster; + cluster_hotplug_mask = NULL; +update: + this_cpu_write(cluster_masks, cmsk); + cpumask_set_cpu(smp_processor_id(), &cmsk->mask); +} + +static int alloc_clustermask(unsigned int cpu, int node) +{ + if (per_cpu(cluster_masks, cpu)) + return 0; + /* + * If a hotplug spare mask exists, check whether it's on the right + * node. If not, free it and allocate a new one. + */ + if (cluster_hotplug_mask) { + if (cluster_hotplug_mask->node == node) + return 0; + kfree(cluster_hotplug_mask); + } + + cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask), + GFP_KERNEL, node); + if (!cluster_hotplug_mask) + return -ENOMEM; + cluster_hotplug_mask->node = node; + return 0; +} + +static int x2apic_prepare_cpu(unsigned int cpu) +{ + if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0) + return -ENOMEM; + if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) + return -ENOMEM; + return 0; +} + +static int x2apic_dead_cpu(unsigned int dead_cpu) +{ + struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu); + + if (cmsk) + cpumask_clear_cpu(dead_cpu, &cmsk->mask); + free_cpumask_var(per_cpu(ipi_mask, dead_cpu)); + return 0; +} + +static int x2apic_cluster_probe(void) +{ + if (!x2apic_mode) + return 0; + + if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", + x2apic_prepare_cpu, x2apic_dead_cpu) < 0) { + pr_err("Failed to register X2APIC_PREPARE\n"); + return 0; + } + init_x2apic_ldr(); + return 1; +} + +static struct apic apic_x2apic_cluster __ro_after_init = { + + .name = "cluster x2apic", + .probe = x2apic_cluster_probe, + .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .apic_id_valid = x2apic_apic_id_valid, + .apic_id_registered = x2apic_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 1, /* logical */ + + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = NULL, + + .init_apic_ldr = init_x2apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .phys_pkg_id = x2apic_phys_pkg_id, + + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = x2apic_set_apic_id, + + .calc_dest_apicid = x2apic_calc_apicid, + + .send_IPI = x2apic_send_IPI, + .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, + .send_IPI_allbutself = x2apic_send_IPI_allbutself, + .send_IPI_all = x2apic_send_IPI_all, + .send_IPI_self = x2apic_send_IPI_self, + + .inquire_remote_apic = NULL, + + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, + .icr_read = native_x2apic_icr_read, + .icr_write = native_x2apic_icr_write, + .wait_icr_idle = native_x2apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, +}; + +apic_driver(apic_x2apic_cluster); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c new file mode 100644 index 000000000..76c80e191 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include "local.h" + +int x2apic_phys; + +static struct apic apic_x2apic_phys; +static u32 x2apic_max_apicid __ro_after_init; + +void __init x2apic_set_max_apicid(u32 apicid) +{ + x2apic_max_apicid = apicid; +} + +static int __init set_x2apic_phys_mode(char *arg) +{ + x2apic_phys = 1; + return 0; +} +early_param("x2apic_phys", set_x2apic_phys_mode); + +static bool x2apic_fadt_phys(void) +{ +#ifdef CONFIG_ACPI + if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { + printk(KERN_DEBUG "System requires x2apic physical mode\n"); + return true; + } +#endif + return false; +} + +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys()); +} + +static void x2apic_send_IPI(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_apicid, cpu); + + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL); +} + +static void +__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) +{ + unsigned long query_cpu; + unsigned long this_cpu; + unsigned long flags; + + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); + + local_irq_save(flags); + + this_cpu = smp_processor_id(); + for_each_cpu(query_cpu, mask) { + if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu) + continue; + __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} + +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC); +} + +static void + x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); +} + +static void x2apic_send_IPI_allbutself(int vector) +{ + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); +} + +static void x2apic_send_IPI_all(int vector) +{ + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); +} + +static void init_x2apic_ldr(void) +{ +} + +static int x2apic_phys_probe(void) +{ + if (!x2apic_mode) + return 0; + + if (x2apic_phys || x2apic_fadt_phys()) + return 1; + + return apic == &apic_x2apic_phys; +} + +/* Common x2apic functions, also used by x2apic_cluster */ +int x2apic_apic_id_valid(u32 apicid) +{ + if (x2apic_max_apicid && apicid > x2apic_max_apicid) + return 0; + + return 1; +} + +int x2apic_apic_id_registered(void) +{ + return 1; +} + +void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) +{ + unsigned long cfg = __prepare_ICR(0, vector, dest); + native_x2apic_icr_write(cfg, apicid); +} + +void __x2apic_send_IPI_shorthand(int vector, u32 which) +{ + unsigned long cfg = __prepare_ICR(which, vector, 0); + + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); + native_x2apic_icr_write(cfg, 0); +} + +unsigned int x2apic_get_apic_id(unsigned long id) +{ + return id; +} + +u32 x2apic_set_apic_id(unsigned int id) +{ + return id; +} + +int x2apic_phys_pkg_id(int initial_apicid, int index_msb) +{ + return initial_apicid >> index_msb; +} + +void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +static struct apic apic_x2apic_phys __ro_after_init = { + + .name = "physical x2apic", + .probe = x2apic_phys_probe, + .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .apic_id_valid = x2apic_apic_id_valid, + .apic_id_registered = x2apic_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .disable_esr = 0, + .dest_logical = 0, + .check_apicid_used = NULL, + + .init_apic_ldr = init_x2apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .phys_pkg_id = x2apic_phys_pkg_id, + + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = x2apic_set_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = x2apic_send_IPI, + .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, + .send_IPI_allbutself = x2apic_send_IPI_allbutself, + .send_IPI_all = x2apic_send_IPI_all, + .send_IPI_self = x2apic_send_IPI_self, + + .inquire_remote_apic = NULL, + + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, + .icr_read = native_x2apic_icr_read, + .icr_write = native_x2apic_icr_write, + .wait_icr_idle = native_x2apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, +}; + +apic_driver(apic_x2apic_phys); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c new file mode 100644 index 000000000..9c283562d --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -0,0 +1,1820 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV APIC functions (note: not an Intel compatible APIC) + * + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static enum uv_system_type uv_system_type; +static int uv_hubbed_system; +static int uv_hubless_system; +static u64 gru_start_paddr, gru_end_paddr; +static union uvh_apicid uvh_apicid; +static int uv_node_id; + +/* Unpack AT/OEM/TABLE ID's to be NULL terminated strings */ +static u8 uv_archtype[UV_AT_SIZE + 1]; +static u8 oem_id[ACPI_OEM_ID_SIZE + 1]; +static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; + +/* Information derived from CPUID and some UV MMRs */ +static struct { + unsigned int apicid_shift; + unsigned int apicid_mask; + unsigned int socketid_shift; /* aka pnode_shift for UV2/3 */ + unsigned int pnode_mask; + unsigned int nasid_shift; + unsigned int gpa_shift; + unsigned int gnode_shift; + unsigned int m_skt; + unsigned int n_skt; +} uv_cpuid; + +static int uv_min_hub_revision_id; + +static struct apic apic_x2apic_uv_x; +static struct uv_hub_info_s uv_hub_info_node0; + +/* Set this to use hardware error handler instead of kernel panic: */ +static int disable_uv_undefined_panic = 1; + +unsigned long uv_undefined(char *str) +{ + if (likely(!disable_uv_undefined_panic)) + panic("UV: error: undefined MMR: %s\n", str); + else + pr_crit("UV: error: undefined MMR: %s\n", str); + + /* Cause a machine fault: */ + return ~0ul; +} +EXPORT_SYMBOL(uv_undefined); + +static unsigned long __init uv_early_read_mmr(unsigned long addr) +{ + unsigned long val, *mmr; + + mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr)); + val = *mmr; + early_iounmap(mmr, sizeof(*mmr)); + + return val; +} + +static inline bool is_GRU_range(u64 start, u64 end) +{ + if (!gru_start_paddr) + return false; + + return start >= gru_start_paddr && end <= gru_end_paddr; +} + +static bool uv_is_untracked_pat_range(u64 start, u64 end) +{ + return is_ISA_range(start, end) || is_GRU_range(start, end); +} + +static void __init early_get_pnodeid(void) +{ + int pnode; + + uv_cpuid.m_skt = 0; + if (UVH_RH10_GAM_ADDR_MAP_CONFIG) { + union uvh_rh10_gam_addr_map_config_u m_n_config; + + m_n_config.v = uv_early_read_mmr(UVH_RH10_GAM_ADDR_MAP_CONFIG); + uv_cpuid.n_skt = m_n_config.s.n_skt; + uv_cpuid.nasid_shift = 0; + } else if (UVH_RH_GAM_ADDR_MAP_CONFIG) { + union uvh_rh_gam_addr_map_config_u m_n_config; + + m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG); + uv_cpuid.n_skt = m_n_config.s.n_skt; + if (is_uv(UV3)) + uv_cpuid.m_skt = m_n_config.s3.m_skt; + if (is_uv(UV2)) + uv_cpuid.m_skt = m_n_config.s2.m_skt; + uv_cpuid.nasid_shift = 1; + } else { + unsigned long GAM_ADDR_MAP_CONFIG = 0; + + WARN(GAM_ADDR_MAP_CONFIG == 0, + "UV: WARN: GAM_ADDR_MAP_CONFIG is not available\n"); + uv_cpuid.n_skt = 0; + uv_cpuid.nasid_shift = 0; + } + + if (is_uv(UV4|UVY)) + uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ + + uv_cpuid.pnode_mask = (1 << uv_cpuid.n_skt) - 1; + pnode = (uv_node_id >> uv_cpuid.nasid_shift) & uv_cpuid.pnode_mask; + uv_cpuid.gpa_shift = 46; /* Default unless changed */ + + pr_info("UV: n_skt:%d pnmsk:%x pn:%x\n", + uv_cpuid.n_skt, uv_cpuid.pnode_mask, pnode); +} + +/* Running on a UV Hubbed system, determine which UV Hub Type it is */ +static int __init early_set_hub_type(void) +{ + union uvh_node_id_u node_id; + + /* + * The NODE_ID MMR is always at offset 0. + * Contains the chip part # + revision. + * Node_id field started with 15 bits, + * ... now 7 but upper 8 are masked to 0. + * All blades/nodes have the same part # and hub revision. + */ + node_id.v = uv_early_read_mmr(UVH_NODE_ID); + uv_node_id = node_id.sx.node_id; + + switch (node_id.s.part_number) { + + case UV5_HUB_PART_NUMBER: + uv_min_hub_revision_id = node_id.s.revision + + UV5_HUB_REVISION_BASE; + uv_hub_type_set(UV5); + break; + + /* UV4/4A only have a revision difference */ + case UV4_HUB_PART_NUMBER: + uv_min_hub_revision_id = node_id.s.revision + + UV4_HUB_REVISION_BASE - 1; + uv_hub_type_set(UV4); + if (uv_min_hub_revision_id == UV4A_HUB_REVISION_BASE) + uv_hub_type_set(UV4|UV4A); + break; + + case UV3_HUB_PART_NUMBER: + case UV3_HUB_PART_NUMBER_X: + uv_min_hub_revision_id = node_id.s.revision + + UV3_HUB_REVISION_BASE; + uv_hub_type_set(UV3); + break; + + case UV2_HUB_PART_NUMBER: + case UV2_HUB_PART_NUMBER_X: + uv_min_hub_revision_id = node_id.s.revision + + UV2_HUB_REVISION_BASE - 1; + uv_hub_type_set(UV2); + break; + + default: + return 0; + } + + pr_info("UV: part#:%x rev:%d rev_id:%d UVtype:0x%x\n", + node_id.s.part_number, node_id.s.revision, + uv_min_hub_revision_id, is_uv(~0)); + + return 1; +} + +static void __init uv_tsc_check_sync(void) +{ + u64 mmr; + int sync_state; + int mmr_shift; + char *state; + + /* UV5 guarantees synced TSCs; do not zero TSC_ADJUST */ + if (!is_uv(UV2|UV3|UV4)) { + mark_tsc_async_resets("UV5+"); + return; + } + + /* UV2,3,4, UV BIOS TSC sync state available */ + mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); + mmr_shift = + is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; + sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK; + + /* Check if TSC is valid for all sockets */ + switch (sync_state) { + case UVH_TSC_SYNC_VALID: + state = "in sync"; + mark_tsc_async_resets("UV BIOS"); + break; + + /* If BIOS state unknown, don't do anything */ + case UVH_TSC_SYNC_UNKNOWN: + state = "unknown"; + break; + + /* Otherwise, BIOS indicates problem with TSC */ + default: + state = "unstable"; + mark_tsc_unstable("UV BIOS"); + break; + } + pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state); +} + +/* Selector for (4|4A|5) structs */ +#define uvxy_field(sname, field, undef) ( \ + is_uv(UV4A) ? sname.s4a.field : \ + is_uv(UV4) ? sname.s4.field : \ + is_uv(UV3) ? sname.s3.field : \ + undef) + +/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ + +#define SMT_LEVEL 0 /* Leaf 0xb SMT level */ +#define INVALID_TYPE 0 /* Leaf 0xb sub-leaf types */ +#define SMT_TYPE 1 +#define CORE_TYPE 2 +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) + +static void set_x2apic_bits(void) +{ + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int sid_shift; + + cpuid(0, &eax, &ebx, &ecx, &edx); + if (eax < 0xb) { + pr_info("UV: CPU does not have CPUID.11\n"); + return; + } + + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) { + pr_info("UV: CPUID.11 not implemented\n"); + return; + } + + sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); + sub_index = 1; + do { + cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); + break; + } + sub_index++; + } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + + uv_cpuid.apicid_shift = 0; + uv_cpuid.apicid_mask = (~(-1 << sid_shift)); + uv_cpuid.socketid_shift = sid_shift; +} + +static void __init early_get_apic_socketid_shift(void) +{ + if (is_uv2_hub() || is_uv3_hub()) + uvh_apicid.v = uv_early_read_mmr(UVH_APICID); + + set_x2apic_bits(); + + pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); + pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); +} + +static void __init uv_stringify(int len, char *to, char *from) +{ + /* Relies on 'to' being NULL chars so result will be NULL terminated */ + strncpy(to, from, len-1); + + /* Trim trailing spaces */ + (void)strim(to); +} + +/* Find UV arch type entry in UVsystab */ +static unsigned long __init early_find_archtype(struct uv_systab *st) +{ + int i; + + for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { + unsigned long ptr = st->entry[i].offset; + + if (!ptr) + continue; + ptr += (unsigned long)st; + if (st->entry[i].type == UV_SYSTAB_TYPE_ARCH_TYPE) + return ptr; + } + return 0; +} + +/* Validate UV arch type field in UVsystab */ +static int __init decode_arch_type(unsigned long ptr) +{ + struct uv_arch_type_entry *uv_ate = (struct uv_arch_type_entry *)ptr; + int n = strlen(uv_ate->archtype); + + if (n > 0 && n < sizeof(uv_ate->archtype)) { + pr_info("UV: UVarchtype received from BIOS\n"); + uv_stringify(sizeof(uv_archtype), uv_archtype, uv_ate->archtype); + return 1; + } + return 0; +} + +/* Determine if UV arch type entry might exist in UVsystab */ +static int __init early_get_arch_type(void) +{ + unsigned long uvst_physaddr, uvst_size, ptr; + struct uv_systab *st; + u32 rev; + int ret; + + uvst_physaddr = get_uv_systab_phys(0); + if (!uvst_physaddr) + return 0; + + st = early_memremap_ro(uvst_physaddr, sizeof(struct uv_systab)); + if (!st) { + pr_err("UV: Cannot access UVsystab, remap failed\n"); + return 0; + } + + rev = st->revision; + if (rev < UV_SYSTAB_VERSION_UV5) { + early_memunmap(st, sizeof(struct uv_systab)); + return 0; + } + + uvst_size = st->size; + early_memunmap(st, sizeof(struct uv_systab)); + st = early_memremap_ro(uvst_physaddr, uvst_size); + if (!st) { + pr_err("UV: Cannot access UVarchtype, remap failed\n"); + return 0; + } + + ptr = early_find_archtype(st); + if (!ptr) { + early_memunmap(st, uvst_size); + return 0; + } + + ret = decode_arch_type(ptr); + early_memunmap(st, uvst_size); + return ret; +} + +static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) +{ + /* Save OEM_ID passed from ACPI MADT */ + uv_stringify(sizeof(oem_id), oem_id, _oem_id); + + /* Check if BIOS sent us a UVarchtype */ + if (!early_get_arch_type()) + + /* If not use OEM ID for UVarchtype */ + uv_stringify(sizeof(uv_archtype), uv_archtype, oem_id); + + /* Check if not hubbed */ + if (strncmp(uv_archtype, "SGI", 3) != 0) { + + /* (Not hubbed), check if not hubless */ + if (strncmp(uv_archtype, "NSGI", 4) != 0) + + /* (Not hubless), not a UV */ + return 0; + + /* Is UV hubless system */ + uv_hubless_system = 0x01; + + /* UV5 Hubless */ + if (strncmp(uv_archtype, "NSGI5", 5) == 0) + uv_hubless_system |= 0x20; + + /* UV4 Hubless: CH */ + else if (strncmp(uv_archtype, "NSGI4", 5) == 0) + uv_hubless_system |= 0x10; + + /* UV3 Hubless: UV300/MC990X w/o hub */ + else + uv_hubless_system |= 0x8; + + /* Copy APIC type */ + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); + + pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n", + oem_id, oem_table_id, uv_system_type, uv_hubless_system); + return 0; + } + + if (numa_off) { + pr_err("UV: NUMA is off, disabling UV support\n"); + return 0; + } + + /* Set hubbed type if true */ + uv_hub_info->hub_revision = + !strncmp(uv_archtype, "SGI5", 4) ? UV5_HUB_REVISION_BASE : + !strncmp(uv_archtype, "SGI4", 4) ? UV4_HUB_REVISION_BASE : + !strncmp(uv_archtype, "SGI3", 4) ? UV3_HUB_REVISION_BASE : + !strcmp(uv_archtype, "SGI2") ? UV2_HUB_REVISION_BASE : 0; + + switch (uv_hub_info->hub_revision) { + case UV5_HUB_REVISION_BASE: + uv_hubbed_system = 0x21; + uv_hub_type_set(UV5); + break; + + case UV4_HUB_REVISION_BASE: + uv_hubbed_system = 0x11; + uv_hub_type_set(UV4); + break; + + case UV3_HUB_REVISION_BASE: + uv_hubbed_system = 0x9; + uv_hub_type_set(UV3); + break; + + case UV2_HUB_REVISION_BASE: + uv_hubbed_system = 0x5; + uv_hub_type_set(UV2); + break; + + default: + return 0; + } + + /* Get UV hub chip part number & revision */ + early_set_hub_type(); + + /* Other UV setup functions */ + early_get_pnodeid(); + early_get_apic_socketid_shift(); + x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + x86_platform.nmi_init = uv_nmi_init; + uv_tsc_check_sync(); + + return 1; +} + +/* Called early to probe for the correct APIC driver */ +static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) +{ + /* Set up early hub info fields for Node 0 */ + uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; + + /* If not UV, return. */ + if (uv_set_system_type(_oem_id, _oem_table_id) == 0) + return 0; + + /* Save and Decode OEM Table ID */ + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); + + /* This is the most common hardware variant, x2apic mode */ + if (!strcmp(oem_table_id, "UVX")) + uv_system_type = UV_X2APIC; + + /* Only used for very small systems, usually 1 chassis, legacy mode */ + else if (!strcmp(oem_table_id, "UVL")) + uv_system_type = UV_LEGACY_APIC; + + else + goto badbios; + + pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n", + oem_id, oem_table_id, uv_system_type, is_uv(UV_ANY), + uv_min_hub_revision_id); + + return 0; + +badbios: + pr_err("UV: UVarchtype:%s not supported\n", uv_archtype); + BUG(); +} + +enum uv_system_type get_uv_system_type(void) +{ + return uv_system_type; +} + +int is_uv_system(void) +{ + return uv_system_type != UV_NONE; +} +EXPORT_SYMBOL_GPL(is_uv_system); + +int is_uv_hubbed(int uvtype) +{ + return (uv_hubbed_system & uvtype); +} +EXPORT_SYMBOL_GPL(is_uv_hubbed); + +static int is_uv_hubless(int uvtype) +{ + return (uv_hubless_system & uvtype); +} + +void **__uv_hub_info_list; +EXPORT_SYMBOL_GPL(__uv_hub_info_list); + +DEFINE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info); +EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_info); + +short uv_possible_blades; +EXPORT_SYMBOL_GPL(uv_possible_blades); + +unsigned long sn_rtc_cycles_per_second; +EXPORT_SYMBOL(sn_rtc_cycles_per_second); + +/* The following values are used for the per node hub info struct */ +static __initdata unsigned short *_node_to_pnode; +static __initdata unsigned short _min_socket, _max_socket; +static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; +static __initdata struct uv_gam_range_entry *uv_gre_table; +static __initdata struct uv_gam_parameters *uv_gp_table; +static __initdata unsigned short *_socket_to_node; +static __initdata unsigned short *_socket_to_pnode; +static __initdata unsigned short *_pnode_to_socket; + +static __initdata struct uv_gam_range_s *_gr_table; + +#define SOCK_EMPTY ((unsigned short)~0) + +/* Default UV memory block size is 2GB */ +static unsigned long mem_block_size __initdata = (2UL << 30); + +/* Kernel parameter to specify UV mem block size */ +static int __init parse_mem_block_size(char *ptr) +{ + unsigned long size = memparse(ptr, NULL); + + /* Size will be rounded down by set_block_size() below */ + mem_block_size = size; + return 0; +} +early_param("uv_memblksize", parse_mem_block_size); + +static __init int adj_blksize(u32 lgre) +{ + unsigned long base = (unsigned long)lgre << UV_GAM_RANGE_SHFT; + unsigned long size; + + for (size = mem_block_size; size > MIN_MEMORY_BLOCK_SIZE; size >>= 1) + if (IS_ALIGNED(base, size)) + break; + + if (size >= mem_block_size) + return 0; + + mem_block_size = size; + return 1; +} + +static __init void set_block_size(void) +{ + unsigned int order = ffs(mem_block_size); + + if (order) { + /* adjust for ffs return of 1..64 */ + set_memory_block_size_order(order - 1); + pr_info("UV: mem_block_size set to 0x%lx\n", mem_block_size); + } else { + /* bad or zero value, default to 1UL << 31 (2GB) */ + pr_err("UV: mem_block_size error with 0x%lx\n", mem_block_size); + set_memory_block_size_order(31); + } +} + +/* Build GAM range lookup table: */ +static __init void build_uv_gr_table(void) +{ + struct uv_gam_range_entry *gre = uv_gre_table; + struct uv_gam_range_s *grt; + unsigned long last_limit = 0, ram_limit = 0; + int bytes, i, sid, lsid = -1, indx = 0, lindx = -1; + + if (!gre) + return; + + bytes = _gr_table_len * sizeof(struct uv_gam_range_s); + grt = kzalloc(bytes, GFP_KERNEL); + BUG_ON(!grt); + _gr_table = grt; + + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) { + if (!ram_limit) { + /* Mark hole between RAM/non-RAM: */ + ram_limit = last_limit; + last_limit = gre->limit; + lsid++; + continue; + } + last_limit = gre->limit; + pr_info("UV: extra hole in GAM RE table @%d\n", (int)(gre - uv_gre_table)); + continue; + } + if (_max_socket < gre->sockid) { + pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", gre->sockid, _max_socket, (int)(gre - uv_gre_table)); + continue; + } + sid = gre->sockid - _min_socket; + if (lsid < sid) { + /* New range: */ + grt = &_gr_table[indx]; + grt->base = lindx; + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + lsid = sid; + lindx = indx++; + continue; + } + /* Update range: */ + if (lsid == sid && !ram_limit) { + /* .. if contiguous: */ + if (grt->limit == last_limit) { + grt->limit = last_limit = gre->limit; + continue; + } + } + /* Non-contiguous RAM range: */ + if (!ram_limit) { + grt++; + grt->base = lindx; + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + continue; + } + /* Non-contiguous/non-RAM: */ + grt++; + /* base is this entry */ + grt->base = grt - _gr_table; + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + lsid++; + } + + /* Shorten table if possible */ + grt++; + i = grt - _gr_table; + if (i < _gr_table_len) { + void *ret; + + bytes = i * sizeof(struct uv_gam_range_s); + ret = krealloc(_gr_table, bytes, GFP_KERNEL); + if (ret) { + _gr_table = ret; + _gr_table_len = i; + } + } + + /* Display resultant GAM range table: */ + for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) { + unsigned long start, end; + int gb = grt->base; + + start = gb < 0 ? 0 : (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT; + end = (unsigned long)grt->limit << UV_GAM_RANGE_SHFT; + + pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", i, grt->nasid, start, end, gb); + } +} + +static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ + unsigned long val; + int pnode; + + pnode = uv_apicid_to_pnode(phys_apicid); + + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | + ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | + APIC_DM_INIT; + + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); + + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | + ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | + APIC_DM_STARTUP; + + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); + + return 0; +} + +static void uv_send_IPI_one(int cpu, int vector) +{ + unsigned long apicid = per_cpu(x86_cpu_to_apicid, cpu); + int pnode = uv_apicid_to_pnode(apicid); + unsigned long dmode, val; + + if (vector == NMI_VECTOR) + dmode = dest_NMI; + else + dmode = dest_Fixed; + + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (apicid << UVH_IPI_INT_APIC_ID_SHFT) | + (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | + (vector << UVH_IPI_INT_VECTOR_SHFT); + + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); +} + +static void uv_send_IPI_mask(const struct cpumask *mask, int vector) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) + uv_send_IPI_one(cpu, vector); +} + +static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (cpu != this_cpu) + uv_send_IPI_one(cpu, vector); + } +} + +static void uv_send_IPI_allbutself(int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != this_cpu) + uv_send_IPI_one(cpu, vector); + } +} + +static void uv_send_IPI_all(int vector) +{ + uv_send_IPI_mask(cpu_online_mask, vector); +} + +static int uv_apic_id_valid(u32 apicid) +{ + return 1; +} + +static int uv_apic_id_registered(void) +{ + return 1; +} + +static void uv_init_apic_ldr(void) +{ +} + +static u32 apic_uv_calc_apicid(unsigned int cpu) +{ + return apic_default_calc_apicid(cpu); +} + +static unsigned int x2apic_get_apic_id(unsigned long id) +{ + return id; +} + +static u32 set_apic_id(unsigned int id) +{ + return id; +} + +static unsigned int uv_read_apic_id(void) +{ + return x2apic_get_apic_id(apic_read(APIC_ID)); +} + +static int uv_phys_pkg_id(int initial_apicid, int index_msb) +{ + return uv_read_apic_id() >> index_msb; +} + +static void uv_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +static int uv_probe(void) +{ + return apic == &apic_x2apic_uv_x; +} + +static struct apic apic_x2apic_uv_x __ro_after_init = { + + .name = "UV large system", + .probe = uv_probe, + .acpi_madt_oem_check = uv_acpi_madt_oem_check, + .apic_id_valid = uv_apic_id_valid, + .apic_id_registered = uv_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* Physical */ + + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = NULL, + + .init_apic_ldr = uv_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .phys_pkg_id = uv_phys_pkg_id, + + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = set_apic_id, + + .calc_dest_apicid = apic_uv_calc_apicid, + + .send_IPI = uv_send_IPI_one, + .send_IPI_mask = uv_send_IPI_mask, + .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, + .send_IPI_allbutself = uv_send_IPI_allbutself, + .send_IPI_all = uv_send_IPI_all, + .send_IPI_self = uv_send_IPI_self, + + .wakeup_secondary_cpu = uv_wakeup_secondary, + .inquire_remote_apic = NULL, + + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, + .icr_read = native_x2apic_icr_read, + .icr_write = native_x2apic_icr_write, + .wait_icr_idle = native_x2apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, +}; + +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3 +#define DEST_SHIFT UVXH_RH_GAM_ALIAS_0_REDIRECT_CONFIG_DEST_BASE_SHFT + +static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) +{ + union uvh_rh_gam_alias_2_overlay_config_u alias; + union uvh_rh_gam_alias_2_redirect_config_u redirect; + unsigned long m_redirect; + unsigned long m_overlay; + int i; + + for (i = 0; i < UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH; i++) { + switch (i) { + case 0: + m_redirect = UVH_RH_GAM_ALIAS_0_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_0_OVERLAY_CONFIG; + break; + case 1: + m_redirect = UVH_RH_GAM_ALIAS_1_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_1_OVERLAY_CONFIG; + break; + case 2: + m_redirect = UVH_RH_GAM_ALIAS_2_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_2_OVERLAY_CONFIG; + break; + } + alias.v = uv_read_local_mmr(m_overlay); + if (alias.s.enable && alias.s.base == 0) { + *size = (1UL << alias.s.m_alias); + redirect.v = uv_read_local_mmr(m_redirect); + *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; + return; + } + } + *base = *size = 0; +} + +enum map_type {map_wb, map_uc}; +static const char * const mt[] = { "WB", "UC" }; + +static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type) +{ + unsigned long bytes, paddr; + + paddr = base << pshift; + bytes = (1UL << bshift) * (max_pnode + 1); + if (!paddr) { + pr_info("UV: Map %s_HI base address NULL\n", id); + return; + } + if (map_type == map_uc) + init_extra_mapping_uc(paddr, bytes); + else + init_extra_mapping_wb(paddr, bytes); + + pr_info("UV: Map %s_HI 0x%lx - 0x%lx %s (%d segments)\n", + id, paddr, paddr + bytes, mt[map_type], max_pnode + 1); +} + +static __init void map_gru_high(int max_pnode) +{ + union uvh_rh_gam_gru_overlay_config_u gru; + unsigned long mask, base; + int shift; + + if (UVH_RH_GAM_GRU_OVERLAY_CONFIG) { + gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG); + shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT; + mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK; + } else if (UVH_RH10_GAM_GRU_OVERLAY_CONFIG) { + gru.v = uv_read_local_mmr(UVH_RH10_GAM_GRU_OVERLAY_CONFIG); + shift = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT; + mask = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_MASK; + } else { + pr_err("UV: GRU unavailable (no MMR)\n"); + return; + } + + if (!gru.s.enable) { + pr_info("UV: GRU disabled (by BIOS)\n"); + return; + } + + base = (gru.v & mask) >> shift; + map_high("GRU", base, shift, shift, max_pnode, map_wb); + gru_start_paddr = ((u64)base << shift); + gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); +} + +static __init void map_mmr_high(int max_pnode) +{ + unsigned long base; + int shift; + bool enable; + + if (UVH_RH10_GAM_MMR_OVERLAY_CONFIG) { + union uvh_rh10_gam_mmr_overlay_config_u mmr; + + mmr.v = uv_read_local_mmr(UVH_RH10_GAM_MMR_OVERLAY_CONFIG); + enable = mmr.s.enable; + base = mmr.s.base; + shift = UVH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT; + } else if (UVH_RH_GAM_MMR_OVERLAY_CONFIG) { + union uvh_rh_gam_mmr_overlay_config_u mmr; + + mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG); + enable = mmr.s.enable; + base = mmr.s.base; + shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT; + } else { + pr_err("UV:%s:RH_GAM_MMR_OVERLAY_CONFIG MMR undefined?\n", + __func__); + return; + } + + if (enable) + map_high("MMR", base, shift, shift, max_pnode, map_uc); + else + pr_info("UV: MMR disabled\n"); +} + +/* Arch specific ENUM cases */ +enum mmioh_arch { + UV2_MMIOH = -1, + UVY_MMIOH0, UVY_MMIOH1, + UVX_MMIOH0, UVX_MMIOH1, +}; + +/* Calculate and Map MMIOH Regions */ +static void __init calc_mmioh_map(enum mmioh_arch index, + int min_pnode, int max_pnode, + int shift, unsigned long base, int m_io, int n_io) +{ + unsigned long mmr, nasid_mask; + int nasid, min_nasid, max_nasid, lnasid, mapped; + int i, fi, li, n, max_io; + char id[8]; + + /* One (UV2) mapping */ + if (index == UV2_MMIOH) { + strncpy(id, "MMIOH", sizeof(id)); + max_io = max_pnode; + mapped = 0; + goto map_exit; + } + + /* small and large MMIOH mappings */ + switch (index) { + case UVY_MMIOH0: + mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0; + nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK; + n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; + min_nasid = min_pnode; + max_nasid = max_pnode; + mapped = 1; + break; + case UVY_MMIOH1: + mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1; + nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK; + n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; + min_nasid = min_pnode; + max_nasid = max_pnode; + mapped = 1; + break; + case UVX_MMIOH0: + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0; + nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; + min_nasid = min_pnode * 2; + max_nasid = max_pnode * 2; + mapped = 1; + break; + case UVX_MMIOH1: + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1; + nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; + min_nasid = min_pnode * 2; + max_nasid = max_pnode * 2; + mapped = 1; + break; + default: + pr_err("UV:%s:Invalid mapping type:%d\n", __func__, index); + return; + } + + /* enum values chosen so (index mod 2) is MMIOH 0/1 (low/high) */ + snprintf(id, sizeof(id), "MMIOH%d", index%2); + + max_io = lnasid = fi = li = -1; + for (i = 0; i < n; i++) { + unsigned long m_redirect = mmr + i * 8; + unsigned long redirect = uv_read_local_mmr(m_redirect); + + nasid = redirect & nasid_mask; + if (i == 0) + pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n", + id, redirect, m_redirect, nasid); + + /* Invalid NASID check */ + if (nasid < min_nasid || max_nasid < nasid) { + pr_err("UV:%s:Invalid NASID:%x (range:%x..%x)\n", + __func__, index, min_nasid, max_nasid); + nasid = -1; + } + + if (nasid == lnasid) { + li = i; + /* Last entry check: */ + if (i != n-1) + continue; + } + + /* Check if we have a cached (or last) redirect to print: */ + if (lnasid != -1 || (i == n-1 && nasid != -1)) { + unsigned long addr1, addr2; + int f, l; + + if (lnasid == -1) { + f = l = i; + lnasid = nasid; + } else { + f = fi; + l = li; + } + addr1 = (base << shift) + f * (1ULL << m_io); + addr2 = (base << shift) + (l + 1) * (1ULL << m_io); + pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", + id, fi, li, lnasid, addr1, addr2); + if (max_io < l) + max_io = l; + } + fi = li = i; + lnasid = nasid; + } + +map_exit: + pr_info("UV: %s base:0x%lx shift:%d m_io:%d max_io:%d max_pnode:0x%x\n", + id, base, shift, m_io, max_io, max_pnode); + + if (max_io >= 0 && !mapped) + map_high(id, base, shift, m_io, max_io, map_uc); +} + +static __init void map_mmioh_high(int min_pnode, int max_pnode) +{ + /* UVY flavor */ + if (UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0) { + union uvh_rh10_gam_mmioh_overlay_config0_u mmioh0; + union uvh_rh10_gam_mmioh_overlay_config1_u mmioh1; + + mmioh0.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0); + if (unlikely(mmioh0.s.enable == 0)) + pr_info("UV: MMIOH0 disabled\n"); + else + calc_mmioh_map(UVY_MMIOH0, min_pnode, max_pnode, + UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT, + mmioh0.s.base, mmioh0.s.m_io, mmioh0.s.n_io); + + mmioh1.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1); + if (unlikely(mmioh1.s.enable == 0)) + pr_info("UV: MMIOH1 disabled\n"); + else + calc_mmioh_map(UVY_MMIOH1, min_pnode, max_pnode, + UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT, + mmioh1.s.base, mmioh1.s.m_io, mmioh1.s.n_io); + return; + } + /* UVX flavor */ + if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0) { + union uvh_rh_gam_mmioh_overlay_config0_u mmioh0; + union uvh_rh_gam_mmioh_overlay_config1_u mmioh1; + + mmioh0.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0); + if (unlikely(mmioh0.s.enable == 0)) + pr_info("UV: MMIOH0 disabled\n"); + else { + unsigned long base = uvxy_field(mmioh0, base, 0); + int m_io = uvxy_field(mmioh0, m_io, 0); + int n_io = uvxy_field(mmioh0, n_io, 0); + + calc_mmioh_map(UVX_MMIOH0, min_pnode, max_pnode, + UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT, + base, m_io, n_io); + } + + mmioh1.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1); + if (unlikely(mmioh1.s.enable == 0)) + pr_info("UV: MMIOH1 disabled\n"); + else { + unsigned long base = uvxy_field(mmioh1, base, 0); + int m_io = uvxy_field(mmioh1, m_io, 0); + int n_io = uvxy_field(mmioh1, n_io, 0); + + calc_mmioh_map(UVX_MMIOH1, min_pnode, max_pnode, + UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT, + base, m_io, n_io); + } + return; + } + + /* UV2 flavor */ + if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG) { + union uvh_rh_gam_mmioh_overlay_config_u mmioh; + + mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG); + if (unlikely(mmioh.s2.enable == 0)) + pr_info("UV: MMIOH disabled\n"); + else + calc_mmioh_map(UV2_MMIOH, min_pnode, max_pnode, + UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT, + mmioh.s2.base, mmioh.s2.m_io, mmioh.s2.n_io); + return; + } +} + +static __init void map_low_mmrs(void) +{ + if (UV_GLOBAL_MMR32_BASE) + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + + if (UV_LOCAL_MMR_BASE) + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); +} + +static __init void uv_rtc_init(void) +{ + long status; + u64 ticks_per_sec; + + status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec); + + if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) { + pr_warn("UV: unable to determine platform RTC clock frequency, guessing.\n"); + + /* BIOS gives wrong value for clock frequency, so guess: */ + sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; + } else { + sn_rtc_cycles_per_second = ticks_per_sec; + } +} + +/* Direct Legacy VGA I/O traffic to designated IOH */ +static int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags) +{ + int domain, bus, rc; + + if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE)) + return 0; + + if ((command_bits & PCI_COMMAND_IO) == 0) + return 0; + + domain = pci_domain_nr(pdev->bus); + bus = pdev->bus->number; + + rc = uv_bios_set_legacy_vga_target(decode, domain, bus); + + return rc; +} + +/* + * Called on each CPU to initialize the per_cpu UV data area. + * FIXME: hotplug not supported yet + */ +void uv_cpu_init(void) +{ + /* CPU 0 initialization will be done via uv_system_init. */ + if (smp_processor_id() == 0) + return; + + uv_hub_info->nr_online_cpus++; +} + +struct mn { + unsigned char m_val; + unsigned char n_val; + unsigned char m_shift; + unsigned char n_lshift; +}; + +/* Initialize caller's MN struct and fill in values */ +static void get_mn(struct mn *mnp) +{ + memset(mnp, 0, sizeof(*mnp)); + mnp->n_val = uv_cpuid.n_skt; + if (is_uv(UV4|UVY)) { + mnp->m_val = 0; + mnp->n_lshift = 0; + } else if (is_uv3_hub()) { + union uvyh_gr0_gam_gr_config_u m_gr_config; + + mnp->m_val = uv_cpuid.m_skt; + m_gr_config.v = uv_read_local_mmr(UVH_GR0_GAM_GR_CONFIG); + mnp->n_lshift = m_gr_config.s3.m_skt; + } else if (is_uv2_hub()) { + mnp->m_val = uv_cpuid.m_skt; + mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; + } + mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0; +} + +static void __init uv_init_hub_info(struct uv_hub_info_s *hi) +{ + struct mn mn; + + get_mn(&mn); + hi->gpa_mask = mn.m_val ? + (1UL << (mn.m_val + mn.n_val)) - 1 : + (1UL << uv_cpuid.gpa_shift) - 1; + + hi->m_val = mn.m_val; + hi->n_val = mn.n_val; + hi->m_shift = mn.m_shift; + hi->n_lshift = mn.n_lshift ? mn.n_lshift : 0; + hi->hub_revision = uv_hub_info->hub_revision; + hi->hub_type = uv_hub_info->hub_type; + hi->pnode_mask = uv_cpuid.pnode_mask; + hi->nasid_shift = uv_cpuid.nasid_shift; + hi->min_pnode = _min_pnode; + hi->min_socket = _min_socket; + hi->pnode_to_socket = _pnode_to_socket; + hi->socket_to_node = _socket_to_node; + hi->socket_to_pnode = _socket_to_pnode; + hi->gr_table_len = _gr_table_len; + hi->gr_table = _gr_table; + + uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val); + hi->gnode_extra = (uv_node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; + if (mn.m_val) + hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val; + + if (uv_gp_table) { + hi->global_mmr_base = uv_gp_table->mmr_base; + hi->global_mmr_shift = uv_gp_table->mmr_shift; + hi->global_gru_base = uv_gp_table->gru_base; + hi->global_gru_shift = uv_gp_table->gru_shift; + hi->gpa_shift = uv_gp_table->gpa_shift; + hi->gpa_mask = (1UL << hi->gpa_shift) - 1; + } else { + hi->global_mmr_base = + uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG) & + ~UV_MMR_ENABLE; + hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; + } + + get_lowmem_redirect(&hi->lowmem_remap_base, &hi->lowmem_remap_top); + + hi->apic_pnode_shift = uv_cpuid.socketid_shift; + + /* Show system specific info: */ + pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift); + pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift); + pr_info("UV: mmr_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift); + if (hi->global_gru_base) + pr_info("UV: gru_base/shift:0x%lx/%ld\n", + hi->global_gru_base, hi->global_gru_shift); + + pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra); +} + +static void __init decode_gam_params(unsigned long ptr) +{ + uv_gp_table = (struct uv_gam_parameters *)ptr; + + pr_info("UV: GAM Params...\n"); + pr_info("UV: mmr_base/shift:0x%llx/%d gru_base/shift:0x%llx/%d gpa_shift:%d\n", + uv_gp_table->mmr_base, uv_gp_table->mmr_shift, + uv_gp_table->gru_base, uv_gp_table->gru_shift, + uv_gp_table->gpa_shift); +} + +static void __init decode_gam_rng_tbl(unsigned long ptr) +{ + struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr; + unsigned long lgre = 0; + int index = 0; + int sock_min = 999999, pnode_min = 99999; + int sock_max = -1, pnode_max = -1; + + uv_gre_table = gre; + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + unsigned long size = ((unsigned long)(gre->limit - lgre) + << UV_GAM_RANGE_SHFT); + int order = 0; + char suffix[] = " KMGTPE"; + int flag = ' '; + + while (size > 9999 && order < sizeof(suffix)) { + size /= 1024; + order++; + } + + /* adjust max block size to current range start */ + if (gre->type == 1 || gre->type == 2) + if (adj_blksize(lgre)) + flag = '*'; + + if (!index) { + pr_info("UV: GAM Range Table...\n"); + pr_info("UV: # %20s %14s %6s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN"); + } + pr_info("UV: %2d: 0x%014lx-0x%014lx%c %5lu%c %3d %04x %02x %02x\n", + index++, + (unsigned long)lgre << UV_GAM_RANGE_SHFT, + (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, + flag, size, suffix[order], + gre->type, gre->nasid, gre->sockid, gre->pnode); + + /* update to next range start */ + lgre = gre->limit; + if (sock_min > gre->sockid) + sock_min = gre->sockid; + if (sock_max < gre->sockid) + sock_max = gre->sockid; + if (pnode_min > gre->pnode) + pnode_min = gre->pnode; + if (pnode_max < gre->pnode) + pnode_max = gre->pnode; + } + _min_socket = sock_min; + _max_socket = sock_max; + _min_pnode = pnode_min; + _max_pnode = pnode_max; + _gr_table_len = index; + + pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode); +} + +/* Walk through UVsystab decoding the fields */ +static int __init decode_uv_systab(void) +{ + struct uv_systab *st; + int i; + + /* Get mapped UVsystab pointer */ + st = uv_systab; + + /* If UVsystab is version 1, there is no extended UVsystab */ + if (st && st->revision == UV_SYSTAB_VERSION_1) + return 0; + + if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) { + int rev = st ? st->revision : 0; + + pr_err("UV: BIOS UVsystab mismatch, (%x < %x)\n", + rev, UV_SYSTAB_VERSION_UV4_LATEST); + pr_err("UV: Does not support UV, switch to non-UV x86_64\n"); + uv_system_type = UV_NONE; + + return -EINVAL; + } + + for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { + unsigned long ptr = st->entry[i].offset; + + if (!ptr) + continue; + + /* point to payload */ + ptr += (unsigned long)st; + + switch (st->entry[i].type) { + case UV_SYSTAB_TYPE_GAM_PARAMS: + decode_gam_params(ptr); + break; + + case UV_SYSTAB_TYPE_GAM_RNG_TBL: + decode_gam_rng_tbl(ptr); + break; + + case UV_SYSTAB_TYPE_ARCH_TYPE: + /* already processed in early startup */ + break; + + default: + pr_err("UV:%s:Unrecognized UV_SYSTAB_TYPE:%d, skipped\n", + __func__, st->entry[i].type); + break; + } + } + return 0; +} + +/* Set up physical blade translations from UVH_NODE_PRESENT_TABLE */ +static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info) +{ + unsigned long np; + int i, uv_pb = 0; + + if (UVH_NODE_PRESENT_TABLE) { + pr_info("UV: NODE_PRESENT_DEPTH = %d\n", + UVH_NODE_PRESENT_TABLE_DEPTH); + for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); + pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np); + uv_pb += hweight64(np); + } + } + if (UVH_NODE_PRESENT_0) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_0); + pr_info("UV: NODE_PRESENT_0 = 0x%016lx\n", np); + uv_pb += hweight64(np); + } + if (UVH_NODE_PRESENT_1) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_1); + pr_info("UV: NODE_PRESENT_1 = 0x%016lx\n", np); + uv_pb += hweight64(np); + } + if (uv_possible_blades != uv_pb) + uv_possible_blades = uv_pb; + + pr_info("UV: number nodes/possible blades %d\n", uv_pb); +} + +static void __init build_socket_tables(void) +{ + struct uv_gam_range_entry *gre = uv_gre_table; + int num, nump; + int cpu, i, lnid; + int minsock = _min_socket; + int maxsock = _max_socket; + int minpnode = _min_pnode; + int maxpnode = _max_pnode; + size_t bytes; + + if (!gre) { + if (is_uv2_hub() || is_uv3_hub()) { + pr_info("UV: No UVsystab socket table, ignoring\n"); + return; + } + pr_err("UV: Error: UVsystab address translations not available!\n"); + BUG(); + } + + /* Build socket id -> node id, pnode */ + num = maxsock - minsock + 1; + bytes = num * sizeof(_socket_to_node[0]); + _socket_to_node = kmalloc(bytes, GFP_KERNEL); + _socket_to_pnode = kmalloc(bytes, GFP_KERNEL); + + nump = maxpnode - minpnode + 1; + bytes = nump * sizeof(_pnode_to_socket[0]); + _pnode_to_socket = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!_socket_to_node || !_socket_to_pnode || !_pnode_to_socket); + + for (i = 0; i < num; i++) + _socket_to_node[i] = _socket_to_pnode[i] = SOCK_EMPTY; + + for (i = 0; i < nump; i++) + _pnode_to_socket[i] = SOCK_EMPTY; + + /* Fill in pnode/node/addr conversion list values: */ + pr_info("UV: GAM Building socket/pnode conversion tables\n"); + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) + continue; + i = gre->sockid - minsock; + /* Duplicate: */ + if (_socket_to_pnode[i] != SOCK_EMPTY) + continue; + _socket_to_pnode[i] = gre->pnode; + + i = gre->pnode - minpnode; + _pnode_to_socket[i] = gre->sockid; + + pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", + gre->sockid, gre->type, gre->nasid, + _socket_to_pnode[gre->sockid - minsock], + _pnode_to_socket[gre->pnode - minpnode]); + } + + /* Set socket -> node values: */ + lnid = NUMA_NO_NODE; + for_each_present_cpu(cpu) { + int nid = cpu_to_node(cpu); + int apicid, sockid; + + if (lnid == nid) + continue; + lnid = nid; + apicid = per_cpu(x86_cpu_to_apicid, cpu); + sockid = apicid >> uv_cpuid.socketid_shift; + _socket_to_node[sockid - minsock] = nid; + pr_info("UV: sid:%02x: apicid:%04x node:%2d\n", + sockid, apicid, nid); + } + + /* Set up physical blade to pnode translation from GAM Range Table: */ + bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]); + _node_to_pnode = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!_node_to_pnode); + + for (lnid = 0; lnid < num_possible_nodes(); lnid++) { + unsigned short sockid; + + for (sockid = minsock; sockid <= maxsock; sockid++) { + if (lnid == _socket_to_node[sockid - minsock]) { + _node_to_pnode[lnid] = _socket_to_pnode[sockid - minsock]; + break; + } + } + if (sockid > maxsock) { + pr_err("UV: socket for node %d not found!\n", lnid); + BUG(); + } + } + + /* + * If socket id == pnode or socket id == node for all nodes, + * system runs faster by removing corresponding conversion table. + */ + pr_info("UV: Checking socket->node/pnode for identity maps\n"); + if (minsock == 0) { + for (i = 0; i < num; i++) + if (_socket_to_node[i] == SOCK_EMPTY || i != _socket_to_node[i]) + break; + if (i >= num) { + kfree(_socket_to_node); + _socket_to_node = NULL; + pr_info("UV: 1:1 socket_to_node table removed\n"); + } + } + if (minsock == minpnode) { + for (i = 0; i < num; i++) + if (_socket_to_pnode[i] != SOCK_EMPTY && + _socket_to_pnode[i] != i + minpnode) + break; + if (i >= num) { + kfree(_socket_to_pnode); + _socket_to_pnode = NULL; + pr_info("UV: 1:1 socket_to_pnode table removed\n"); + } + } +} + +/* Check which reboot to use */ +static void check_efi_reboot(void) +{ + /* If EFI reboot not available, use ACPI reboot */ + if (!efi_enabled(EFI_BOOT)) + reboot_type = BOOT_ACPI; +} + +/* Setup user proc fs files */ +static int __maybe_unused proc_hubbed_show(struct seq_file *file, void *data) +{ + seq_printf(file, "0x%x\n", uv_hubbed_system); + return 0; +} + +static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data) +{ + seq_printf(file, "0x%x\n", uv_hubless_system); + return 0; +} + +static int __maybe_unused proc_archtype_show(struct seq_file *file, void *data) +{ + seq_printf(file, "%s/%s\n", uv_archtype, oem_table_id); + return 0; +} + +static __init void uv_setup_proc_files(int hubless) +{ + struct proc_dir_entry *pde; + + pde = proc_mkdir(UV_PROC_NODE, NULL); + proc_create_single("archtype", 0, pde, proc_archtype_show); + if (hubless) + proc_create_single("hubless", 0, pde, proc_hubless_show); + else + proc_create_single("hubbed", 0, pde, proc_hubbed_show); +} + +/* Initialize UV hubless systems */ +static __init int uv_system_init_hubless(void) +{ + int rc; + + /* Setup PCH NMI handler */ + uv_nmi_setup_hubless(); + + /* Init kernel/BIOS interface */ + rc = uv_bios_init(); + if (rc < 0) + return rc; + + /* Process UVsystab */ + rc = decode_uv_systab(); + if (rc < 0) + return rc; + + /* Set section block size for current node memory */ + set_block_size(); + + /* Create user access node */ + if (rc >= 0) + uv_setup_proc_files(1); + + check_efi_reboot(); + + return rc; +} + +static void __init uv_system_init_hub(void) +{ + struct uv_hub_info_s hub_info = {0}; + int bytes, cpu, nodeid; + unsigned short min_pnode = 9999, max_pnode = 0; + char *hub = is_uv5_hub() ? "UV500" : + is_uv4_hub() ? "UV400" : + is_uv3_hub() ? "UV300" : + is_uv2_hub() ? "UV2000/3000" : NULL; + + if (!hub) { + pr_err("UV: Unknown/unsupported UV hub\n"); + return; + } + pr_info("UV: Found %s hub\n", hub); + + map_low_mmrs(); + + /* Get uv_systab for decoding, setup UV BIOS calls */ + uv_bios_init(); + + /* If there's an UVsystab problem then abort UV init: */ + if (decode_uv_systab() < 0) { + pr_err("UV: Mangled UVsystab format\n"); + return; + } + + build_socket_tables(); + build_uv_gr_table(); + set_block_size(); + uv_init_hub_info(&hub_info); + uv_possible_blades = num_possible_nodes(); + if (!_node_to_pnode) + boot_init_possible_blades(&hub_info); + + /* uv_num_possible_blades() is really the hub count: */ + pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus()); + + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number); + hub_info.coherency_domain_number = sn_coherency_id; + uv_rtc_init(); + + bytes = sizeof(void *) * uv_num_possible_blades(); + __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL); + BUG_ON(!__uv_hub_info_list); + + bytes = sizeof(struct uv_hub_info_s); + for_each_node(nodeid) { + struct uv_hub_info_s *new_hub; + + if (__uv_hub_info_list[nodeid]) { + pr_err("UV: Node %d UV HUB already initialized!?\n", nodeid); + BUG(); + } + + /* Allocate new per hub info list */ + new_hub = (nodeid == 0) ? &uv_hub_info_node0 : kzalloc_node(bytes, GFP_KERNEL, nodeid); + BUG_ON(!new_hub); + __uv_hub_info_list[nodeid] = new_hub; + new_hub = uv_hub_info_list(nodeid); + BUG_ON(!new_hub); + *new_hub = hub_info; + + /* Use information from GAM table if available: */ + if (_node_to_pnode) + new_hub->pnode = _node_to_pnode[nodeid]; + else /* Or fill in during CPU loop: */ + new_hub->pnode = 0xffff; + + new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); + new_hub->memory_nid = NUMA_NO_NODE; + new_hub->nr_possible_cpus = 0; + new_hub->nr_online_cpus = 0; + } + + /* Initialize per CPU info: */ + for_each_possible_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + int numa_node_id; + unsigned short pnode; + + nodeid = cpu_to_node(cpu); + numa_node_id = numa_cpu_node(cpu); + pnode = uv_apicid_to_pnode(apicid); + + uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); + uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; + if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE) + uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); + + /* Init memoryless node: */ + if (nodeid != numa_node_id && + uv_hub_info_list(numa_node_id)->pnode == 0xffff) + uv_hub_info_list(numa_node_id)->pnode = pnode; + else if (uv_cpu_hub_info(cpu)->pnode == 0xffff) + uv_cpu_hub_info(cpu)->pnode = pnode; + } + + for_each_node(nodeid) { + unsigned short pnode = uv_hub_info_list(nodeid)->pnode; + + /* Add pnode info for pre-GAM list nodes without CPUs: */ + if (pnode == 0xffff) { + unsigned long paddr; + + paddr = node_start_pfn(nodeid) << PAGE_SHIFT; + pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); + uv_hub_info_list(nodeid)->pnode = pnode; + } + min_pnode = min(pnode, min_pnode); + max_pnode = max(pnode, max_pnode); + pr_info("UV: UVHUB node:%2d pn:%02x nrcpus:%d\n", + nodeid, + uv_hub_info_list(nodeid)->pnode, + uv_hub_info_list(nodeid)->nr_possible_cpus); + } + + pr_info("UV: min_pnode:%02x max_pnode:%02x\n", min_pnode, max_pnode); + map_gru_high(max_pnode); + map_mmr_high(max_pnode); + map_mmioh_high(min_pnode, max_pnode); + + uv_nmi_setup(); + uv_cpu_init(); + uv_setup_proc_files(0); + + /* Register Legacy VGA I/O redirection handler: */ + pci_register_set_vga_state(uv_set_vga_state); + + check_efi_reboot(); +} + +/* + * There is a different code path needed to initialize a UV system that does + * not have a "UV HUB" (referred to as "hubless"). + */ +void __init uv_system_init(void) +{ + if (likely(!is_uv_system() && !is_uv_hubless(1))) + return; + + if (is_uv_system()) + uv_system_init_hub(); + else + uv_system_init_hubless(); +} + +apic_driver(apic_x2apic_uv_x); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c new file mode 100644 index 000000000..166d9991e --- /dev/null +++ b/arch/x86/kernel/apm_32.c @@ -0,0 +1,2432 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* -*- linux-c -*- + * APM BIOS driver for Linux + * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au) + * + * Initial development of this driver was funded by NEC Australia P/L + * and NEC Corporation + * + * October 1995, Rik Faith (faith@cs.unc.edu): + * Minor enhancements and updates (to the patch set) for 1.3.x + * Documentation + * January 1996, Rik Faith (faith@cs.unc.edu): + * Make /proc/apm easy to format (bump driver version) + * March 1996, Rik Faith (faith@cs.unc.edu): + * Prohibit APM BIOS calls unless apm_enabled. + * (Thanks to Ulrich Windl ) + * April 1996, Stephen Rothwell (sfr@canb.auug.org.au) + * Version 1.0 and 1.1 + * May 1996, Version 1.2 + * Feb 1998, Version 1.3 + * Feb 1998, Version 1.4 + * Aug 1998, Version 1.5 + * Sep 1998, Version 1.6 + * Nov 1998, Version 1.7 + * Jan 1999, Version 1.8 + * Jan 1999, Version 1.9 + * Oct 1999, Version 1.10 + * Nov 1999, Version 1.11 + * Jan 2000, Version 1.12 + * Feb 2000, Version 1.13 + * Nov 2000, Version 1.14 + * Oct 2001, Version 1.15 + * Jan 2002, Version 1.16 + * Oct 2002, Version 1.16ac + * + * History: + * 0.6b: first version in official kernel, Linux 1.3.46 + * 0.7: changed /proc/apm format, Linux 1.3.58 + * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59 + * 0.9: only call bios if bios is present, Linux 1.3.72 + * 1.0: use fixed device number, consolidate /proc/apm into this file, + * Linux 1.3.85 + * 1.1: support user-space standby and suspend, power off after system + * halted, Linux 1.3.98 + * 1.2: When resetting RTC after resume, take care so that the time + * is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth + * ); improve interaction between + * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4 + * 1.2a:Simple change to stop mysterious bug reports with SMP also added + * levels to the printk calls. APM is not defined for SMP machines. + * The new replacement for it is, but Linux doesn't yet support this. + * Alan Cox Linux 2.1.55 + * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's + * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by + * Dean Gaudet . + * C. Scott Ananian Linux 2.1.87 + * 1.5: Fix segment register reloading (in case of bad segments saved + * across BIOS call). + * Stephen Rothwell + * 1.6: Cope with compiler/assembler differences. + * Only try to turn off the first display device. + * Fix OOPS at power off with no APM BIOS by Jan Echternach + * + * Stephen Rothwell + * 1.7: Modify driver's cached copy of the disabled/disengaged flags + * to reflect current state of APM BIOS. + * Chris Rankin + * Reset interrupt 0 timer to 100Hz after suspend + * Chad Miller + * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE + * Richard Gooch + * Allow boot time disabling of APM + * Make boot messages far less verbose by default + * Make asm safer + * Stephen Rothwell + * 1.8: Add CONFIG_APM_RTC_IS_GMT + * Richard Gooch + * change APM_NOINTS to CONFIG_APM_ALLOW_INTS + * remove dependency on CONFIG_PROC_FS + * Stephen Rothwell + * 1.9: Fix small typo. + * Try to cope with BIOS's that need to have all display + * devices blanked and not just the first one. + * Ross Paterson + * Fix segment limit setting it has always been wrong as + * the segments needed to have byte granularity. + * Mark a few things __init. + * Add hack to allow power off of SMP systems by popular request. + * Use CONFIG_SMP instead of __SMP__ + * Ignore BOUNCES for three seconds. + * Stephen Rothwell + * 1.10: Fix for Thinkpad return code. + * Merge 2.2 and 2.3 drivers. + * Remove APM dependencies in arch/i386/kernel/process.c + * Remove APM dependencies in drivers/char/sysrq.c + * Reset time across standby. + * Allow more inititialisation on SMP. + * Remove CONFIG_APM_POWER_OFF and make it boot time + * configurable (default on). + * Make debug only a boot time parameter (remove APM_DEBUG). + * Try to blank all devices on any error. + * 1.11: Remove APM dependencies in drivers/char/console.c + * Check nr_running to detect if we are idle (from + * Borislav Deianov ) + * Fix for bioses that don't zero the top part of the + * entrypoint offset (Mario Sitta ) + * (reported by Panos Katsaloulis ). + * Real mode power off patch (Walter Hofmann + * ). + * 1.12: Remove CONFIG_SMP as the compiler will optimize + * the code away anyway (smp_num_cpus == 1 in UP) + * noted by Artur Skawina . + * Make power off under SMP work again. + * Fix thinko with initial engaging of BIOS. + * Make sure power off only happens on CPU 0 + * (Paul "Rusty" Russell ). + * Do error notification to user mode if BIOS calls fail. + * Move entrypoint offset fix to ...boot/setup.S + * where it belongs (Cosmos ). + * Remove smp-power-off. SMP users must now specify + * "apm=power-off" on the kernel command line. Suggested + * by Jim Avera , modified by Alan Cox + * . + * Register the /proc/apm entry even on SMP so that + * scripts that check for it before doing power off + * work (Jim Avera ). + * 1.13: Changes for new pm_ interfaces (Andy Henroid + * ). + * Modularize the code. + * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS + * is now the way life works). + * Fix thinko in suspend() (wrong return). + * Notify drivers on critical suspend. + * Make kapmd absorb more idle time (Pavel Machek + * modified by sfr). + * Disable interrupts while we are suspended (Andy Henroid + * fixed by sfr). + * Make power off work on SMP again (Tony Hoyle + * and ) modified by sfr. + * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore + * interval is now configurable. + * 1.14: Make connection version persist across module unload/load. + * Enable and engage power management earlier. + * Disengage power management on module unload. + * Changed to use the sysrq-register hack for registering the + * power off function called by magic sysrq based upon discussions + * in irc://irc.openprojects.net/#kernelnewbies + * (Crutcher Dunnavant ). + * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable. + * (Arjan van de Ven ) modified by sfr. + * Work around byte swap bug in one of the Vaio's BIOS's + * (Marc Boucher ). + * Exposed the disable flag to dmi so that we can handle known + * broken APM (Alan Cox ). + * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin + * calling it - instead idle. (Alan Cox ) + * If an APM idle fails log it and idle sensibly + * 1.15: Don't queue events to clients who open the device O_WRONLY. + * Don't expect replies from clients who open the device O_RDONLY. + * (Idea from Thomas Hood) + * Minor waitqueue cleanups. (John Fremlin ) + * 1.16: Fix idle calling. (Andreas Steinmetz et al.) + * Notify listeners of standby or suspend events before notifying + * drivers. Return EBUSY to ioctl() if suspend is rejected. + * (Russell King and Thomas Hood) + * Ignore first resume after we generate our own resume event + * after a suspend (Thomas Hood) + * Daemonize now gets rid of our controlling terminal (sfr). + * CONFIG_APM_CPU_IDLE now just affects the default value of + * idle_threshold (sfr). + * Change name of kernel apm daemon (as it no longer idles) (sfr). + * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we + * make _all_ APM calls on the CPU#0. Fix unsafe sign bug. + * TODO: determine if its "boot CPU" or "CPU0" we want to lock to. + * + * APM 1.1 Reference: + * + * Intel Corporation, Microsoft Corporation. Advanced Power Management + * (APM) BIOS Interface Specification, Revision 1.1, September 1993. + * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. + * + * [This document is available free from Intel by calling 800.628.8686 (fax + * 916.356.6100) or 800.548.4725; or from + * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also + * available from Microsoft by calling 206.882.8080.] + * + * APM 1.2 Reference: + * Intel Corporation, Microsoft Corporation. Advanced Power Management + * (APM) BIOS Interface Specification, Revision 1.2, February 1996. + * + * [This document is available from Microsoft at: + * http://www.microsoft.com/whdc/archive/amp_12.mspx] + */ + +#define pr_fmt(fmt) "apm: " fmt + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) +extern int (*console_blank_hook)(int); +#endif + +/* + * Various options can be changed at boot time as follows: + * (We allow underscores for compatibility with the modules code) + * apm=on/off enable/disable APM + * [no-]allow[-_]ints allow interrupts during BIOS calls + * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call + * [no-]realmode[-_]power[-_]off switch to real mode before + * powering off + * [no-]debug log some debugging messages + * [no-]power[-_]off power off on shutdown + * [no-]smp Use apm even on an SMP box + * bounce[-_]interval= number of ticks to ignore suspend + * bounces + * idle[-_]threshold= System idle percentage above which to + * make APM BIOS idle calls. Set it to + * 100 to disable. + * idle[-_]period= Period (in 1/100s of a second) over + * which the idle percentage is + * calculated. + */ + +/* KNOWN PROBLEM MACHINES: + * + * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant + * [Confirmed by TI representative] + * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification + * [Confirmed by BIOS disassembly] + * [This may work now ...] + * P: Toshiba 1950S: battery life information only gets updated after resume + * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking + * broken in BIOS [Reported by Garst R. Reese ] + * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP + * Neale Banks December 2000 + * + * Legend: U = unusable with APM patches + * P = partially usable with APM patches + */ + +/* + * Define as 1 to make the driver always call the APM BIOS busy + * routine even if the clock was not reported as slowed by the + * idle routine. Otherwise, define as 0. + */ +#define ALWAYS_CALL_BUSY 1 + +/* + * Define to make the APM BIOS calls zero all data segment registers (so + * that an incorrect BIOS implementation will cause a kernel panic if it + * tries to write to arbitrary memory). + */ +#define APM_ZERO_SEGS + +#include + +/* + * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend. + * This patched by Chad Miller , original code by + * David Chen + */ +#undef INIT_TIMER_AFTER_SUSPEND + +#ifdef INIT_TIMER_AFTER_SUSPEND +#include +#include +#include +#endif + +/* + * Need to poll the APM BIOS every second + */ +#define APM_CHECK_TIMEOUT (HZ) + +/* + * Ignore suspend events for this amount of time after a resume + */ +#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) + +/* + * Maximum number of events stored + */ +#define APM_MAX_EVENTS 20 + +/* + * The per-file APM data + */ +struct apm_user { + int magic; + struct apm_user *next; + unsigned int suser: 1; + unsigned int writer: 1; + unsigned int reader: 1; + unsigned int suspend_wait: 1; + int suspend_result; + int suspends_pending; + int standbys_pending; + int suspends_read; + int standbys_read; + int event_head; + int event_tail; + apm_event_t events[APM_MAX_EVENTS]; +}; + +/* + * The magic number in apm_user + */ +#define APM_BIOS_MAGIC 0x4101 + +/* + * idle percentage above which bios idle calls are done + */ +#ifdef CONFIG_APM_CPU_IDLE +#define DEFAULT_IDLE_THRESHOLD 95 +#else +#define DEFAULT_IDLE_THRESHOLD 100 +#endif +#define DEFAULT_IDLE_PERIOD (100 / 3) + +static int apm_cpu_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index); + +static struct cpuidle_driver apm_idle_driver = { + .name = "apm_idle", + .owner = THIS_MODULE, + .states = { + { /* entry 0 is for polling */ }, + { /* entry 1 is for APM idle */ + .name = "APM", + .desc = "APM idle", + .exit_latency = 250, /* WAG */ + .target_residency = 500, /* WAG */ + .enter = &apm_cpu_idle + }, + }, + .state_count = 2, +}; + +static struct cpuidle_device apm_cpuidle_device; + +/* + * Local variables + */ +__visible struct { + unsigned long offset; + unsigned short segment; +} apm_bios_entry; +static int clock_slowed; +static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; +static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; +static int suspends_pending; +static int standbys_pending; +static int ignore_sys_suspend; +static int ignore_normal_resume; +static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; + +static bool debug __read_mostly; +static bool smp __read_mostly; +static int apm_disabled = -1; +#ifdef CONFIG_SMP +static bool power_off; +#else +static bool power_off = 1; +#endif +static bool realmode_power_off; +#ifdef CONFIG_APM_ALLOW_INTS +static bool allow_ints = 1; +#else +static bool allow_ints; +#endif +static bool broken_psr; + +static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); +static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); +static struct apm_user *user_list; +static DEFINE_SPINLOCK(user_list_lock); +static DEFINE_MUTEX(apm_mutex); + +/* + * Set up a segment that references the real mode segment 0x40 + * that extends up to the end of page zero (that we have reserved). + * This is for buggy BIOS's that refer to (real mode) segment 0x40 + * even though they are called in protected mode. + */ +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, + (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); + +static const char driver_version[] = "1.16ac"; /* no spaces */ + +static struct task_struct *kapmd_task; + +/* + * APM event names taken from the APM 1.2 specification. These are + * the message codes that the BIOS uses to tell us about events + */ +static const char * const apm_event_name[] = { + "system standby", + "system suspend", + "normal resume", + "critical resume", + "low battery", + "power status change", + "update time", + "critical suspend", + "user standby", + "user suspend", + "system standby resume", + "capabilities change" +}; +#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name) + +typedef struct lookup_t { + int key; + char *msg; +} lookup_t; + +/* + * The BIOS returns a set of standard error codes in AX when the + * carry flag is set. + */ + +static const lookup_t error_table[] = { +/* N/A { APM_SUCCESS, "Operation succeeded" }, */ + { APM_DISABLED, "Power management disabled" }, + { APM_CONNECTED, "Real mode interface already connected" }, + { APM_NOT_CONNECTED, "Interface not connected" }, + { APM_16_CONNECTED, "16 bit interface already connected" }, +/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */ + { APM_32_CONNECTED, "32 bit interface already connected" }, + { APM_32_UNSUPPORTED, "32 bit interface not supported" }, + { APM_BAD_DEVICE, "Unrecognized device ID" }, + { APM_BAD_PARAM, "Parameter out of range" }, + { APM_NOT_ENGAGED, "Interface not engaged" }, + { APM_BAD_FUNCTION, "Function not supported" }, + { APM_RESUME_DISABLED, "Resume timer disabled" }, + { APM_BAD_STATE, "Unable to enter requested state" }, +/* N/A { APM_NO_EVENTS, "No events pending" }, */ + { APM_NO_ERROR, "BIOS did not set a return code" }, + { APM_NOT_PRESENT, "No APM present" } +}; +#define ERROR_COUNT ARRAY_SIZE(error_table) + +/** + * apm_error - display an APM error + * @str: information string + * @err: APM BIOS return code + * + * Write a meaningful log entry to the kernel log in the event of + * an APM error. Note that this also handles (negative) kernel errors. + */ + +static void apm_error(char *str, int err) +{ + int i; + + for (i = 0; i < ERROR_COUNT; i++) + if (error_table[i].key == err) + break; + if (i < ERROR_COUNT) + pr_notice("%s: %s\n", str, error_table[i].msg); + else if (err < 0) + pr_notice("%s: linux error code %i\n", str, err); + else + pr_notice("%s: unknown error code %#2.2x\n", + str, err); +} + +/* + * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and + * apm_info.allow_ints, we are being really paranoid here! Not only + * are interrupts disabled, but all the segment registers (except SS) + * are saved and zeroed this means that if the BIOS tries to reference + * any data without explicitly loading the segment registers, the kernel + * will fault immediately rather than have some unforeseen circumstances + * for the rest of the kernel. And it will be very obvious! :-) Doing + * this depends on CS referring to the same physical memory as DS so that + * DS can be zeroed before the call. Unfortunately, we can't do anything + * about the stack segment/pointer. Also, we tell the compiler that + * everything could change. + * + * Also, we KNOW that for the non error case of apm_bios_call, there + * is no useful data returned in the low order 8 bits of eax. + */ + +static inline unsigned long __apm_irq_save(void) +{ + unsigned long flags; + local_save_flags(flags); + if (apm_info.allow_ints) { + if (irqs_disabled_flags(flags)) + local_irq_enable(); + } else + local_irq_disable(); + + return flags; +} + +#define apm_irq_save(flags) \ + do { flags = __apm_irq_save(); } while (0) + +static inline void apm_irq_restore(unsigned long flags) +{ + if (irqs_disabled_flags(flags)) + local_irq_disable(); + else if (irqs_disabled()) + local_irq_enable(); +} + +#ifdef APM_ZERO_SEGS +# define APM_DECL_SEGS \ + unsigned int saved_fs; unsigned int saved_gs; +# define APM_DO_SAVE_SEGS \ + savesegment(fs, saved_fs); savesegment(gs, saved_gs) +# define APM_DO_RESTORE_SEGS \ + loadsegment(fs, saved_fs); loadsegment(gs, saved_gs) +#else +# define APM_DECL_SEGS +# define APM_DO_SAVE_SEGS +# define APM_DO_RESTORE_SEGS +#endif + +struct apm_bios_call { + u32 func; + /* In and out */ + u32 ebx; + u32 ecx; + /* Out only */ + u32 eax; + u32 edx; + u32 esi; + + /* Error: -ENOMEM, or bits 8-15 of eax */ + int err; +}; + +/** + * __apm_bios_call - Make an APM BIOS 32bit call + * @_call: pointer to struct apm_bios_call. + * + * Make an APM call using the 32bit protected mode interface. The + * caller is responsible for knowing if APM BIOS is configured and + * enabled. This call can disable interrupts for a long period of + * time on some laptops. The return value is in AH and the carry + * flag is loaded into AL. If there is an error, then the error + * code is returned in AH (bits 8-15 of eax) and this function + * returns non-zero. + * + * Note: this makes the call on the current CPU. + */ +static long __apm_bios_call(void *_call) +{ + APM_DECL_SEGS + unsigned long flags; + int cpu; + struct desc_struct save_desc_40; + struct desc_struct *gdt; + struct apm_bios_call *call = _call; + + cpu = get_cpu(); + BUG_ON(cpu != 0); + gdt = get_cpu_gdt_rw(cpu); + save_desc_40 = gdt[0x40 / 8]; + gdt[0x40 / 8] = bad_bios_desc; + + apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); + APM_DO_SAVE_SEGS; + apm_bios_call_asm(call->func, call->ebx, call->ecx, + &call->eax, &call->ebx, &call->ecx, &call->edx, + &call->esi); + APM_DO_RESTORE_SEGS; + firmware_restrict_branch_speculation_end(); + apm_irq_restore(flags); + gdt[0x40 / 8] = save_desc_40; + put_cpu(); + + return call->eax & 0xff; +} + +/* Run __apm_bios_call or __apm_bios_call_simple on CPU 0 */ +static int on_cpu0(long (*fn)(void *), struct apm_bios_call *call) +{ + int ret; + + /* Don't bother with work_on_cpu in the common case, so we don't + * have to worry about OOM or overhead. */ + if (get_cpu() == 0) { + ret = fn(call); + put_cpu(); + } else { + put_cpu(); + ret = work_on_cpu(0, fn, call); + } + + /* work_on_cpu can fail with -ENOMEM */ + if (ret < 0) + call->err = ret; + else + call->err = (call->eax >> 8) & 0xff; + + return ret; +} + +/** + * apm_bios_call - Make an APM BIOS 32bit call (on CPU 0) + * @call: the apm_bios_call registers. + * + * If there is an error, it is returned in @call.err. + */ +static int apm_bios_call(struct apm_bios_call *call) +{ + return on_cpu0(__apm_bios_call, call); +} + +/** + * __apm_bios_call_simple - Make an APM BIOS 32bit call (on CPU 0) + * @_call: pointer to struct apm_bios_call. + * + * Make a BIOS call that returns one value only, or just status. + * If there is an error, then the error code is returned in AH + * (bits 8-15 of eax) and this function returns non-zero (it can + * also return -ENOMEM). This is used for simpler BIOS operations. + * This call may hold interrupts off for a long time on some laptops. + * + * Note: this makes the call on the current CPU. + */ +static long __apm_bios_call_simple(void *_call) +{ + u8 error; + APM_DECL_SEGS + unsigned long flags; + int cpu; + struct desc_struct save_desc_40; + struct desc_struct *gdt; + struct apm_bios_call *call = _call; + + cpu = get_cpu(); + BUG_ON(cpu != 0); + gdt = get_cpu_gdt_rw(cpu); + save_desc_40 = gdt[0x40 / 8]; + gdt[0x40 / 8] = bad_bios_desc; + + apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); + APM_DO_SAVE_SEGS; + error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx, + &call->eax); + APM_DO_RESTORE_SEGS; + firmware_restrict_branch_speculation_end(); + apm_irq_restore(flags); + gdt[0x40 / 8] = save_desc_40; + put_cpu(); + return error; +} + +/** + * apm_bios_call_simple - make a simple APM BIOS 32bit call + * @func: APM function to invoke + * @ebx_in: EBX register value for BIOS call + * @ecx_in: ECX register value for BIOS call + * @eax: EAX register on return from the BIOS call + * @err: bits + * + * Make a BIOS call that returns one value only, or just status. + * If there is an error, then the error code is returned in @err + * and this function returns non-zero. This is used for simpler + * BIOS operations. This call may hold interrupts off for a long + * time on some laptops. + */ +static int apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax, + int *err) +{ + struct apm_bios_call call; + int ret; + + call.func = func; + call.ebx = ebx_in; + call.ecx = ecx_in; + + ret = on_cpu0(__apm_bios_call_simple, &call); + *eax = call.eax; + *err = call.err; + return ret; +} + +/** + * apm_driver_version - APM driver version + * @val: loaded with the APM version on return + * + * Retrieve the APM version supported by the BIOS. This is only + * supported for APM 1.1 or higher. An error indicates APM 1.0 is + * probably present. + * + * On entry val should point to a value indicating the APM driver + * version with the high byte being the major and the low byte the + * minor number both in BCD + * + * On return it will hold the BIOS revision supported in the + * same format. + */ + +static int apm_driver_version(u_short *val) +{ + u32 eax; + int err; + + if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax, &err)) + return err; + *val = eax; + return APM_SUCCESS; +} + +/** + * apm_get_event - get an APM event from the BIOS + * @event: pointer to the event + * @info: point to the event information + * + * The APM BIOS provides a polled information for event + * reporting. The BIOS expects to be polled at least every second + * when events are pending. When a message is found the caller should + * poll until no more messages are present. However, this causes + * problems on some laptops where a suspend event notification is + * not cleared until it is acknowledged. + * + * Additional information is returned in the info pointer, providing + * that APM 1.2 is in use. If no messges are pending the value 0x80 + * is returned (No power management events pending). + */ +static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) +{ + struct apm_bios_call call; + + call.func = APM_FUNC_GET_EVENT; + call.ebx = call.ecx = 0; + + if (apm_bios_call(&call)) + return call.err; + + *event = call.ebx; + if (apm_info.connection_version < 0x0102) + *info = ~0; /* indicate info not valid */ + else + *info = call.ecx; + return APM_SUCCESS; +} + +/** + * set_power_state - set the power management state + * @what: which items to transition + * @state: state to transition to + * + * Request an APM change of state for one or more system devices. The + * processor state must be transitioned last of all. what holds the + * class of device in the upper byte and the device number (0xFF for + * all) for the object to be transitioned. + * + * The state holds the state to transition to, which may in fact + * be an acceptance of a BIOS requested state change. + */ + +static int set_power_state(u_short what, u_short state) +{ + u32 eax; + int err; + + if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax, &err)) + return err; + return APM_SUCCESS; +} + +/** + * set_system_power_state - set system wide power state + * @state: which state to enter + * + * Transition the entire system into a new APM power state. + */ + +static int set_system_power_state(u_short state) +{ + return set_power_state(APM_DEVICE_ALL, state); +} + +/** + * apm_do_idle - perform power saving + * + * This function notifies the BIOS that the processor is (in the view + * of the OS) idle. It returns -1 in the event that the BIOS refuses + * to handle the idle request. On a success the function returns 1 + * if the BIOS did clock slowing or 0 otherwise. + */ + +static int apm_do_idle(void) +{ + u32 eax; + u8 ret = 0; + int idled = 0; + int err = 0; + + if (!need_resched()) { + idled = 1; + ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err); + } + + if (!idled) + return 0; + + if (ret) { + static unsigned long t; + + /* This always fails on some SMP boards running UP kernels. + * Only report the failure the first 5 times. + */ + if (++t < 5) { + printk(KERN_DEBUG "apm_do_idle failed (%d)\n", err); + t = jiffies; + } + return -1; + } + clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0; + return clock_slowed; +} + +/** + * apm_do_busy - inform the BIOS the CPU is busy + * + * Request that the BIOS brings the CPU back to full performance. + */ + +static void apm_do_busy(void) +{ + u32 dummy; + int err; + + if (clock_slowed || ALWAYS_CALL_BUSY) { + (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy, &err); + clock_slowed = 0; + } +} + +/* + * If no process has really been interested in + * the CPU for some time, we want to call BIOS + * power management - we probably want + * to conserve power. + */ +#define IDLE_CALC_LIMIT (HZ * 100) +#define IDLE_LEAKY_MAX 16 + +/** + * apm_cpu_idle - cpu idling for APM capable Linux + * + * This is the idling function the kernel executes when APM is available. It + * tries to do BIOS powermanagement based on the average system idle time. + * Furthermore it calls the system default idle routine. + */ + +static int apm_cpu_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + static int use_apm_idle; /* = 0 */ + static unsigned int last_jiffies; /* = 0 */ + static u64 last_stime; /* = 0 */ + u64 stime, utime; + + int apm_idle_done = 0; + unsigned int jiffies_since_last_check = jiffies - last_jiffies; + unsigned int bucket; + +recalc: + task_cputime(current, &utime, &stime); + if (jiffies_since_last_check > IDLE_CALC_LIMIT) { + use_apm_idle = 0; + } else if (jiffies_since_last_check > idle_period) { + unsigned int idle_percentage; + + idle_percentage = nsecs_to_jiffies(stime - last_stime); + idle_percentage *= 100; + idle_percentage /= jiffies_since_last_check; + use_apm_idle = (idle_percentage > idle_threshold); + if (apm_info.forbid_idle) + use_apm_idle = 0; + } + + last_jiffies = jiffies; + last_stime = stime; + + bucket = IDLE_LEAKY_MAX; + + while (!need_resched()) { + if (use_apm_idle) { + unsigned int t; + + t = jiffies; + switch (apm_do_idle()) { + case 0: + apm_idle_done = 1; + if (t != jiffies) { + if (bucket) { + bucket = IDLE_LEAKY_MAX; + continue; + } + } else if (bucket) { + bucket--; + continue; + } + break; + case 1: + apm_idle_done = 1; + break; + default: /* BIOS refused */ + break; + } + } + default_idle(); + local_irq_disable(); + jiffies_since_last_check = jiffies - last_jiffies; + if (jiffies_since_last_check > idle_period) + goto recalc; + } + + if (apm_idle_done) + apm_do_busy(); + + return index; +} + +/** + * apm_power_off - ask the BIOS to power off + * + * Handle the power off sequence. This is the one piece of code we + * will execute even on SMP machines. In order to deal with BIOS + * bugs we support real mode APM BIOS power off calls. We also make + * the SMP call on CPU0 as some systems will only honour this call + * on their first cpu. + */ + +static void apm_power_off(void) +{ + /* Some bioses don't like being called from CPU != 0 */ + if (apm_info.realmode_power_off) { + set_cpus_allowed_ptr(current, cpumask_of(0)); + machine_real_restart(MRR_APM); + } else { + (void)set_system_power_state(APM_STATE_OFF); + } +} + +#ifdef CONFIG_APM_DO_ENABLE + +/** + * apm_enable_power_management - enable BIOS APM power management + * @enable: enable yes/no + * + * Enable or disable the APM BIOS power services. + */ + +static int apm_enable_power_management(int enable) +{ + u32 eax; + int err; + + if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) + return APM_NOT_ENGAGED; + if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, + enable, &eax, &err)) + return err; + if (enable) + apm_info.bios.flags &= ~APM_BIOS_DISABLED; + else + apm_info.bios.flags |= APM_BIOS_DISABLED; + return APM_SUCCESS; +} +#endif + +/** + * apm_get_power_status - get current power state + * @status: returned status + * @bat: battery info + * @life: estimated life + * + * Obtain the current power status from the APM BIOS. We return a + * status which gives the rough battery status, and current power + * source. The bat value returned give an estimate as a percentage + * of life and a status value for the battery. The estimated life + * if reported is a lifetime in secodnds/minutes at current powwer + * consumption. + */ + +static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) +{ + struct apm_bios_call call; + + call.func = APM_FUNC_GET_STATUS; + call.ebx = APM_DEVICE_ALL; + call.ecx = 0; + + if (apm_info.get_power_status_broken) + return APM_32_UNSUPPORTED; + if (apm_bios_call(&call)) { + if (!call.err) + return APM_NO_ERROR; + return call.err; + } + *status = call.ebx; + *bat = call.ecx; + if (apm_info.get_power_status_swabinminutes) { + *life = swab16((u16)call.edx); + *life |= 0x8000; + } else + *life = call.edx; + return APM_SUCCESS; +} + +#if 0 +static int apm_get_battery_status(u_short which, u_short *status, + u_short *bat, u_short *life, u_short *nbat) +{ + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esi; + + if (apm_info.connection_version < 0x0102) { + /* pretend we only have one battery. */ + if (which != 1) + return APM_BAD_DEVICE; + *nbat = 1; + return apm_get_power_status(status, bat, life); + } + + if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax, + &ebx, &ecx, &edx, &esi)) + return (eax >> 8) & 0xff; + *status = ebx; + *bat = ecx; + *life = edx; + *nbat = esi; + return APM_SUCCESS; +} +#endif + +/** + * apm_engage_power_management - enable PM on a device + * @device: identity of device + * @enable: on/off + * + * Activate or deactivate power management on either a specific device + * or the entire system (%APM_DEVICE_ALL). + */ + +static int apm_engage_power_management(u_short device, int enable) +{ + u32 eax; + int err; + + if ((enable == 0) && (device == APM_DEVICE_ALL) + && (apm_info.bios.flags & APM_BIOS_DISABLED)) + return APM_DISABLED; + if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, + &eax, &err)) + return err; + if (device == APM_DEVICE_ALL) { + if (enable) + apm_info.bios.flags &= ~APM_BIOS_DISENGAGED; + else + apm_info.bios.flags |= APM_BIOS_DISENGAGED; + } + return APM_SUCCESS; +} + +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + +/** + * apm_console_blank - blank the display + * @blank: on/off + * + * Attempt to blank the console, firstly by blanking just video device + * zero, and if that fails (some BIOSes don't support it) then it blanks + * all video devices. Typically the BIOS will do laptop backlight and + * monitor powerdown for us. + */ + +static int apm_console_blank(int blank) +{ + int error = APM_NOT_ENGAGED; /* silence gcc */ + int i; + u_short state; + static const u_short dev[3] = { 0x100, 0x1FF, 0x101 }; + + state = blank ? APM_STATE_STANDBY : APM_STATE_READY; + + for (i = 0; i < ARRAY_SIZE(dev); i++) { + error = set_power_state(dev[i], state); + + if ((error == APM_SUCCESS) || (error == APM_NO_ERROR)) + return 1; + + if (error == APM_NOT_ENGAGED) + break; + } + + if (error == APM_NOT_ENGAGED) { + static int tried; + int eng_error; + if (tried++ == 0) { + eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1); + if (eng_error) { + apm_error("set display", error); + apm_error("engage interface", eng_error); + return 0; + } else + return apm_console_blank(blank); + } + } + apm_error("set display", error); + return 0; +} +#endif + +static int queue_empty(struct apm_user *as) +{ + return as->event_head == as->event_tail; +} + +static apm_event_t get_queued_event(struct apm_user *as) +{ + if (++as->event_tail >= APM_MAX_EVENTS) + as->event_tail = 0; + return as->events[as->event_tail]; +} + +static void queue_event(apm_event_t event, struct apm_user *sender) +{ + struct apm_user *as; + + spin_lock(&user_list_lock); + if (user_list == NULL) + goto out; + for (as = user_list; as != NULL; as = as->next) { + if ((as == sender) || (!as->reader)) + continue; + if (++as->event_head >= APM_MAX_EVENTS) + as->event_head = 0; + + if (as->event_head == as->event_tail) { + static int notified; + + if (notified++ == 0) + pr_err("an event queue overflowed\n"); + if (++as->event_tail >= APM_MAX_EVENTS) + as->event_tail = 0; + } + as->events[as->event_head] = event; + if (!as->suser || !as->writer) + continue; + switch (event) { + case APM_SYS_SUSPEND: + case APM_USER_SUSPEND: + as->suspends_pending++; + suspends_pending++; + break; + + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + as->standbys_pending++; + standbys_pending++; + break; + } + } + wake_up_interruptible(&apm_waitqueue); +out: + spin_unlock(&user_list_lock); +} + +static void reinit_timer(void) +{ +#ifdef INIT_TIMER_AFTER_SUSPEND + unsigned long flags; + + raw_spin_lock_irqsave(&i8253_lock, flags); + /* set the clock to HZ */ + outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + udelay(10); + outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ + udelay(10); + outb_p(LATCH >> 8, PIT_CH0); /* MSB */ + udelay(10); + raw_spin_unlock_irqrestore(&i8253_lock, flags); +#endif +} + +static int suspend(int vetoable) +{ + int err; + struct apm_user *as; + + dpm_suspend_start(PMSG_SUSPEND); + dpm_suspend_end(PMSG_SUSPEND); + + local_irq_disable(); + syscore_suspend(); + + local_irq_enable(); + + save_processor_state(); + err = set_system_power_state(APM_STATE_SUSPEND); + ignore_normal_resume = 1; + restore_processor_state(); + + local_irq_disable(); + reinit_timer(); + + if (err == APM_NO_ERROR) + err = APM_SUCCESS; + if (err != APM_SUCCESS) + apm_error("suspend", err); + err = (err == APM_SUCCESS) ? 0 : -EIO; + + syscore_resume(); + local_irq_enable(); + + dpm_resume_start(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); + + queue_event(APM_NORMAL_RESUME, NULL); + spin_lock(&user_list_lock); + for (as = user_list; as != NULL; as = as->next) { + as->suspend_wait = 0; + as->suspend_result = err; + } + spin_unlock(&user_list_lock); + wake_up_interruptible(&apm_suspend_waitqueue); + return err; +} + +static void standby(void) +{ + int err; + + dpm_suspend_end(PMSG_SUSPEND); + + local_irq_disable(); + syscore_suspend(); + local_irq_enable(); + + err = set_system_power_state(APM_STATE_STANDBY); + if ((err != APM_SUCCESS) && (err != APM_NO_ERROR)) + apm_error("standby", err); + + local_irq_disable(); + syscore_resume(); + local_irq_enable(); + + dpm_resume_start(PMSG_RESUME); +} + +static apm_event_t get_event(void) +{ + int error; + apm_event_t event = APM_NO_EVENTS; /* silence gcc */ + apm_eventinfo_t info; + + static int notified; + + /* we don't use the eventinfo */ + error = apm_get_event(&event, &info); + if (error == APM_SUCCESS) + return event; + + if ((error != APM_NO_EVENTS) && (notified++ == 0)) + apm_error("get_event", error); + + return 0; +} + +static void check_events(void) +{ + apm_event_t event; + static unsigned long last_resume; + static int ignore_bounce; + + while ((event = get_event()) != 0) { + if (debug) { + if (event <= NR_APM_EVENT_NAME) + printk(KERN_DEBUG "apm: received %s notify\n", + apm_event_name[event - 1]); + else + printk(KERN_DEBUG "apm: received unknown " + "event 0x%02x\n", event); + } + if (ignore_bounce + && (time_after(jiffies, last_resume + bounce_interval))) + ignore_bounce = 0; + + switch (event) { + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + queue_event(event, NULL); + if (standbys_pending <= 0) + standby(); + break; + + case APM_USER_SUSPEND: +#ifdef CONFIG_APM_IGNORE_USER_SUSPEND + if (apm_info.connection_version > 0x100) + set_system_power_state(APM_STATE_REJECT); + break; +#endif + case APM_SYS_SUSPEND: + if (ignore_bounce) { + if (apm_info.connection_version > 0x100) + set_system_power_state(APM_STATE_REJECT); + break; + } + /* + * If we are already processing a SUSPEND, + * then further SUSPEND events from the BIOS + * will be ignored. We also return here to + * cope with the fact that the Thinkpads keep + * sending a SUSPEND event until something else + * happens! + */ + if (ignore_sys_suspend) + return; + ignore_sys_suspend = 1; + queue_event(event, NULL); + if (suspends_pending <= 0) + (void) suspend(1); + break; + + case APM_NORMAL_RESUME: + case APM_CRITICAL_RESUME: + case APM_STANDBY_RESUME: + ignore_sys_suspend = 0; + last_resume = jiffies; + ignore_bounce = 1; + if ((event != APM_NORMAL_RESUME) + || (ignore_normal_resume == 0)) { + dpm_resume_end(PMSG_RESUME); + queue_event(event, NULL); + } + ignore_normal_resume = 0; + break; + + case APM_CAPABILITY_CHANGE: + case APM_LOW_BATTERY: + case APM_POWER_STATUS_CHANGE: + queue_event(event, NULL); + /* If needed, notify drivers here */ + break; + + case APM_UPDATE_TIME: + break; + + case APM_CRITICAL_SUSPEND: + /* + * We are not allowed to reject a critical suspend. + */ + (void)suspend(0); + break; + } + } +} + +static void apm_event_handler(void) +{ + static int pending_count = 4; + int err; + + if ((standbys_pending > 0) || (suspends_pending > 0)) { + if ((apm_info.connection_version > 0x100) && + (pending_count-- <= 0)) { + pending_count = 4; + if (debug) + printk(KERN_DEBUG "apm: setting state busy\n"); + err = set_system_power_state(APM_STATE_BUSY); + if (err) + apm_error("busy", err); + } + } else + pending_count = 4; + check_events(); +} + +/* + * This is the APM thread main loop. + */ + +static void apm_mainloop(void) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&apm_waitqueue, &wait); + set_current_state(TASK_INTERRUPTIBLE); + for (;;) { + schedule_timeout(APM_CHECK_TIMEOUT); + if (kthread_should_stop()) + break; + /* + * Ok, check all events, check for idle (and mark us sleeping + * so as not to count towards the load average).. + */ + set_current_state(TASK_INTERRUPTIBLE); + apm_event_handler(); + } + remove_wait_queue(&apm_waitqueue, &wait); +} + +static int check_apm_user(struct apm_user *as, const char *func) +{ + if (as == NULL || as->magic != APM_BIOS_MAGIC) { + pr_err("%s passed bad filp\n", func); + return 1; + } + return 0; +} + +static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) +{ + struct apm_user *as; + int i; + apm_event_t event; + + as = fp->private_data; + if (check_apm_user(as, "read")) + return -EIO; + if ((int)count < sizeof(apm_event_t)) + return -EINVAL; + if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK)) + return -EAGAIN; + wait_event_interruptible(apm_waitqueue, !queue_empty(as)); + i = count; + while ((i >= sizeof(event)) && !queue_empty(as)) { + event = get_queued_event(as); + if (copy_to_user(buf, &event, sizeof(event))) { + if (i < count) + break; + return -EFAULT; + } + switch (event) { + case APM_SYS_SUSPEND: + case APM_USER_SUSPEND: + as->suspends_read++; + break; + + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + as->standbys_read++; + break; + } + buf += sizeof(event); + i -= sizeof(event); + } + if (i < count) + return count - i; + if (signal_pending(current)) + return -ERESTARTSYS; + return 0; +} + +static __poll_t do_poll(struct file *fp, poll_table *wait) +{ + struct apm_user *as; + + as = fp->private_data; + if (check_apm_user(as, "poll")) + return 0; + poll_wait(fp, &apm_waitqueue, wait); + if (!queue_empty(as)) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +static long do_ioctl(struct file *filp, u_int cmd, u_long arg) +{ + struct apm_user *as; + int ret; + + as = filp->private_data; + if (check_apm_user(as, "ioctl")) + return -EIO; + if (!as->suser || !as->writer) + return -EPERM; + switch (cmd) { + case APM_IOC_STANDBY: + mutex_lock(&apm_mutex); + if (as->standbys_read > 0) { + as->standbys_read--; + as->standbys_pending--; + standbys_pending--; + } else + queue_event(APM_USER_STANDBY, as); + if (standbys_pending <= 0) + standby(); + mutex_unlock(&apm_mutex); + break; + case APM_IOC_SUSPEND: + mutex_lock(&apm_mutex); + if (as->suspends_read > 0) { + as->suspends_read--; + as->suspends_pending--; + suspends_pending--; + } else + queue_event(APM_USER_SUSPEND, as); + if (suspends_pending <= 0) { + ret = suspend(1); + mutex_unlock(&apm_mutex); + } else { + as->suspend_wait = 1; + mutex_unlock(&apm_mutex); + wait_event_interruptible(apm_suspend_waitqueue, + as->suspend_wait == 0); + ret = as->suspend_result; + } + return ret; + default: + return -ENOTTY; + } + return 0; +} + +static int do_release(struct inode *inode, struct file *filp) +{ + struct apm_user *as; + + as = filp->private_data; + if (check_apm_user(as, "release")) + return 0; + filp->private_data = NULL; + if (as->standbys_pending > 0) { + standbys_pending -= as->standbys_pending; + if (standbys_pending <= 0) + standby(); + } + if (as->suspends_pending > 0) { + suspends_pending -= as->suspends_pending; + if (suspends_pending <= 0) + (void) suspend(1); + } + spin_lock(&user_list_lock); + if (user_list == as) + user_list = as->next; + else { + struct apm_user *as1; + + for (as1 = user_list; + (as1 != NULL) && (as1->next != as); + as1 = as1->next) + ; + if (as1 == NULL) + pr_err("filp not in user list\n"); + else + as1->next = as->next; + } + spin_unlock(&user_list_lock); + kfree(as); + return 0; +} + +static int do_open(struct inode *inode, struct file *filp) +{ + struct apm_user *as; + + as = kmalloc(sizeof(*as), GFP_KERNEL); + if (as == NULL) + return -ENOMEM; + + as->magic = APM_BIOS_MAGIC; + as->event_tail = as->event_head = 0; + as->suspends_pending = as->standbys_pending = 0; + as->suspends_read = as->standbys_read = 0; + /* + * XXX - this is a tiny bit broken, when we consider BSD + * process accounting. If the device is opened by root, we + * instantly flag that we used superuser privs. Who knows, + * we might close the device immediately without doing a + * privileged operation -- cevans + */ + as->suser = capable(CAP_SYS_ADMIN); + as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE; + as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ; + spin_lock(&user_list_lock); + as->next = user_list; + user_list = as; + spin_unlock(&user_list_lock); + filp->private_data = as; + return 0; +} + +#ifdef CONFIG_PROC_FS +static int proc_apm_show(struct seq_file *m, void *v) +{ + unsigned short bx; + unsigned short cx; + unsigned short dx; + int error; + unsigned short ac_line_status = 0xff; + unsigned short battery_status = 0xff; + unsigned short battery_flag = 0xff; + int percentage = -1; + int time_units = -1; + char *units = "?"; + + if ((num_online_cpus() == 1) && + !(error = apm_get_power_status(&bx, &cx, &dx))) { + ac_line_status = (bx >> 8) & 0xff; + battery_status = bx & 0xff; + if ((cx & 0xff) != 0xff) + percentage = cx & 0xff; + + if (apm_info.connection_version > 0x100) { + battery_flag = (cx >> 8) & 0xff; + if (dx != 0xffff) { + units = (dx & 0x8000) ? "min" : "sec"; + time_units = dx & 0x7fff; + } + } + } + /* Arguments, with symbols from linux/apm_bios.h. Information is + from the Get Power Status (0x0a) call unless otherwise noted. + + 0) Linux driver version (this will change if format changes) + 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2. + 2) APM flags from APM Installation Check (0x00): + bit 0: APM_16_BIT_SUPPORT + bit 1: APM_32_BIT_SUPPORT + bit 2: APM_IDLE_SLOWS_CLOCK + bit 3: APM_BIOS_DISABLED + bit 4: APM_BIOS_DISENGAGED + 3) AC line status + 0x00: Off-line + 0x01: On-line + 0x02: On backup power (BIOS >= 1.1 only) + 0xff: Unknown + 4) Battery status + 0x00: High + 0x01: Low + 0x02: Critical + 0x03: Charging + 0x04: Selected battery not present (BIOS >= 1.2 only) + 0xff: Unknown + 5) Battery flag + bit 0: High + bit 1: Low + bit 2: Critical + bit 3: Charging + bit 7: No system battery + 0xff: Unknown + 6) Remaining battery life (percentage of charge): + 0-100: valid + -1: Unknown + 7) Remaining battery life (time units): + Number of remaining minutes or seconds + -1: Unknown + 8) min = minutes; sec = seconds */ + + seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", + driver_version, + (apm_info.bios.version >> 8) & 0xff, + apm_info.bios.version & 0xff, + apm_info.bios.flags, + ac_line_status, + battery_status, + battery_flag, + percentage, + time_units, + units); + return 0; +} +#endif + +static int apm(void *unused) +{ + unsigned short bx; + unsigned short cx; + unsigned short dx; + int error; + char *power_stat; + char *bat_stat; + + /* 2002/08/01 - WT + * This is to avoid random crashes at boot time during initialization + * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D. + * Some bioses don't like being called from CPU != 0. + * Method suggested by Ingo Molnar. + */ + set_cpus_allowed_ptr(current, cpumask_of(0)); + BUG_ON(smp_processor_id() != 0); + + if (apm_info.connection_version == 0) { + apm_info.connection_version = apm_info.bios.version; + if (apm_info.connection_version > 0x100) { + /* + * We only support BIOSs up to version 1.2 + */ + if (apm_info.connection_version > 0x0102) + apm_info.connection_version = 0x0102; + error = apm_driver_version(&apm_info.connection_version); + if (error != APM_SUCCESS) { + apm_error("driver version", error); + /* Fall back to an APM 1.0 connection. */ + apm_info.connection_version = 0x100; + } + } + } + + if (debug) + printk(KERN_INFO "apm: Connection version %d.%d\n", + (apm_info.connection_version >> 8) & 0xff, + apm_info.connection_version & 0xff); + +#ifdef CONFIG_APM_DO_ENABLE + if (apm_info.bios.flags & APM_BIOS_DISABLED) { + /* + * This call causes my NEC UltraLite Versa 33/C to hang if it + * is booted with PM disabled but not in the docking station. + * Unfortunate ... + */ + error = apm_enable_power_management(1); + if (error) { + apm_error("enable power management", error); + return -1; + } + } +#endif + + if ((apm_info.bios.flags & APM_BIOS_DISENGAGED) + && (apm_info.connection_version > 0x0100)) { + error = apm_engage_power_management(APM_DEVICE_ALL, 1); + if (error) { + apm_error("engage power management", error); + return -1; + } + } + + if (debug && (num_online_cpus() == 1 || smp)) { + error = apm_get_power_status(&bx, &cx, &dx); + if (error) + printk(KERN_INFO "apm: power status not available\n"); + else { + switch ((bx >> 8) & 0xff) { + case 0: + power_stat = "off line"; + break; + case 1: + power_stat = "on line"; + break; + case 2: + power_stat = "on backup power"; + break; + default: + power_stat = "unknown"; + break; + } + switch (bx & 0xff) { + case 0: + bat_stat = "high"; + break; + case 1: + bat_stat = "low"; + break; + case 2: + bat_stat = "critical"; + break; + case 3: + bat_stat = "charging"; + break; + default: + bat_stat = "unknown"; + break; + } + printk(KERN_INFO + "apm: AC %s, battery status %s, battery life ", + power_stat, bat_stat); + if ((cx & 0xff) == 0xff) + printk("unknown\n"); + else + printk("%d%%\n", cx & 0xff); + if (apm_info.connection_version > 0x100) { + printk(KERN_INFO + "apm: battery flag 0x%02x, battery life ", + (cx >> 8) & 0xff); + if (dx == 0xffff) + printk("unknown\n"); + else + printk("%d %s\n", dx & 0x7fff, + (dx & 0x8000) ? + "minutes" : "seconds"); + } + } + } + + /* Install our power off handler.. */ + if (power_off) + pm_power_off = apm_power_off; + + if (num_online_cpus() == 1 || smp) { +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + console_blank_hook = apm_console_blank; +#endif + apm_mainloop(); +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + console_blank_hook = NULL; +#endif + } + + return 0; +} + +#ifndef MODULE +static int __init apm_setup(char *str) +{ + int invert; + + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "off", 3) == 0) + apm_disabled = 1; + if (strncmp(str, "on", 2) == 0) + apm_disabled = 0; + if ((strncmp(str, "bounce-interval=", 16) == 0) || + (strncmp(str, "bounce_interval=", 16) == 0)) + bounce_interval = simple_strtol(str + 16, NULL, 0); + if ((strncmp(str, "idle-threshold=", 15) == 0) || + (strncmp(str, "idle_threshold=", 15) == 0)) + idle_threshold = simple_strtol(str + 15, NULL, 0); + if ((strncmp(str, "idle-period=", 12) == 0) || + (strncmp(str, "idle_period=", 12) == 0)) + idle_period = simple_strtol(str + 12, NULL, 0); + invert = (strncmp(str, "no-", 3) == 0) || + (strncmp(str, "no_", 3) == 0); + if (invert) + str += 3; + if (strncmp(str, "debug", 5) == 0) + debug = !invert; + if ((strncmp(str, "power-off", 9) == 0) || + (strncmp(str, "power_off", 9) == 0)) + power_off = !invert; + if (strncmp(str, "smp", 3) == 0) { + smp = !invert; + idle_threshold = 100; + } + if ((strncmp(str, "allow-ints", 10) == 0) || + (strncmp(str, "allow_ints", 10) == 0)) + apm_info.allow_ints = !invert; + if ((strncmp(str, "broken-psr", 10) == 0) || + (strncmp(str, "broken_psr", 10) == 0)) + apm_info.get_power_status_broken = !invert; + if ((strncmp(str, "realmode-power-off", 18) == 0) || + (strncmp(str, "realmode_power_off", 18) == 0)) + apm_info.realmode_power_off = !invert; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("apm=", apm_setup); +#endif + +static const struct file_operations apm_bios_fops = { + .owner = THIS_MODULE, + .read = do_read, + .poll = do_poll, + .unlocked_ioctl = do_ioctl, + .open = do_open, + .release = do_release, + .llseek = noop_llseek, +}; + +static struct miscdevice apm_device = { + APM_MINOR_DEV, + "apm_bios", + &apm_bios_fops +}; + + +/* Simple "print if true" callback */ +static int __init print_if_true(const struct dmi_system_id *d) +{ + printk("%s\n", d->ident); + return 0; +} + +/* + * Some Bioses enable the PS/2 mouse (touchpad) at resume, even if it was + * disabled before the suspend. Linux used to get terribly confused by that. + */ +static int __init broken_ps2_resume(const struct dmi_system_id *d) +{ + printk(KERN_INFO "%s machine detected. Mousepad Resume Bug " + "workaround hopefully not needed.\n", d->ident); + return 0; +} + +/* Some bioses have a broken protected mode poweroff and need to use realmode */ +static int __init set_realmode_power_off(const struct dmi_system_id *d) +{ + if (apm_info.realmode_power_off == 0) { + apm_info.realmode_power_off = 1; + printk(KERN_INFO "%s bios detected. " + "Using realmode poweroff only.\n", d->ident); + } + return 0; +} + +/* Some laptops require interrupts to be enabled during APM calls */ +static int __init set_apm_ints(const struct dmi_system_id *d) +{ + if (apm_info.allow_ints == 0) { + apm_info.allow_ints = 1; + printk(KERN_INFO "%s machine detected. " + "Enabling interrupts during APM calls.\n", d->ident); + } + return 0; +} + +/* Some APM bioses corrupt memory or just plain do not work */ +static int __init apm_is_horked(const struct dmi_system_id *d) +{ + if (apm_info.disabled == 0) { + apm_info.disabled = 1; + printk(KERN_INFO "%s machine detected. " + "Disabling APM.\n", d->ident); + } + return 0; +} + +static int __init apm_is_horked_d850md(const struct dmi_system_id *d) +{ + if (apm_info.disabled == 0) { + apm_info.disabled = 1; + printk(KERN_INFO "%s machine detected. " + "Disabling APM.\n", d->ident); + printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n"); + printk(KERN_INFO "download from support.intel.com\n"); + } + return 0; +} + +/* Some APM bioses hang on APM idle calls */ +static int __init apm_likes_to_melt(const struct dmi_system_id *d) +{ + if (apm_info.forbid_idle == 0) { + apm_info.forbid_idle = 1; + printk(KERN_INFO "%s machine detected. " + "Disabling APM idle calls.\n", d->ident); + } + return 0; +} + +/* + * Check for clue free BIOS implementations who use + * the following QA technique + * + * [ Write BIOS Code ]<------ + * | ^ + * < Does it Compile >----N-- + * |Y ^ + * < Does it Boot Win98 >-N-- + * |Y + * [Ship It] + * + * Phoenix A04 08/24/2000 is known bad (Dell Inspiron 5000e) + * Phoenix A07 09/29/2000 is known good (Dell Inspiron 5000) + */ +static int __init broken_apm_power(const struct dmi_system_id *d) +{ + apm_info.get_power_status_broken = 1; + printk(KERN_WARNING "BIOS strings suggest APM bugs, " + "disabling power status reporting.\n"); + return 0; +} + +/* + * This bios swaps the APM minute reporting bytes over (Many sony laptops + * have this problem). + */ +static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d) +{ + apm_info.get_power_status_swabinminutes = 1; + printk(KERN_WARNING "BIOS strings suggest APM reports battery life " + "in minutes and wrong byte order.\n"); + return 0; +} + +static const struct dmi_system_id apm_dmi_table[] __initconst = { + { + print_if_true, + KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.", + { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), }, + }, + { /* Handle problems with APM on the C600 */ + broken_ps2_resume, "Dell Latitude C600", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), }, + }, + { /* Allow interrupts during suspend on Dell Latitude laptops*/ + set_apm_ints, "Dell Latitude", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), } + }, + { /* APM crashes */ + apm_is_horked, "Dell Inspiron 2500", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, + }, + { /* Allow interrupts during suspend on Dell Inspiron laptops*/ + set_apm_ints, "Dell Inspiron", { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), }, + }, + { /* Handle problems with APM on Inspiron 5000e */ + broken_apm_power, "Dell Inspiron 5000e", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A04"), + DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), }, + }, + { /* Handle problems with APM on Inspiron 2500 */ + broken_apm_power, "Dell Inspiron 2500", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A12"), + DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell Dimension 4100", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), + DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, + }, + { /* Allow interrupts during suspend on Compaq Laptops*/ + set_apm_ints, "Compaq 12XL125", + { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), + DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "4.06"), }, + }, + { /* Allow interrupts during APM or the clock goes slow */ + set_apm_ints, "ASUSTeK", + { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), }, + }, + { /* APM blows on shutdown */ + apm_is_horked, "ABIT KX7-333[R]", + { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"), + DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), }, + }, + { /* APM crashes */ + apm_is_horked, "Trigem Delhi3", + { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"), + DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), }, + }, + { /* APM crashes */ + apm_is_horked, "Fujitsu-Siemens", + { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"), + DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), }, + }, + { /* APM crashes */ + apm_is_horked_d850md, "Intel D850MD", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), }, + }, + { /* APM crashes */ + apm_is_horked, "Intel D810EMO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell XPS-Z", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), + DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), }, + }, + { /* APM crashes */ + apm_is_horked, "Sharp PC-PJ/AX", + { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"), + DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"), + DMI_MATCH(DMI_BIOS_VENDOR, "SystemSoft"), + DMI_MATCH(DMI_BIOS_VERSION, "Version R2.08"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell Inspiron 2500", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, + }, + { /* APM idle hangs */ + apm_likes_to_melt, "Jabil AMD", + { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), }, + }, + { /* APM idle hangs */ + apm_likes_to_melt, "AMI Bios", + { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0206H"), + DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-N505VX */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"), + DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-XG29 */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"), + DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"), + DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"), + DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"), + DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"), + DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"), + DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"), + DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-F104K */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"), + DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), }, + }, + + { /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"), + DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"), + DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"), + DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), }, + }, + { /* broken PM poweroff bios */ + set_realmode_power_off, "Award Software v4.60 PGMA", + { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"), + DMI_MATCH(DMI_BIOS_DATE, "134526184"), }, + }, + + /* Generic per vendor APM settings */ + + { /* Allow interrupts during suspend on IBM laptops */ + set_apm_ints, "IBM", + { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), }, + }, + + { } +}; + +/* + * Just start the APM thread. We do NOT want to do APM BIOS + * calls from anything but the APM thread, if for no other reason + * than the fact that we don't trust the APM BIOS. This way, + * most common APM BIOS problems that lead to protection errors + * etc will have at least some level of being contained... + * + * In short, if something bad happens, at least we have a choice + * of just killing the apm thread.. + */ +static int __init apm_init(void) +{ + struct desc_struct *gdt; + int err; + + dmi_check_system(apm_dmi_table); + + if (apm_info.bios.version == 0 || machine_is_olpc()) { + printk(KERN_INFO "apm: BIOS not found.\n"); + return -ENODEV; + } + printk(KERN_INFO + "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", + ((apm_info.bios.version >> 8) & 0xff), + (apm_info.bios.version & 0xff), + apm_info.bios.flags, + driver_version); + if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) { + printk(KERN_INFO "apm: no 32 bit BIOS support\n"); + return -ENODEV; + } + + if (allow_ints) + apm_info.allow_ints = 1; + if (broken_psr) + apm_info.get_power_status_broken = 1; + if (realmode_power_off) + apm_info.realmode_power_off = 1; + /* User can override, but default is to trust DMI */ + if (apm_disabled != -1) + apm_info.disabled = apm_disabled; + + /* + * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1 + * but is reportedly a 1.0 BIOS. + */ + if (apm_info.bios.version == 0x001) + apm_info.bios.version = 0x100; + + /* BIOS < 1.2 doesn't set cseg_16_len */ + if (apm_info.bios.version < 0x102) + apm_info.bios.cseg_16_len = 0; /* 64k */ + + if (debug) { + printk(KERN_INFO "apm: entry %x:%x cseg16 %x dseg %x", + apm_info.bios.cseg, apm_info.bios.offset, + apm_info.bios.cseg_16, apm_info.bios.dseg); + if (apm_info.bios.version > 0x100) + printk(" cseg len %x, dseg len %x", + apm_info.bios.cseg_len, + apm_info.bios.dseg_len); + if (apm_info.bios.version > 0x101) + printk(" cseg16 len %x", apm_info.bios.cseg_16_len); + printk("\n"); + } + + if (apm_info.disabled) { + pr_notice("disabled on user request.\n"); + return -ENODEV; + } + if ((num_online_cpus() > 1) && !power_off && !smp) { + pr_notice("disabled - APM is not SMP safe.\n"); + apm_info.disabled = 1; + return -ENODEV; + } + if (!acpi_disabled) { + pr_notice("overridden by ACPI.\n"); + apm_info.disabled = 1; + return -ENODEV; + } + + /* + * Set up the long jump entry point to the APM BIOS, which is called + * from inline assembly. + */ + apm_bios_entry.offset = apm_info.bios.offset; + apm_bios_entry.segment = APM_CS; + + /* + * The APM 1.1 BIOS is supposed to provide limit information that it + * recognizes. Many machines do this correctly, but many others do + * not restrict themselves to their claimed limit. When this happens, + * they will cause a segmentation violation in the kernel at boot time. + * Most BIOS's, however, will respect a 64k limit, so we use that. + * + * Note we only set APM segments on CPU zero, since we pin the APM + * code to that CPU. + */ + gdt = get_cpu_gdt_rw(0); + set_desc_base(&gdt[APM_CS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); + set_desc_base(&gdt[APM_CS_16 >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_desc_base(&gdt[APM_DS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); + + proc_create_single("apm", 0, NULL, proc_apm_show); + + kapmd_task = kthread_create(apm, NULL, "kapmd"); + if (IS_ERR(kapmd_task)) { + pr_err("disabled - Unable to start kernel thread\n"); + err = PTR_ERR(kapmd_task); + kapmd_task = NULL; + remove_proc_entry("apm", NULL); + return err; + } + wake_up_process(kapmd_task); + + if (num_online_cpus() > 1 && !smp) { + printk(KERN_NOTICE + "apm: disabled - APM is not SMP safe (power off active).\n"); + return 0; + } + + /* + * Note we don't actually care if the misc_device cannot be registered. + * this driver can do its job without it, even if userspace can't + * control it. just log the error + */ + if (misc_register(&apm_device)) + printk(KERN_WARNING "apm: Could not register misc device.\n"); + + if (HZ != 100) + idle_period = (idle_period * HZ) / 100; + if (idle_threshold < 100) { + cpuidle_poll_state_init(&apm_idle_driver); + if (!cpuidle_register_driver(&apm_idle_driver)) + if (cpuidle_register_device(&apm_cpuidle_device)) + cpuidle_unregister_driver(&apm_idle_driver); + } + + return 0; +} + +static void __exit apm_exit(void) +{ + int error; + + cpuidle_unregister_device(&apm_cpuidle_device); + cpuidle_unregister_driver(&apm_idle_driver); + + if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) + && (apm_info.connection_version > 0x0100)) { + error = apm_engage_power_management(APM_DEVICE_ALL, 0); + if (error) + apm_error("disengage power management", error); + } + misc_deregister(&apm_device); + remove_proc_entry("apm", NULL); + if (power_off) + pm_power_off = NULL; + if (kapmd_task) { + kthread_stop(kapmd_task); + kapmd_task = NULL; + } +} + +module_init(apm_init); +module_exit(apm_exit); + +MODULE_AUTHOR("Stephen Rothwell"); +MODULE_DESCRIPTION("Advanced Power Management"); +MODULE_LICENSE("GPL"); +module_param(debug, bool, 0644); +MODULE_PARM_DESC(debug, "Enable debug mode"); +module_param(power_off, bool, 0444); +MODULE_PARM_DESC(power_off, "Enable power off"); +module_param(bounce_interval, int, 0444); +MODULE_PARM_DESC(bounce_interval, + "Set the number of ticks to ignore suspend bounces"); +module_param(allow_ints, bool, 0444); +MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls"); +module_param(broken_psr, bool, 0444); +MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call"); +module_param(realmode_power_off, bool, 0444); +MODULE_PARM_DESC(realmode_power_off, + "Switch to real mode before powering off"); +module_param(idle_threshold, int, 0444); +MODULE_PARM_DESC(idle_threshold, + "System idle percentage above which to make APM BIOS idle calls"); +module_param(idle_period, int, 0444); +MODULE_PARM_DESC(idle_period, + "Period (in sec/100) over which to calculate the idle percentage"); +module_param(smp, bool, 0444); +MODULE_PARM_DESC(smp, + "Set this to enable APM use on an SMP platform. Use with caution on older systems"); +MODULE_ALIAS_MISCDEV(APM_MINOR_DEV); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c new file mode 100644 index 000000000..70b7154f4 --- /dev/null +++ b/arch/x86/kernel/asm-offsets.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ +#define COMPILE_OFFSETS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_XEN +#include +#endif + +#ifdef CONFIG_X86_32 +# include "asm-offsets_32.c" +#else +# include "asm-offsets_64.c" +#endif + +static void __used common(void) +{ + BLANK(); + OFFSET(TASK_threadsp, task_struct, thread.sp); +#ifdef CONFIG_STACKPROTECTOR + OFFSET(TASK_stack_canary, task_struct, stack_canary); +#endif + + BLANK(); + OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); + + BLANK(); + OFFSET(pbe_address, pbe, address); + OFFSET(pbe_orig_address, pbe, orig_address); + OFFSET(pbe_next, pbe, next); + +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) + BLANK(); + OFFSET(IA32_SIGCONTEXT_ax, sigcontext_32, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext_32, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext_32, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext_32, dx); + OFFSET(IA32_SIGCONTEXT_si, sigcontext_32, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext_32, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext_32, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext_32, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext_32, ip); + + BLANK(); + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); +#endif + +#ifdef CONFIG_PARAVIRT_XXL + BLANK(); + OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable); + OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable); + OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret); + OFFSET(PV_MMU_read_cr2, paravirt_patch_template, mmu.read_cr2); +#endif + +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); + OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2); +#endif + + BLANK(); + OFFSET(BP_scratch, boot_params, scratch); + OFFSET(BP_secure_boot, boot_params, secure_boot); + OFFSET(BP_loadflags, boot_params, hdr.loadflags); + OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); + OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); + OFFSET(BP_init_size, boot_params, hdr.init_size); + OFFSET(BP_pref_address, boot_params, hdr.pref_address); + + BLANK(); + DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + + /* TLB state for the entry code */ + OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); + + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); + DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); + DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1))); + + /* Offset for fields in tss_struct */ + OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); + OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); + OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); +} diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c new file mode 100644 index 000000000..6e043f295 --- /dev/null +++ b/arch/x86/kernel/asm-offsets_32.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + +#include + +#include + +/* workaround for a warning with -Wmissing-prototypes */ +void foo(void); + +void foo(void) +{ + OFFSET(CPUINFO_x86, cpuinfo_x86, x86); + OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); + OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); + OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping); + OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); + OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); + OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); + BLANK(); + + OFFSET(PT_EBX, pt_regs, bx); + OFFSET(PT_ECX, pt_regs, cx); + OFFSET(PT_EDX, pt_regs, dx); + OFFSET(PT_ESI, pt_regs, si); + OFFSET(PT_EDI, pt_regs, di); + OFFSET(PT_EBP, pt_regs, bp); + OFFSET(PT_EAX, pt_regs, ax); + OFFSET(PT_DS, pt_regs, ds); + OFFSET(PT_ES, pt_regs, es); + OFFSET(PT_FS, pt_regs, fs); + OFFSET(PT_GS, pt_regs, gs); + OFFSET(PT_ORIG_EAX, pt_regs, orig_ax); + OFFSET(PT_EIP, pt_regs, ip); + OFFSET(PT_CS, pt_regs, cs); + OFFSET(PT_EFLAGS, pt_regs, flags); + OFFSET(PT_OLDESP, pt_regs, sp); + OFFSET(PT_OLDSS, pt_regs, ss); + BLANK(); + + OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); + BLANK(); + + /* + * Offset from the entry stack to task stack stored in TSS. Kernel entry + * happens on the per-cpu entry-stack, and the asm code switches to the + * task-stack pointer stored in x86_tss.sp1, which is a copy of + * task->thread.sp0 where entry code can find it. + */ + DEFINE(TSS_entry2task_stack, + offsetof(struct cpu_entry_area, tss.x86_tss.sp1) - + offsetofend(struct cpu_entry_area, entry_stack_page.stack)); + +#ifdef CONFIG_STACKPROTECTOR + BLANK(); + OFFSET(stack_canary_offset, stack_canary, canary); +#endif + + BLANK(); + DEFINE(EFI_svam, offsetof(efi_runtime_services_t, set_virtual_address_map)); +} diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c new file mode 100644 index 000000000..1354bc306 --- /dev/null +++ b/arch/x86/kernel/asm-offsets_64.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + +#include + +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#include +#endif + +int main(void) +{ +#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL + OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template, + cpu.usergs_sysret64); +#ifdef CONFIG_DEBUG_ENTRY + OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl); +#endif +#endif + BLANK(); +#endif + +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) + OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted); + BLANK(); +#endif + +#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) + ENTRY(bx); + ENTRY(cx); + ENTRY(dx); + ENTRY(sp); + ENTRY(bp); + ENTRY(si); + ENTRY(di); + ENTRY(r8); + ENTRY(r9); + ENTRY(r10); + ENTRY(r11); + ENTRY(r12); + ENTRY(r13); + ENTRY(r14); + ENTRY(r15); + ENTRY(flags); + BLANK(); +#undef ENTRY + +#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry) + ENTRY(cr0); + ENTRY(cr2); + ENTRY(cr3); + ENTRY(cr4); + ENTRY(gdt_desc); + BLANK(); +#undef ENTRY + + BLANK(); + +#ifdef CONFIG_STACKPROTECTOR + DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); + BLANK(); +#endif + return 0; +} diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c new file mode 100644 index 000000000..83d9cad4e --- /dev/null +++ b/arch/x86/kernel/audit_64.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +static unsigned dir_class[] = { +#include +~0U +}; + +static unsigned read_class[] = { +#include +~0U +}; + +static unsigned write_class[] = { +#include +~0U +}; + +static unsigned chattr_class[] = { +#include +~0U +}; + +static unsigned signal_class[] = { +#include +~0U +}; + +int audit_classify_arch(int arch) +{ +#ifdef CONFIG_IA32_EMULATION + if (arch == AUDIT_ARCH_I386) + return 1; +#endif + return 0; +} + +int audit_classify_syscall(int abi, unsigned syscall) +{ +#ifdef CONFIG_IA32_EMULATION + if (abi == AUDIT_ARCH_I386) + return ia32_classify_syscall(syscall); +#endif + switch(syscall) { + case __NR_open: + return 2; + case __NR_openat: + return 3; + case __NR_execve: + case __NR_execveat: + return 5; + default: + return 0; + } +} + +static int __init audit_classes_init(void) +{ +#ifdef CONFIG_IA32_EMULATION + extern __u32 ia32_dir_class[]; + extern __u32 ia32_write_class[]; + extern __u32 ia32_read_class[]; + extern __u32 ia32_chattr_class[]; + extern __u32 ia32_signal_class[]; + audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class); + audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class); + audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class); +#endif + audit_register_class(AUDIT_CLASS_WRITE, write_class); + audit_register_class(AUDIT_CLASS_READ, read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); + audit_register_class(AUDIT_CLASS_CHATTR, chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL, signal_class); + return 0; +} + +__initcall(audit_classes_init); diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c new file mode 100644 index 000000000..3fed7ae58 --- /dev/null +++ b/arch/x86/kernel/bootflag.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement 'Simple Boot Flag Specification 2.0' + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#define SBF_RESERVED (0x78) +#define SBF_PNPOS (1<<0) +#define SBF_BOOTING (1<<1) +#define SBF_DIAG (1<<2) +#define SBF_PARITY (1<<7) + +int sbf_port __initdata = -1; /* set via acpi_boot_init() */ + +static int __init parity(u8 v) +{ + int x = 0; + int i; + + for (i = 0; i < 8; i++) { + x ^= (v & 1); + v >>= 1; + } + + return x; +} + +static void __init sbf_write(u8 v) +{ + unsigned long flags; + + if (sbf_port != -1) { + v &= ~SBF_PARITY; + if (!parity(v)) + v |= SBF_PARITY; + + printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", + sbf_port, v); + + spin_lock_irqsave(&rtc_lock, flags); + CMOS_WRITE(v, sbf_port); + spin_unlock_irqrestore(&rtc_lock, flags); + } +} + +static u8 __init sbf_read(void) +{ + unsigned long flags; + u8 v; + + if (sbf_port == -1) + return 0; + + spin_lock_irqsave(&rtc_lock, flags); + v = CMOS_READ(sbf_port); + spin_unlock_irqrestore(&rtc_lock, flags); + + return v; +} + +static int __init sbf_value_valid(u8 v) +{ + if (v & SBF_RESERVED) /* Reserved bits */ + return 0; + if (!parity(v)) + return 0; + + return 1; +} + +static int __init sbf_init(void) +{ + u8 v; + + if (sbf_port == -1) + return 0; + + v = sbf_read(); + if (!sbf_value_valid(v)) { + printk(KERN_WARNING "Simple Boot Flag value 0x%x read from " + "CMOS RAM was invalid\n", v); + } + + v &= ~SBF_RESERVED; + v &= ~SBF_BOOTING; + v &= ~SBF_DIAG; +#if defined(CONFIG_ISAPNP) + v |= SBF_PNPOS; +#endif + sbf_write(v); + + return 0; +} +arch_initcall(sbf_init); diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c new file mode 100644 index 000000000..03bb2f343 --- /dev/null +++ b/arch/x86/kernel/cc_platform.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Confidential Computing Platform Capability checks + * + * Copyright (C) 2021 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + */ + +#include +#include +#include + +#include + +static bool __maybe_unused intel_cc_platform_has(enum cc_attr attr) +{ +#ifdef CONFIG_INTEL_TDX_GUEST + return false; +#else + return false; +#endif +} + +/* + * SME and SEV are very similar but they are not the same, so there are + * times that the kernel will need to distinguish between SME and SEV. The + * cc_platform_has() function is used for this. When a distinction isn't + * needed, the CC_ATTR_MEM_ENCRYPT attribute can be used. + * + * The trampoline code is a good example for this requirement. Before + * paging is activated, SME will access all memory as decrypted, but SEV + * will access all memory as encrypted. So, when APs are being brought + * up under SME the trampoline area cannot be encrypted, whereas under SEV + * the trampoline area must be encrypted. + */ +static bool amd_cc_platform_has(enum cc_attr attr) +{ +#ifdef CONFIG_AMD_MEM_ENCRYPT + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return sme_me_mask; + + case CC_ATTR_HOST_MEM_ENCRYPT: + return sme_me_mask && !(sev_status & MSR_AMD64_SEV_ENABLED); + + case CC_ATTR_GUEST_MEM_ENCRYPT: + return sev_status & MSR_AMD64_SEV_ENABLED; + + case CC_ATTR_GUEST_STATE_ENCRYPT: + return sev_status & MSR_AMD64_SEV_ES_ENABLED; + + default: + return false; + } +#else + return false; +#endif +} + + +bool cc_platform_has(enum cc_attr attr) +{ + if (sme_me_mask) + return amd_cc_platform_has(attr); + + return false; +} +EXPORT_SYMBOL_GPL(cc_platform_has); diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c new file mode 100644 index 000000000..5136e6818 --- /dev/null +++ b/arch/x86/kernel/check.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include + +#include +#include + +/* + * Some BIOSes seem to corrupt the low 64k of memory during events + * like suspend/resume and unplugging an HDMI cable. Reserve all + * remaining free memory in that area and fill it with a distinct + * pattern. + */ +#define MAX_SCAN_AREAS 8 + +static int __read_mostly memory_corruption_check = -1; + +static unsigned __read_mostly corruption_check_size = 64*1024; +static unsigned __read_mostly corruption_check_period = 60; /* seconds */ + +static struct scan_area { + u64 addr; + u64 size; +} scan_areas[MAX_SCAN_AREAS]; +static int num_scan_areas; + +static __init int set_corruption_check(char *arg) +{ + ssize_t ret; + unsigned long val; + + if (!arg) { + pr_err("memory_corruption_check config string not provided\n"); + return -EINVAL; + } + + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; + + memory_corruption_check = val; + + return 0; +} +early_param("memory_corruption_check", set_corruption_check); + +static __init int set_corruption_check_period(char *arg) +{ + ssize_t ret; + unsigned long val; + + if (!arg) { + pr_err("memory_corruption_check_period config string not provided\n"); + return -EINVAL; + } + + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; + + corruption_check_period = val; + return 0; +} +early_param("memory_corruption_check_period", set_corruption_check_period); + +static __init int set_corruption_check_size(char *arg) +{ + char *end; + unsigned size; + + if (!arg) { + pr_err("memory_corruption_check_size config string not provided\n"); + return -EINVAL; + } + + size = memparse(arg, &end); + + if (*end == '\0') + corruption_check_size = size; + + return (size == corruption_check_size) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_size", set_corruption_check_size); + + +void __init setup_bios_corruption_check(void) +{ + phys_addr_t start, end; + u64 i; + + if (memory_corruption_check == -1) { + memory_corruption_check = +#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + 1 +#else + 0 +#endif + ; + } + + if (corruption_check_size == 0) + memory_corruption_check = 0; + + if (!memory_corruption_check) + return; + + corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); + + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) { + start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), + PAGE_SIZE, corruption_check_size); + end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), + PAGE_SIZE, corruption_check_size); + if (start >= end) + continue; + + memblock_reserve(start, end - start); + scan_areas[num_scan_areas].addr = start; + scan_areas[num_scan_areas].size = end - start; + + /* Assume we've already mapped this early memory */ + memset(__va(start), 0, end - start); + + if (++num_scan_areas >= MAX_SCAN_AREAS) + break; + } + + if (num_scan_areas) + pr_info("Scanning %d areas for low memory corruption\n", num_scan_areas); +} + + +static void check_for_bios_corruption(void) +{ + int i; + int corruption = 0; + + if (!memory_corruption_check) + return; + + for (i = 0; i < num_scan_areas; i++) { + unsigned long *addr = __va(scan_areas[i].addr); + unsigned long size = scan_areas[i].size; + + for (; size; addr++, size -= sizeof(unsigned long)) { + if (!*addr) + continue; + pr_err("Corrupted low memory at %p (%lx phys) = %08lx\n", addr, __pa(addr), *addr); + corruption = 1; + *addr = 0; + } + } + + WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n"); +} + +static void check_corruption(struct work_struct *dummy); +static DECLARE_DELAYED_WORK(bios_check_work, check_corruption); + +static void check_corruption(struct work_struct *dummy) +{ + check_for_bios_corruption(); + schedule_delayed_work(&bios_check_work, + round_jiffies_relative(corruption_check_period*HZ)); +} + +static int start_periodic_check_for_corruption(void) +{ + if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0) + return 0; + + pr_info("Scanning for low memory corruption every %d seconds\n", corruption_check_period); + + /* First time we run the checks right away */ + schedule_delayed_work(&bios_check_work, 0); + + return 0; +} +device_initcall(start_periodic_check_for_corruption); + diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore new file mode 100644 index 000000000..0bca7ef74 --- /dev/null +++ b/arch/x86/kernel/cpu/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +capflags.c diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile new file mode 100644 index 000000000..93792b457 --- /dev/null +++ b/arch/x86/kernel/cpu/Makefile @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for x86-compatible CPU details, features and quirks +# + +# Don't trace early stages of a secondary CPU boot +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_common.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg +endif + +# If these files are instrumented, boot hangs during the first second. +KCOV_INSTRUMENT_common.o := n +KCOV_INSTRUMENT_perf_event.o := n + +# As above, instrumenting secondary CPU boot code causes boot hangs. +KCSAN_SANITIZE_common.o := n + +# Make sure load_percpu_segment has no stackprotector +CFLAGS_common.o := -fno-stack-protector + +obj-y := cacheinfo.o scattered.o topology.o +obj-y += common.o +obj-y += rdrand.o +obj-y += match.o +obj-y += bugs.o +obj-y += aperfmperf.o +obj-y += cpuid-deps.o +obj-y += umwait.o + +obj-$(CONFIG_PROC_FS) += proc.o +obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o + +obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o +ifdef CONFIG_CPU_SUP_INTEL +obj-y += intel.o intel_pconfig.o tsx.o +obj-$(CONFIG_PM) += intel_epb.o +endif +obj-$(CONFIG_CPU_SUP_AMD) += amd.o +obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o +obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o +obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o +obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o +obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o +obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o + +obj-$(CONFIG_X86_MCE) += mce/ +obj-$(CONFIG_MTRR) += mtrr/ +obj-$(CONFIG_MICROCODE) += microcode/ +obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/ + +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o + +obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o +obj-$(CONFIG_ACRN_GUEST) += acrn.o + +ifdef CONFIG_X86_FEATURE_NAMES +quiet_cmd_mkcapflags = MKCAP $@ + cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^ + +cpufeature = $(src)/../../include/asm/cpufeatures.h +vmxfeature = $(src)/../../include/asm/vmxfeatures.h + +$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE + $(call if_changed,mkcapflags) +endif +targets += capflags.c diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c new file mode 100644 index 000000000..0b2c03943 --- /dev/null +++ b/arch/x86/kernel/cpu/acrn.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACRN detection support + * + * Copyright (C) 2019 Intel Corporation. All rights reserved. + * + * Jason Chen CJ + * Zhao Yakui + * + */ + +#include +#include +#include +#include +#include +#include +#include + +static u32 __init acrn_detect(void) +{ + return hypervisor_cpuid_base("ACRNACRNACRN", 0); +} + +static void __init acrn_init_platform(void) +{ + /* Setup the IDT for ACRN hypervisor callback */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); +} + +static bool acrn_x2apic_available(void) +{ + return boot_cpu_has(X86_FEATURE_X2APIC); +} + +static void (*acrn_intr_handler)(void); + +DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* + * The hypervisor requires that the APIC EOI should be acked. + * If the APIC EOI is not acked, the APIC ISR bit for the + * HYPERVISOR_CALLBACK_VECTOR will not be cleared and then it + * will block the interrupt whose vector is lower than + * HYPERVISOR_CALLBACK_VECTOR. + */ + ack_APIC_irq(); + inc_irq_stat(irq_hv_callback_count); + + if (acrn_intr_handler) + acrn_intr_handler(); + + set_irq_regs(old_regs); +} + +const __initconst struct hypervisor_x86 x86_hyper_acrn = { + .name = "ACRN", + .detect = acrn_detect, + .type = X86_HYPER_ACRN, + .init.init_platform = acrn_init_platform, + .init.x2apic_available = acrn_x2apic_available, +}; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c new file mode 100644 index 000000000..f29c6bed9 --- /dev/null +++ b/arch/x86/kernel/cpu/amd.c @@ -0,0 +1,1346 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +# include +# include +#endif + +#include "cpu.h" + +/* + * nodes_per_socket: Stores the number of nodes per socket. + * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX + * Node Identifiers[10:8] + */ +static u32 nodes_per_socket = 1; + +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +static const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); + +static const int amd_erratum_383[] = + AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); + +/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ +static const int amd_erratum_1054[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); + +static const int amd_zenbleed[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf), + AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf), + AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf), + AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf)); + +static const int amd_div0[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf), + AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf)); + +static const int amd_erratum_1485[] = + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), + AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); + +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) +{ + int osvw_id = *erratum++; + u32 range; + u32 ms; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 4) | cpu->x86_stepping; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} + +static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +{ + u32 gprs[8] = { 0 }; + int err; + + WARN_ONCE((boot_cpu_data.x86 != 0xf), + "%s should only be used on K8!\n", __func__); + + gprs[1] = msr; + gprs[7] = 0x9c5a203a; + + err = rdmsr_safe_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + + return err; +} + +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + u32 gprs[8] = { 0 }; + + WARN_ONCE((boot_cpu_data.x86 != 0xf), + "%s should only be used on K8!\n", __func__); + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return wrmsr_safe_regs(gprs); +} + +/* + * B step AMD K6 before B 9730xxxx have hardware bugs that can cause + * misexecution of code under Linux. Owners of such processors should + * contact AMD for precise details and a CPU swap. + * + * See http://www.multimania.com/poulot/k6bug.html + * and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6" + * (Publication # 21266 Issue Date: August 1998) + * + * The following test is erm.. interesting. AMD neglected to up + * the chip setting when fixing the bug but they also tweaked some + * performance at the same time.. + */ + +#ifdef CONFIG_X86_32 +extern __visible void vide(void); +__asm__(".text\n" + ".globl vide\n" + ".type vide, @function\n" + ".align 4\n" + "vide: ret\n"); +#endif + +static void init_amd_k5(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 +/* + * General Systems BIOSen alias the cpu frequency registers + * of the Elan at 0x000df000. Unfortunately, one of the Linux + * drivers subsequently pokes it, and changes the CPU speed. + * Workaround : Remove the unneeded alias. + */ +#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ +#define CBAR_ENB (0x80000000) +#define CBAR_KEY (0X000000CB) + if (c->x86_model == 9 || c->x86_model == 10) { + if (inl(CBAR) & CBAR_ENB) + outl(0 | CBAR_KEY, CBAR); + } +#endif +} + +static void init_amd_k6(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + u32 l, h; + int mbytes = get_num_physpages() >> (20-PAGE_SHIFT); + + if (c->x86_model < 6) { + /* Based on AMD doc 20734R - June 2000 */ + if (c->x86_model == 0) { + clear_cpu_cap(c, X86_FEATURE_APIC); + set_cpu_cap(c, X86_FEATURE_PGE); + } + return; + } + + if (c->x86_model == 6 && c->x86_stepping == 1) { + const int K6_BUG_LOOP = 1000000; + int n; + void (*f_vide)(void); + u64 d, d2; + + pr_info("AMD K6 stepping B detected - "); + + /* + * It looks like AMD fixed the 2.6.2 bug and improved indirect + * calls at the same time. + */ + + n = K6_BUG_LOOP; + f_vide = vide; + OPTIMIZER_HIDE_VAR(f_vide); + d = rdtsc(); + while (n--) + f_vide(); + d2 = rdtsc(); + d = d2-d; + + if (d > 20*K6_BUG_LOOP) + pr_cont("system stability may be impaired when more than 32 MB are used.\n"); + else + pr_cont("probably OK (after B9730xxxx).\n"); + } + + /* K6 with old style WHCR */ + if (c->x86_model < 8 || + (c->x86_model == 8 && c->x86_stepping < 8)) { + /* We can only write allocate on the low 508Mb */ + if (mbytes > 508) + mbytes = 508; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0x0000FFFF) == 0) { + unsigned long flags; + l = (1<<0)|((mbytes/4)<<1); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + pr_info("Enabling old style K6 write allocation for %d Mb\n", + mbytes); + } + return; + } + + if ((c->x86_model == 8 && c->x86_stepping > 7) || + c->x86_model == 9 || c->x86_model == 13) { + /* The more serious chips .. */ + + if (mbytes > 4092) + mbytes = 4092; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0xFFFF0000) == 0) { + unsigned long flags; + l = ((mbytes>>2)<<22)|(1<<16); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + pr_info("Enabling new style K6 write allocation for %d Mb\n", + mbytes); + } + + return; + } + + if (c->x86_model == 10) { + /* AMD Geode LX is model 10 */ + /* placeholder for any needed mods */ + return; + } +#endif +} + +static void init_amd_k7(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + u32 l, h; + + /* + * Bit 15 of Athlon specific MSR 15, needs to be 0 + * to enable SSE on Palomino/Morgan/Barton CPU's. + * If the BIOS didn't enable it already, enable it here. + */ + if (c->x86_model >= 6 && c->x86_model <= 10) { + if (!cpu_has(c, X86_FEATURE_XMM)) { + pr_info("Enabling disabled K7/SSE Support.\n"); + msr_clear_bit(MSR_K7_HWCR, 15); + set_cpu_cap(c, X86_FEATURE_XMM); + } + } + + /* + * It's been determined by AMD that Athlons since model 8 stepping 1 + * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx + * As per AMD technical note 27212 0.2 + */ + if ((c->x86_model == 8 && c->x86_stepping >= 1) || (c->x86_model > 8)) { + rdmsr(MSR_K7_CLK_CTL, l, h); + if ((l & 0xfff00000) != 0x20000000) { + pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); + wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); + } + } + + /* calling is from identify_secondary_cpu() ? */ + if (!c->cpu_index) + return; + + /* + * Certain Athlons might work (for various values of 'work') in SMP + * but they are not certified as MP capable. + */ + /* Athlon 660/661 is valid. */ + if ((c->x86_model == 6) && ((c->x86_stepping == 0) || + (c->x86_stepping == 1))) + return; + + /* Duron 670 is valid */ + if ((c->x86_model == 7) && (c->x86_stepping == 0)) + return; + + /* + * Athlon 662, Duron 671, and Athlon >model 7 have capability + * bit. It's worth noting that the A5 stepping (662) of some + * Athlon XP's have the MP bit set. + * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for + * more. + */ + if (((c->x86_model == 6) && (c->x86_stepping >= 2)) || + ((c->x86_model == 7) && (c->x86_stepping >= 1)) || + (c->x86_model > 7)) + if (cpu_has(c, X86_FEATURE_MP)) + return; + + /* If we get here, not a certified SMP capable AMD system. */ + + /* + * Don't taint if we are running SMP kernel on a single non-MP + * approved Athlon + */ + WARN_ONCE(1, "WARNING: This combination of AMD" + " processors is not suitable for SMP.\n"); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); +#endif +} + +#ifdef CONFIG_NUMA +/* + * To workaround broken NUMA config. Read the comment in + * srat_detect_node(). + */ +static int nearby_node(int apicid) +{ + int i, node; + + for (i = apicid - 1; i >= 0; i--) { + node = __apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { + node = __apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + return first_node(node_online_map); /* Shouldn't happen */ +} +#endif + +/* + * Fix up cpu_core_id for pre-F17h systems to be in the + * [0 .. cores_per_node - 1] range. Not really needed but + * kept so as not to break existing setups. + */ +static void legacy_fixup_core_id(struct cpuinfo_x86 *c) +{ + u32 cus_per_node; + + if (c->x86 >= 0x17) + return; + + cus_per_node = c->x86_max_cores / nodes_per_socket; + c->cpu_core_id %= cus_per_node; +} + +/* + * Fixup core topology information for + * (1) AMD multi-node processors + * Assumption: Number of cores in each internal node is the same. + * (2) AMD processors supporting compute units + */ +static void amd_get_topology(struct cpuinfo_x86 *c) +{ + int cpu = smp_processor_id(); + + /* get information required for multi-node processors */ + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + int err; + u32 eax, ebx, ecx, edx; + + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + + c->cpu_die_id = ecx & 0xff; + + if (c->x86 == 0x15) + c->cu_id = ebx & 0xff; + + if (c->x86 >= 0x17) { + c->cpu_core_id = ebx & 0xff; + + if (smp_num_siblings > 1) + c->x86_max_cores /= smp_num_siblings; + } + + /* + * In case leaf B is available, use it to derive + * topology information. + */ + err = detect_extended_topology(c); + if (!err) + c->x86_coreid_bits = get_count_order(c->x86_max_cores); + + cacheinfo_amd_init_llc_id(c, cpu); + + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + c->cpu_die_id = value & 7; + + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + } else + return; + + if (nodes_per_socket > 1) { + set_cpu_cap(c, X86_FEATURE_AMD_DCM); + legacy_fixup_core_id(c); + } +} + +/* + * On a AMD dual core setup the lower bits of the APIC id distinguish the cores. + * Assumes number of cores is a power of two. + */ +static void amd_detect_cmp(struct cpuinfo_x86 *c) +{ + unsigned bits; + int cpu = smp_processor_id(); + + bits = c->x86_coreid_bits; + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + /* Convert the initial APIC ID into the socket ID */ + c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; +} + +static void amd_detect_ppin(struct cpuinfo_x86 *c) +{ + unsigned long long val; + + if (!cpu_has(c, X86_FEATURE_AMD_PPIN)) + return; + + /* When PPIN is defined in CPUID, still need to check PPIN_CTL MSR */ + if (rdmsrl_safe(MSR_AMD_PPIN_CTL, &val)) + goto clear_ppin; + + /* PPIN is locked in disabled mode, clear feature bit */ + if ((val & 3UL) == 1UL) + goto clear_ppin; + + /* If PPIN is disabled, try to enable it */ + if (!(val & 2UL)) { + wrmsrl_safe(MSR_AMD_PPIN_CTL, val | 2UL); + rdmsrl_safe(MSR_AMD_PPIN_CTL, &val); + } + + /* If PPIN_EN bit is 1, return from here; otherwise fall through */ + if (val & 2UL) + return; + +clear_ppin: + clear_cpu_cap(c, X86_FEATURE_AMD_PPIN); +} + +u16 amd_get_nb_id(int cpu) +{ + return per_cpu(cpu_llc_id, cpu); +} +EXPORT_SYMBOL_GPL(amd_get_nb_id); + +u32 amd_get_nodes_per_socket(void) +{ + return nodes_per_socket; +} +EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket); + +static void srat_detect_node(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_NUMA + int cpu = smp_processor_id(); + int node; + unsigned apicid = c->apicid; + + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE) + node = per_cpu(cpu_llc_id, cpu); + + /* + * On multi-fabric platform (e.g. Numascale NumaChip) a + * platform-specific handler needs to be called to fixup some + * IDs of the CPU. + */ + if (x86_cpuinit.fixup_cpu_id) + x86_cpuinit.fixup_cpu_id(c, node); + + if (!node_online(node)) { + /* + * Two possibilities here: + * + * - The CPU is missing memory and no node was created. In + * that case try picking one from a nearby CPU. + * + * - The APIC IDs differ from the HyperTransport node IDs + * which the K8 northbridge parsing fills in. Assume + * they are all increased by a constant offset, but in + * the same order as the HT nodeids. If that doesn't + * result in a usable node fall back to the path for the + * previous case. + * + * This workaround operates directly on the mapping between + * APIC ID and NUMA node, assuming certain relationship + * between APIC ID, HT node ID and NUMA topology. As going + * through CPU mapping may alter the outcome, directly + * access __apicid_to_node[]. + */ + int ht_nodeid = c->initial_apicid; + + if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = __apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); +#endif +} + +static void early_init_amd_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; + + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } + + c->x86_coreid_bits = bits; +#endif +} + +static void bsp_init_amd(struct cpuinfo_x86 *c) +{ + +#ifdef CONFIG_X86_64 + if (c->x86 >= 0xf) { + unsigned long long tseg; + + /* + * Split up direct mapping around the TSEG SMM area. + * Don't do it for gbpages because there seems very little + * benefit in doing so. + */ + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + unsigned long pfn = tseg >> PAGE_SHIFT; + + pr_debug("tseg: %010llx\n", tseg); + if (pfn_range_is_mapped(pfn, pfn + 1)) + set_memory_4k((unsigned long)__va(tseg), 1); + } + } +#endif + + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + + if (c->x86 > 0x10 || + (c->x86 == 0x10 && c->x86_model >= 0x2)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); + } + } + + if (c->x86 == 0x15) { + unsigned long upperbit; + u32 cpuid, assoc; + + cpuid = cpuid_edx(0x80000005); + assoc = cpuid >> 16 & 0xff; + upperbit = ((cpuid >> 24) << 10) / assoc; + + va_align.mask = (upperbit - 1) & PAGE_MASK; + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + + /* A random value per boot for bit slice [12:upper_bit) */ + va_align.bits = get_random_int() & va_align.mask; + } + + if (cpu_has(c, X86_FEATURE_MWAITX)) + use_mwaitx_delay(); + + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + u32 ecx; + + ecx = cpuid_ecx(0x8000001e); + __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; + } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; + } + + if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && + !boot_cpu_has(X86_FEATURE_VIRT_SSBD) && + c->x86 >= 0x15 && c->x86 <= 0x17) { + unsigned int bit; + + switch (c->x86) { + case 0x15: bit = 54; break; + case 0x16: bit = 33; break; + case 0x17: bit = 10; break; + default: return; + } + /* + * Try to cache the base value so further operations can + * avoid RMW. If that faults, do not enable SSBD. + */ + if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); + setup_force_cpu_cap(X86_FEATURE_SSBD); + x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; + } + } + + resctrl_cpu_detect(c); +} + +static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) +{ + u64 msr; + + /* + * BIOS support is required for SME and SEV. + * For SME: If BIOS has enabled SME then adjust x86_phys_bits by + * the SME physical address space reduction value. + * If BIOS has not enabled SME then don't advertise the + * SME feature (set in scattered.c). + * For SEV: If BIOS has not enabled SEV then don't advertise the + * SEV and SEV_ES feature (set in scattered.c). + * + * In all cases, since support for SME and SEV requires long mode, + * don't advertise the feature under CONFIG_X86_32. + */ + if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { + /* Check if memory encryption is enabled */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + goto clear_all; + + /* + * Always adjust physical address bits. Even though this + * will be a value above 32-bits this is still done for + * CONFIG_X86_32 so that accurate values are reported. + */ + c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; + + if (IS_ENABLED(CONFIG_X86_32)) + goto clear_all; + + rdmsrl(MSR_K7_HWCR, msr); + if (!(msr & MSR_K7_HWCR_SMMLOCK)) + goto clear_sev; + + return; + +clear_all: + setup_clear_cpu_cap(X86_FEATURE_SME); +clear_sev: + setup_clear_cpu_cap(X86_FEATURE_SEV); + setup_clear_cpu_cap(X86_FEATURE_SEV_ES); + } +} + +static void early_init_amd(struct cpuinfo_x86 *c) +{ + u64 value; + u32 dummy; + + early_init_amd_mc(c); + +#ifdef CONFIG_X86_32 + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_K7); +#endif + + if (c->x86 >= 0xf) + set_cpu_cap(c, X86_FEATURE_K8); + + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + + /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ + if (c->x86_power & BIT(12)) + set_cpu_cap(c, X86_FEATURE_ACC_POWER); + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSCALL32); +#else + /* Set MTRR capability flag if appropriate */ + if (c->x86 == 5) + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_stepping >= 8)) + set_cpu_cap(c, X86_FEATURE_K6_MTRR); +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) + /* + * ApicID can always be treated as an 8-bit value for AMD APIC versions + * >= 0x10, but even old K8s came out of reset with version 0x10. So, we + * can safely set X86_FEATURE_EXTD_APICID unconditionally for families + * after 16h. + */ + if (boot_cpu_has(X86_FEATURE_APIC)) { + if (c->x86 > 0x16) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + else if (c->x86 >= 0xf) { + /* check CPU config space for extended APIC ID */ + unsigned int val; + + val = read_pci_config(0, 24, 0, 0x68); + if ((val >> 17 & 0x3) == 0x3) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } + } +#endif + + /* + * This is only needed to tell the kernel whether to use VMCALL + * and VMMCALL. VMMCALL is never executed except under virt, so + * we can set it unconditionally. + */ + set_cpu_cap(c, X86_FEATURE_VMMCALL); + + /* F16h erratum 793, CVE-2013-6885 */ + if (c->x86 == 0x16 && c->x86_model <= 0xf) + msr_set_bit(MSR_AMD64_LS_CFG, 15); + + /* + * Check whether the machine is affected by erratum 400. This is + * used to select the proper idle routine and to enable the check + * whether the machine is affected in arch_post_acpi_init(), which + * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. + */ + if (cpu_has_amd_erratum(c, amd_erratum_400)) + set_cpu_bug(c, X86_BUG_AMD_E400); + + early_detect_mem_encrypt(c); + + /* Re-enable TopologyExtensions if switched off by BIOS */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x6f) && + !cpu_has(c, X86_FEATURE_TOPOEXT)) { + + if (msr_set_bit(0xc0011005, 54) > 0) { + rdmsrl(0xc0011005, value); + if (value & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } + } + } + + if (cpu_has(c, X86_FEATURE_TOPOEXT)) + smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; +} + +static void init_amd_k8(struct cpuinfo_x86 *c) +{ + u32 level; + u64 value; + + /* On C+ stepping K8 rep microcode works well for copy/memset */ + level = cpuid_eax(1); + if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* + * Some BIOSes incorrectly force this feature, but only K8 revision D + * (model = 0x14) and later actually support it. + * (AMD Erratum #110, docId: 25759). + */ + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); + if (!rdmsrl_amd_safe(0xc001100d, &value)) { + value &= ~BIT_64(32); + wrmsrl_amd_safe(0xc001100d, value); + } + } + + if (!c->x86_model_id[0]) + strcpy(c->x86_model_id, "Hammer"); + +#ifdef CONFIG_SMP + /* + * Disable TLB flush filter by setting HWCR.FFDIS on K8 + * bit 6 of msr C001_0015 + * + * Errata 63 for SH-B3 steppings + * Errata 122 for all steppings (F+ have it disabled by default) + */ + msr_set_bit(MSR_K7_HWCR, 6); +#endif + set_cpu_bug(c, X86_BUG_SWAPGS_FENCE); +} + +static void init_amd_gh(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_MMCONF_FAM10H + /* do this for boot cpu */ + if (c == &boot_cpu_data) + check_enable_amd_mmconf_dmi(); + + fam10h_check_enable_mmcfg(); +#endif + + /* + * Disable GART TLB Walk Errors on Fam10h. We do this here because this + * is always needed when GART is enabled, even in a kernel which has no + * MCE support built in. BIOS should disable GartTlbWlk Errors already. + * If it doesn't, we do it here as suggested by the BKDG. + * + * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 + */ + msr_set_bit(MSR_AMD64_MCx_MASK(4), 10); + + /* + * On family 10h BIOS may not have properly enabled WC+ support, causing + * it to be converted to CD memtype. This may result in performance + * degradation for certain nested-paging guests. Prevent this conversion + * by clearing bit 24 in MSR_AMD64_BU_CFG2. + * + * NOTE: we want to use the _safe accessors so as not to #GP kvm + * guests on older kvm hosts. + */ + msr_clear_bit(MSR_AMD64_BU_CFG2, 24); + + if (cpu_has_amd_erratum(c, amd_erratum_383)) + set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); +} + +static void init_amd_ln(struct cpuinfo_x86 *c) +{ + /* + * Apply erratum 665 fix unconditionally so machines without a BIOS + * fix work. + */ + msr_set_bit(MSR_AMD64_DE_CFG, 31); +} + +static bool rdrand_force; + +static int __init rdrand_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "force")) + rdrand_force = true; + else + return -EINVAL; + + return 0; +} +early_param("rdrand", rdrand_cmdline); + +static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c) +{ + /* + * Saving of the MSR used to hide the RDRAND support during + * suspend/resume is done by arch/x86/power/cpu.c, which is + * dependent on CONFIG_PM_SLEEP. + */ + if (!IS_ENABLED(CONFIG_PM_SLEEP)) + return; + + /* + * The nordrand option can clear X86_FEATURE_RDRAND, so check for + * RDRAND support using the CPUID function directly. + */ + if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force) + return; + + msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62); + + /* + * Verify that the CPUID change has occurred in case the kernel is + * running virtualized and the hypervisor doesn't support the MSR. + */ + if (cpuid_ecx(1) & BIT(30)) { + pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n"); + return; + } + + clear_cpu_cap(c, X86_FEATURE_RDRAND); + pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n"); +} + +static void init_amd_jg(struct cpuinfo_x86 *c) +{ + /* + * Some BIOS implementations do not restore proper RDRAND support + * across suspend and resume. Check on whether to hide the RDRAND + * instruction support via CPUID. + */ + clear_rdrand_cpuid_bit(c); +} + +static void init_amd_bd(struct cpuinfo_x86 *c) +{ + u64 value; + + /* + * The way access filter has a performance penalty on some workloads. + * Disable it on the affected CPUs. + */ + if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) { + if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { + value |= 0x1E; + wrmsrl_safe(MSR_F15H_IC_CFG, value); + } + } + + /* + * Some BIOS implementations do not restore proper RDRAND support + * across suspend and resume. Check on whether to hide the RDRAND + * instruction support via CPUID. + */ + clear_rdrand_cpuid_bit(c); +} + +void init_spectral_chicken(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_CPU_UNRET_ENTRY + u64 value; + + /* + * On Zen2 we offer this chicken (bit) on the altar of Speculation. + * + * This suppresses speculation from the middle of a basic block, i.e. it + * suppresses non-branch predictions. + * + * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H + */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) { + if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { + value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; + wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); + } + } +#endif + /* + * Work around Erratum 1386. The XSAVES instruction malfunctions in + * certain circumstances on Zen1/2 uarch, and not all parts have had + * updated microcode at the time of writing (March 2023). + * + * Affected parts all have no supervisor XSAVE states, meaning that + * the XSAVEC instruction (which works fine) is equivalent. + */ + clear_cpu_cap(c, X86_FEATURE_XSAVES); +} + +static void init_amd_zn(struct cpuinfo_x86 *c) +{ + set_cpu_cap(c, X86_FEATURE_ZEN); + +#ifdef CONFIG_NUMA + node_reclaim_distance = 32; +#endif + + /* Fix up CPUID bits, but only if not virtualised. */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { + + /* Erratum 1076: CPB feature bit not being set in CPUID. */ + if (!cpu_has(c, X86_FEATURE_CPB)) + set_cpu_cap(c, X86_FEATURE_CPB); + + /* + * Zen3 (Fam19 model < 0x10) parts are not susceptible to + * Branch Type Confusion, but predate the allocation of the + * BTC_NO bit. + */ + if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) + set_cpu_cap(c, X86_FEATURE_BTC_NO); + } +} + +static bool cpu_has_zenbleed_microcode(void) +{ + u32 good_rev = 0; + + switch (boot_cpu_data.x86_model) { + case 0x30 ... 0x3f: good_rev = 0x0830107a; break; + case 0x60 ... 0x67: good_rev = 0x0860010b; break; + case 0x68 ... 0x6f: good_rev = 0x08608105; break; + case 0x70 ... 0x7f: good_rev = 0x08701032; break; + case 0xa0 ... 0xaf: good_rev = 0x08a00008; break; + + default: + return false; + break; + } + + if (boot_cpu_data.microcode < good_rev) + return false; + + return true; +} + +static void zenbleed_check(struct cpuinfo_x86 *c) +{ + if (!cpu_has_amd_erratum(c, amd_zenbleed)) + return; + + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) + return; + + if (!cpu_has(c, X86_FEATURE_AVX)) + return; + + if (!cpu_has_zenbleed_microcode()) { + pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n"); + msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); + } else { + msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); + } +} + +static void init_amd(struct cpuinfo_x86 *c) +{ + early_init_amd(c); + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); + + if (c->x86 >= 0x10) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* get apicid instead of initial apic id from cpuid */ + c->apicid = hard_smp_processor_id(); + + /* K6s reports MCEs but don't actually have all the MSRs */ + if (c->x86 < 6) + clear_cpu_cap(c, X86_FEATURE_MCE); + + switch (c->x86) { + case 4: init_amd_k5(c); break; + case 5: init_amd_k6(c); break; + case 6: init_amd_k7(c); break; + case 0xf: init_amd_k8(c); break; + case 0x10: init_amd_gh(c); break; + case 0x12: init_amd_ln(c); break; + case 0x15: init_amd_bd(c); break; + case 0x16: init_amd_jg(c); break; + case 0x17: init_spectral_chicken(c); + fallthrough; + case 0x19: init_amd_zn(c); break; + } + + /* + * Enable workaround for FXSAVE leak on CPUs + * without a XSaveErPtr feature + */ + if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR))) + set_cpu_bug(c, X86_BUG_FXSAVE_LEAK); + + cpu_detect_cache_sizes(c); + + amd_detect_cmp(c); + amd_get_topology(c); + srat_detect_node(c); + amd_detect_ppin(c); + + init_amd_cacheinfo(c); + + if (cpu_has(c, X86_FEATURE_XMM2)) { + /* + * Use LFENCE for execution serialization. On families which + * don't have that MSR, LFENCE is already serializing. + * msr_set_bit() uses the safe accessors, too, even if the MSR + * is not present. + */ + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); + + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + } + + /* + * Family 0x12 and above processors have APIC timer + * running in deep C states. + */ + if (c->x86 > 0x11) + set_cpu_cap(c, X86_FEATURE_ARAT); + + /* 3DNow or LM implies PREFETCHW */ + if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) + if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) + set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); + + /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */ + if (!cpu_has(c, X86_FEATURE_XENPV)) + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + + /* + * Turn on the Instructions Retired free counter on machines not + * susceptible to erratum #1054 "Instructions Retired Performance + * Counter May Be Inaccurate". + */ + if (cpu_has(c, X86_FEATURE_IRPERF) && + !cpu_has_amd_erratum(c, amd_erratum_1054)) + msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); + + check_null_seg_clears_base(c); + + zenbleed_check(c); + + if (cpu_has_amd_erratum(c, amd_div0)) { + pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); + setup_force_cpu_bug(X86_BUG_DIV0); + } + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && + cpu_has_amd_erratum(c, amd_erratum_1485)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); +} + +#ifdef CONFIG_X86_32 +static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + /* AMD errata T13 (order #21922) */ + if (c->x86 == 6) { + /* Duron Rev A0 */ + if (c->x86_model == 3 && c->x86_stepping == 0) + size = 64; + /* Tbird rev A1/A2 */ + if (c->x86_model == 4 && + (c->x86_stepping == 0 || c->x86_stepping == 1)) + size = 256; + } + return size; +} +#endif + +static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) +{ + u32 ebx, eax, ecx, edx; + u16 mask = 0xfff; + + if (c->x86 < 0xf) + return; + + if (c->extended_cpuid_level < 0x80000006) + return; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; + tlb_lli_4k[ENTRIES] = ebx & mask; + + /* + * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB + * characteristics from the CPUID function 0x80000005 instead. + */ + if (c->x86 == 0xf) { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + mask = 0xff; + } + + /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!((eax >> 16) & mask)) + tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + else + tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + + /* a 4M entry uses two 2M entries */ + tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + + /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!(eax & mask)) { + /* Erratum 658 */ + if (c->x86 == 0x15 && c->x86_model <= 0x1f) { + tlb_lli_2m[ENTRIES] = 1024; + } else { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + tlb_lli_2m[ENTRIES] = eax & 0xff; + } + } else + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; +} + +static const struct cpu_dev amd_cpu_dev = { + .c_vendor = "AMD", + .c_ident = { "AuthenticAMD" }, +#ifdef CONFIG_X86_32 + .legacy_models = { + { .family = 4, .model_names = + { + [3] = "486 DX/2", + [7] = "486 DX/2-WB", + [8] = "486 DX/4", + [9] = "486 DX/4-WB", + [14] = "Am5x86-WT", + [15] = "Am5x86-WB" + } + }, + }, + .legacy_cache_size = amd_size_cache, +#endif + .c_early_init = early_init_amd, + .c_detect_tlb = cpu_detect_tlb_amd, + .c_bsp_init = bsp_init_amd, + .c_init = init_amd, + .c_x86_vendor = X86_VENDOR_AMD, +}; + +cpu_dev_register(amd_cpu_dev); + +void set_dr_addr_mask(unsigned long mask, int dr) +{ + if (!boot_cpu_has(X86_FEATURE_BPEXT)) + return; + + switch (dr) { + case 0: + wrmsr(MSR_F16H_DR0_ADDR_MASK, mask, 0); + break; + case 1: + case 2: + case 3: + wrmsr(MSR_F16H_DR1_ADDR_MASK - 1 + dr, mask, 0); + break; + default: + break; + } +} + +bool cpu_has_ibpb_brtype_microcode(void) +{ + switch (boot_cpu_data.x86) { + /* Zen1/2 IBPB flushes branch type predictions too. */ + case 0x17: + return boot_cpu_has(X86_FEATURE_AMD_IBPB); + case 0x19: + /* Poke the MSR bit on Zen3/4 to check its presence. */ + if (!wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { + setup_force_cpu_cap(X86_FEATURE_SBPB); + return true; + } else { + return false; + } + default: + return false; + } +} + +static void zenbleed_check_cpu(void *unused) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + zenbleed_check(c); +} + +void amd_check_microcode(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + + on_each_cpu(zenbleed_check_cpu, NULL, 1); +} + +/* + * Issue a DIV 0/1 insn to clear any division data from previous DIV + * operations. + */ +void noinstr amd_clear_divider(void) +{ + asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0) + :: "a" (0), "d" (0), "r" (1)); +} +EXPORT_SYMBOL_GPL(amd_clear_divider); diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c new file mode 100644 index 000000000..e2f319dc9 --- /dev/null +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * x86 APERF/MPERF KHz calculation for + * /sys/.../cpufreq/scaling_cur_freq + * + * Copyright (C) 2017 Intel Corp. + * Author: Len Brown + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cpu.h" + +struct aperfmperf_sample { + unsigned int khz; + ktime_t time; + u64 aperf; + u64 mperf; +}; + +static DEFINE_PER_CPU(struct aperfmperf_sample, samples); + +#define APERFMPERF_CACHE_THRESHOLD_MS 10 +#define APERFMPERF_REFRESH_DELAY_MS 10 +#define APERFMPERF_STALE_THRESHOLD_MS 1000 + +/* + * aperfmperf_snapshot_khz() + * On the current CPU, snapshot APERF, MPERF, and jiffies + * unless we already did it within 10ms + * calculate kHz, save snapshot + */ +static void aperfmperf_snapshot_khz(void *dummy) +{ + u64 aperf, aperf_delta; + u64 mperf, mperf_delta; + struct aperfmperf_sample *s = this_cpu_ptr(&samples); + unsigned long flags; + + local_irq_save(flags); + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + local_irq_restore(flags); + + aperf_delta = aperf - s->aperf; + mperf_delta = mperf - s->mperf; + + /* + * There is no architectural guarantee that MPERF + * increments faster than we can read it. + */ + if (mperf_delta == 0) + return; + + s->time = ktime_get(); + s->aperf = aperf; + s->mperf = mperf; + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); +} + +static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) +{ + s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu)); + + /* Don't bother re-computing within the cache threshold time. */ + if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) + return true; + + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); + + /* Return false if the previous iteration was too long ago. */ + return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; +} + +unsigned int aperfmperf_get_khz(int cpu) +{ + if (!cpu_khz) + return 0; + + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) + return 0; + + if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + return 0; + + aperfmperf_snapshot_cpu(cpu, ktime_get(), true); + return per_cpu(samples.khz, cpu); +} + +void arch_freq_prepare_all(void) +{ + ktime_t now = ktime_get(); + bool wait = false; + int cpu; + + if (!cpu_khz) + return; + + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) + return; + + for_each_online_cpu(cpu) { + if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + continue; + if (!aperfmperf_snapshot_cpu(cpu, now, false)) + wait = true; + } + + if (wait) + msleep(APERFMPERF_REFRESH_DELAY_MS); +} + +unsigned int arch_freq_get_on_cpu(int cpu) +{ + if (!cpu_khz) + return 0; + + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) + return 0; + + if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + return 0; + + if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) + return per_cpu(samples.khz, cpu); + + msleep(APERFMPERF_REFRESH_DELAY_MS); + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); + + return per_cpu(samples.khz, cpu); +} diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c new file mode 100644 index 000000000..d9fda0b6e --- /dev/null +++ b/arch/x86/kernel/cpu/bugs.c @@ -0,0 +1,2757 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1994 Linus Torvalds + * + * Cyrix stuff, June 1998 by: + * - Rafael R. Reilova (moved everything from head.S), + * + * - Channing Corn (tests & fixes), + * - Andrew D. Balsa (code cleanup). + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpu.h" + +static void __init spectre_v1_select_mitigation(void); +static void __init spectre_v2_select_mitigation(void); +static void __init retbleed_select_mitigation(void); +static void __init spectre_v2_user_select_mitigation(void); +static void __init ssb_select_mitigation(void); +static void __init l1tf_select_mitigation(void); +static void __init mds_select_mitigation(void); +static void __init md_clear_update_mitigation(void); +static void __init md_clear_select_mitigation(void); +static void __init taa_select_mitigation(void); +static void __init mmio_select_mitigation(void); +static void __init srbds_select_mitigation(void); +static void __init gds_select_mitigation(void); +static void __init srso_select_mitigation(void); + +/* The base value of the SPEC_CTRL MSR without task-specific bits set */ +u64 x86_spec_ctrl_base; +EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + +/* The current value of the SPEC_CTRL MSR with task-specific bits set */ +DEFINE_PER_CPU(u64, x86_spec_ctrl_current); +EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); + +u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; +EXPORT_SYMBOL_GPL(x86_pred_cmd); + +static DEFINE_MUTEX(spec_ctrl_mutex); + +void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk; + +/* Update SPEC_CTRL MSR and its cached copy unconditionally */ +static void update_spec_ctrl(u64 val) +{ + this_cpu_write(x86_spec_ctrl_current, val); + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + +/* + * Keep track of the SPEC_CTRL MSR value for the current task, which may differ + * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). + */ +void update_spec_ctrl_cond(u64 val) +{ + if (this_cpu_read(x86_spec_ctrl_current) == val) + return; + + this_cpu_write(x86_spec_ctrl_current, val); + + /* + * When KERNEL_IBRS this MSR is written on return-to-user, unless + * forced the update can be delayed until that time. + */ + if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + +u64 spec_ctrl_current(void) +{ + return this_cpu_read(x86_spec_ctrl_current); +} +EXPORT_SYMBOL_GPL(spec_ctrl_current); + +/* + * AMD specific MSR info for Speculative Store Bypass control. + * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu(). + */ +u64 __ro_after_init x86_amd_ls_cfg_base; +u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask; + +/* Control conditional STIBP in switch_to() */ +DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp); +/* Control conditional IBPB in switch_mm() */ +DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); +/* Control unconditional IBPB in switch_mm() */ +DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); + +/* Control MDS CPU buffer clear before returning to user space */ +DEFINE_STATIC_KEY_FALSE(mds_user_clear); +EXPORT_SYMBOL_GPL(mds_user_clear); +/* Control MDS CPU buffer clear before idling (halt, mwait) */ +DEFINE_STATIC_KEY_FALSE(mds_idle_clear); +EXPORT_SYMBOL_GPL(mds_idle_clear); + +/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ +DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); +EXPORT_SYMBOL_GPL(mmio_stale_data_clear); + +void __init cpu_select_mitigations(void) +{ + /* + * Read the SPEC_CTRL MSR to account for reserved bits which may + * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD + * init code as it is not enumerated and depends on the family. + */ + if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { + rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + + /* + * Previously running kernel (kexec), may have some controls + * turned ON. Clear them and let the mitigations setup below + * rediscover them based on configuration. + */ + x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; + } + + /* Select the proper CPU mitigations before patching alternatives: */ + spectre_v1_select_mitigation(); + spectre_v2_select_mitigation(); + /* + * retbleed_select_mitigation() relies on the state set by + * spectre_v2_select_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_select_mitigation(); + /* + * spectre_v2_user_select_mitigation() relies on the state set by + * retbleed_select_mitigation(); specifically the STIBP selection is + * forced for UNRET or IBPB. + */ + spectre_v2_user_select_mitigation(); + ssb_select_mitigation(); + l1tf_select_mitigation(); + md_clear_select_mitigation(); + srbds_select_mitigation(); + + /* + * srso_select_mitigation() depends and must run after + * retbleed_select_mitigation(). + */ + srso_select_mitigation(); + gds_select_mitigation(); +} + +/* + * NOTE: For VMX, this function is not called in the vmexit path. + * It uses vmx_spec_ctrl_restore_host() instead. + */ +void +x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) +{ + u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current(); + struct thread_info *ti = current_thread_info(); + + if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { + if (hostval != guestval) { + msrval = setguest ? guestval : hostval; + wrmsrl(MSR_IA32_SPEC_CTRL, msrval); + } + } + + /* + * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update + * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported. + */ + if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) && + !static_cpu_has(X86_FEATURE_VIRT_SSBD)) + return; + + /* + * If the host has SSBD mitigation enabled, force it in the host's + * virtual MSR value. If its not permanently enabled, evaluate + * current's TIF_SSBD thread flag. + */ + if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE)) + hostval = SPEC_CTRL_SSBD; + else + hostval = ssbd_tif_to_spec_ctrl(ti->flags); + + /* Sanitize the guest value */ + guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD; + + if (hostval != guestval) { + unsigned long tif; + + tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) : + ssbd_spec_ctrl_to_tif(hostval); + + speculation_ctrl_update(tif); + } +} +EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); + +static void x86_amd_ssb_disable(void) +{ + u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; + + if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) + wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); + else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) + wrmsrl(MSR_AMD64_LS_CFG, msrval); +} + +#undef pr_fmt +#define pr_fmt(fmt) "MDS: " fmt + +/* Default mitigation for MDS-affected CPUs */ +static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; +static bool mds_nosmt __ro_after_init = false; + +static const char * const mds_strings[] = { + [MDS_MITIGATION_OFF] = "Vulnerable", + [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers", + [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", +}; + +static void __init mds_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { + mds_mitigation = MDS_MITIGATION_OFF; + return; + } + + if (mds_mitigation == MDS_MITIGATION_FULL) { + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) + mds_mitigation = MDS_MITIGATION_VMWERV; + + static_branch_enable(&mds_user_clear); + + if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && + (mds_nosmt || cpu_mitigations_auto_nosmt())) + cpu_smt_disable(false); + } +} + +static int __init mds_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + mds_mitigation = MDS_MITIGATION_OFF; + else if (!strcmp(str, "full")) + mds_mitigation = MDS_MITIGATION_FULL; + else if (!strcmp(str, "full,nosmt")) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_nosmt = true; + } + + return 0; +} +early_param("mds", mds_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "TAA: " fmt + +enum taa_mitigations { + TAA_MITIGATION_OFF, + TAA_MITIGATION_UCODE_NEEDED, + TAA_MITIGATION_VERW, + TAA_MITIGATION_TSX_DISABLED, +}; + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; +static bool taa_nosmt __ro_after_init; + +static const char * const taa_strings[] = { + [TAA_MITIGATION_OFF] = "Vulnerable", + [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", + [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", +}; + +static void __init taa_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TSX previously disabled by tsx=off */ + if (!boot_cpu_has(X86_FEATURE_RTM)) { + taa_mitigation = TAA_MITIGATION_TSX_DISABLED; + return; + } + + if (cpu_mitigations_off()) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* + * TAA mitigation via VERW is turned off if both + * tsx_async_abort=off and mds=off are specified. + */ + if (taa_mitigation == TAA_MITIGATION_OFF && + mds_mitigation == MDS_MITIGATION_OFF) + return; + + if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_VERW; + else + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ( (ia32_cap & ARCH_CAP_MDS_NO) && + !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + static_branch_enable(&mds_user_clear); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); +} + +static int __init tsx_async_abort_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_TAA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + taa_mitigation = TAA_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + taa_mitigation = TAA_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_nosmt = true; + } + + return 0; +} +early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "MMIO Stale Data: " fmt + +enum mmio_mitigations { + MMIO_MITIGATION_OFF, + MMIO_MITIGATION_UCODE_NEEDED, + MMIO_MITIGATION_VERW, +}; + +/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ +static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW; +static bool mmio_nosmt __ro_after_init = false; + +static const char * const mmio_strings[] = { + [MMIO_MITIGATION_OFF] = "Vulnerable", + [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", +}; + +static void __init mmio_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || + boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || + cpu_mitigations_off()) { + mmio_mitigation = MMIO_MITIGATION_OFF; + return; + } + + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + /* + * Enable CPU buffer clear mitigation for host and VMM, if also affected + * by MDS or TAA. Otherwise, enable mitigation for VMM only. + */ + if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && + boot_cpu_has(X86_FEATURE_RTM))) + static_branch_enable(&mds_user_clear); + else + static_branch_enable(&mmio_stale_data_clear); + + /* + * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can + * be propagated to uncore buffers, clearing the Fill buffers on idle + * is required irrespective of SMT state. + */ + if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) + static_branch_enable(&mds_idle_clear); + + /* + * Check if the system has the right microcode. + * + * CPU Fill buffer clear mitigation is enumerated by either an explicit + * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS + * affected systems. + */ + if ((ia32_cap & ARCH_CAP_FB_CLEAR) || + (boot_cpu_has(X86_FEATURE_MD_CLEAR) && + boot_cpu_has(X86_FEATURE_FLUSH_L1D) && + !(ia32_cap & ARCH_CAP_MDS_NO))) + mmio_mitigation = MMIO_MITIGATION_VERW; + else + mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; + + if (mmio_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); +} + +static int __init mmio_stale_data_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + mmio_mitigation = MMIO_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_nosmt = true; + } + + return 0; +} +early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "" fmt + +static void __init md_clear_update_mitigation(void) +{ + if (cpu_mitigations_off()) + return; + + if (!static_key_enabled(&mds_user_clear)) + goto out; + + /* + * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data + * mitigation, if necessary. + */ + if (mds_mitigation == MDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_select_mitigation(); + } + if (taa_mitigation == TAA_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_select_mitigation(); + } + if (mmio_mitigation == MMIO_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_select_mitigation(); + } +out: + if (boot_cpu_has_bug(X86_BUG_MDS)) + pr_info("MDS: %s\n", mds_strings[mds_mitigation]); + if (boot_cpu_has_bug(X86_BUG_TAA)) + pr_info("TAA: %s\n", taa_strings[taa_mitigation]); + if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); + else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + pr_info("MMIO Stale Data: Unknown: No mitigations\n"); +} + +static void __init md_clear_select_mitigation(void) +{ + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + + /* + * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update + * and print their mitigation after MDS, TAA and MMIO Stale Data + * mitigation selection is done. + */ + md_clear_update_mitigation(); +} + +#undef pr_fmt +#define pr_fmt(fmt) "SRBDS: " fmt + +enum srbds_mitigations { + SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_UCODE_NEEDED, + SRBDS_MITIGATION_FULL, + SRBDS_MITIGATION_TSX_OFF, + SRBDS_MITIGATION_HYPERVISOR, +}; + +static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; + +static const char * const srbds_strings[] = { + [SRBDS_MITIGATION_OFF] = "Vulnerable", + [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode", + [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled", + [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +static bool srbds_off; + +void update_srbds_msr(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + switch (srbds_mitigation) { + case SRBDS_MITIGATION_OFF: + case SRBDS_MITIGATION_TSX_OFF: + mcu_ctrl |= RNGDS_MITG_DIS; + break; + case SRBDS_MITIGATION_FULL: + mcu_ctrl &= ~RNGDS_MITG_DIS; + break; + default: + break; + } + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); +} + +static void __init srbds_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + /* + * Check to see if this is one of the MDS_NO systems supporting TSX that + * are only exposed to SRBDS when TSX is enabled or when CPU is affected + * by Processor MMIO Stale Data vulnerability. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && + !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; + else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; + else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; + else if (cpu_mitigations_off() || srbds_off) + srbds_mitigation = SRBDS_MITIGATION_OFF; + + update_srbds_msr(); + pr_info("%s\n", srbds_strings[srbds_mitigation]); +} + +static int __init srbds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return 0; + + srbds_off = !strcmp(str, "off"); + return 0; +} +early_param("srbds", srbds_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "GDS: " fmt + +enum gds_mitigations { + GDS_MITIGATION_OFF, + GDS_MITIGATION_UCODE_NEEDED, + GDS_MITIGATION_FORCE, + GDS_MITIGATION_FULL, + GDS_MITIGATION_FULL_LOCKED, + GDS_MITIGATION_HYPERVISOR, +}; + +#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION) +static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; +#else +static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; +#endif + +static const char * const gds_strings[] = { + [GDS_MITIGATION_OFF] = "Vulnerable", + [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode", + [GDS_MITIGATION_FULL] = "Mitigation: Microcode", + [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)", + [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +bool gds_ucode_mitigated(void) +{ + return (gds_mitigation == GDS_MITIGATION_FULL || + gds_mitigation == GDS_MITIGATION_FULL_LOCKED); +} +EXPORT_SYMBOL_GPL(gds_ucode_mitigated); + +void update_gds_msr(void) +{ + u64 mcu_ctrl_after; + u64 mcu_ctrl; + + switch (gds_mitigation) { + case GDS_MITIGATION_OFF: + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + mcu_ctrl |= GDS_MITG_DIS; + break; + case GDS_MITIGATION_FULL_LOCKED: + /* + * The LOCKED state comes from the boot CPU. APs might not have + * the same state. Make sure the mitigation is enabled on all + * CPUs. + */ + case GDS_MITIGATION_FULL: + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + mcu_ctrl &= ~GDS_MITG_DIS; + break; + case GDS_MITIGATION_FORCE: + case GDS_MITIGATION_UCODE_NEEDED: + case GDS_MITIGATION_HYPERVISOR: + return; + }; + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + /* + * Check to make sure that the WRMSR value was not ignored. Writes to + * GDS_MITG_DIS will be ignored if this processor is locked but the boot + * processor was not. + */ + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); + WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after); +} + +static void __init gds_select_mitigation(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + gds_mitigation = GDS_MITIGATION_HYPERVISOR; + goto out; + } + + if (cpu_mitigations_off()) + gds_mitigation = GDS_MITIGATION_OFF; + /* Will verify below that mitigation _can_ be disabled */ + + /* No microcode */ + if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { + if (gds_mitigation == GDS_MITIGATION_FORCE) { + /* + * This only needs to be done on the boot CPU so do it + * here rather than in update_gds_msr() + */ + setup_clear_cpu_cap(X86_FEATURE_AVX); + pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); + } else { + gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; + } + goto out; + } + + /* Microcode has mitigation, use it */ + if (gds_mitigation == GDS_MITIGATION_FORCE) + gds_mitigation = GDS_MITIGATION_FULL; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + if (mcu_ctrl & GDS_MITG_LOCKED) { + if (gds_mitigation == GDS_MITIGATION_OFF) + pr_warn("Mitigation locked. Disable failed.\n"); + + /* + * The mitigation is selected from the boot CPU. All other CPUs + * _should_ have the same state. If the boot CPU isn't locked + * but others are then update_gds_msr() will WARN() of the state + * mismatch. If the boot CPU is locked update_gds_msr() will + * ensure the other CPUs have the mitigation enabled. + */ + gds_mitigation = GDS_MITIGATION_FULL_LOCKED; + } + + update_gds_msr(); +out: + pr_info("%s\n", gds_strings[gds_mitigation]); +} + +static int __init gds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return 0; + + if (!strcmp(str, "off")) + gds_mitigation = GDS_MITIGATION_OFF; + else if (!strcmp(str, "force")) + gds_mitigation = GDS_MITIGATION_FORCE; + + return 0; +} +early_param("gather_data_sampling", gds_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V1 : " fmt + +enum spectre_v1_mitigation { + SPECTRE_V1_MITIGATION_NONE, + SPECTRE_V1_MITIGATION_AUTO, +}; + +static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init = + SPECTRE_V1_MITIGATION_AUTO; + +static const char * const spectre_v1_strings[] = { + [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers", + [SPECTRE_V1_MITIGATION_AUTO] = "Mitigation: usercopy/swapgs barriers and __user pointer sanitization", +}; + +/* + * Does SMAP provide full mitigation against speculative kernel access to + * userspace? + */ +static bool smap_works_speculatively(void) +{ + if (!boot_cpu_has(X86_FEATURE_SMAP)) + return false; + + /* + * On CPUs which are vulnerable to Meltdown, SMAP does not + * prevent speculative access to user data in the L1 cache. + * Consider SMAP to be non-functional as a mitigation on these + * CPUs. + */ + if (boot_cpu_has(X86_BUG_CPU_MELTDOWN)) + return false; + + return true; +} + +static void __init spectre_v1_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) { + spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; + return; + } + + if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) { + /* + * With Spectre v1, a user can speculatively control either + * path of a conditional swapgs with a user-controlled GS + * value. The mitigation is to add lfences to both code paths. + * + * If FSGSBASE is enabled, the user can put a kernel address in + * GS, in which case SMAP provides no protection. + * + * If FSGSBASE is disabled, the user can only put a user space + * address in GS. That makes an attack harder, but still + * possible if there's no SMAP protection. + */ + if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + !smap_works_speculatively()) { + /* + * Mitigation can be provided from SWAPGS itself or + * PTI as the CR3 write in the Meltdown mitigation + * is serializing. + * + * If neither is there, mitigate with an LFENCE to + * stop speculation through swapgs. + */ + if (boot_cpu_has_bug(X86_BUG_SWAPGS) && + !boot_cpu_has(X86_FEATURE_PTI)) + setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_USER); + + /* + * Enable lfences in the kernel entry (non-swapgs) + * paths, to prevent user entry from speculatively + * skipping swapgs. + */ + setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_KERNEL); + } + } + + pr_info("%s\n", spectre_v1_strings[spectre_v1_mitigation]); +} + +static int __init nospectre_v1_cmdline(char *str) +{ + spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; + return 0; +} +early_param("nospectre_v1", nospectre_v1_cmdline); + +static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = + SPECTRE_V2_NONE; + +#undef pr_fmt +#define pr_fmt(fmt) "RETBleed: " fmt + +enum retbleed_mitigation { + RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_UNRET, + RETBLEED_MITIGATION_IBPB, + RETBLEED_MITIGATION_IBRS, + RETBLEED_MITIGATION_EIBRS, +}; + +enum retbleed_mitigation_cmd { + RETBLEED_CMD_OFF, + RETBLEED_CMD_AUTO, + RETBLEED_CMD_UNRET, + RETBLEED_CMD_IBPB, +}; + +const char * const retbleed_strings[] = { + [RETBLEED_MITIGATION_NONE] = "Vulnerable", + [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", + [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB", + [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", + [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", +}; + +static enum retbleed_mitigation retbleed_mitigation __ro_after_init = + RETBLEED_MITIGATION_NONE; +static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = + RETBLEED_CMD_AUTO; + +static int __ro_after_init retbleed_nosmt = false; + +static int __init retbleed_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + while (str) { + char *next = strchr(str, ','); + if (next) { + *next = 0; + next++; + } + + if (!strcmp(str, "off")) { + retbleed_cmd = RETBLEED_CMD_OFF; + } else if (!strcmp(str, "auto")) { + retbleed_cmd = RETBLEED_CMD_AUTO; + } else if (!strcmp(str, "unret")) { + retbleed_cmd = RETBLEED_CMD_UNRET; + } else if (!strcmp(str, "ibpb")) { + retbleed_cmd = RETBLEED_CMD_IBPB; + } else if (!strcmp(str, "nosmt")) { + retbleed_nosmt = true; + } else { + pr_err("Ignoring unknown retbleed option (%s).", str); + } + + str = next; + } + + return 0; +} +early_param("retbleed", retbleed_parse_cmdline); + +#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n" +#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n" + +static void __init retbleed_select_mitigation(void) +{ + bool mitigate_smt = false; + + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + return; + + switch (retbleed_cmd) { + case RETBLEED_CMD_OFF: + return; + + case RETBLEED_CMD_UNRET: + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + } else { + pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n"); + goto do_cmd_auto; + } + break; + + case RETBLEED_CMD_IBPB: + if (!boot_cpu_has(X86_FEATURE_IBPB)) { + pr_err("WARNING: CPU does not support IBPB.\n"); + goto do_cmd_auto; + } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } else { + pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + goto do_cmd_auto; + } + break; + +do_cmd_auto: + case RETBLEED_CMD_AUTO: + default: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB)) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } + + /* + * The Intel mitigation (IBRS or eIBRS) was already selected in + * spectre_v2_select_mitigation(). 'retbleed_mitigation' will + * be set accordingly below. + */ + + break; + } + + switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_UNRET: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (IS_ENABLED(CONFIG_RETHUNK)) + x86_return_thunk = retbleed_return_thunk; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + pr_err(RETBLEED_UNTRAIN_MSG); + + mitigate_smt = true; + break; + + case RETBLEED_MITIGATION_IBPB: + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + mitigate_smt = true; + break; + + default: + break; + } + + if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && + (retbleed_nosmt || cpu_mitigations_auto_nosmt())) + cpu_smt_disable(false); + + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option. + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + pr_err(RETBLEED_INTEL_MSG); + } + } + + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt + +static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = + SPECTRE_V2_USER_NONE; +static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = + SPECTRE_V2_USER_NONE; + +#ifdef CONFIG_RETPOLINE +static bool spectre_v2_bad_module; + +bool retpoline_module_ok(bool has_retpoline) +{ + if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) + return true; + + pr_err("System may be vulnerable to spectre v2\n"); + spectre_v2_bad_module = true; + return false; +} + +static inline const char *spectre_v2_module_string(void) +{ + return spectre_v2_bad_module ? " - vulnerable module loaded" : ""; +} +#else +static inline const char *spectre_v2_module_string(void) { return ""; } +#endif + +#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" +#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n" + +#ifdef CONFIG_BPF_SYSCALL +void unpriv_ebpf_notify(int new_state) +{ + if (new_state) + return; + + /* Unprivileged eBPF is enabled */ + + switch (spectre_v2_enabled) { + case SPECTRE_V2_EIBRS: + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + break; + case SPECTRE_V2_EIBRS_LFENCE: + if (sched_smt_active()) + pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + break; + default: + break; + } +} +#endif + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt); + + return len == arglen && !strncmp(arg, opt, len); +} + +/* The kernel command line selection for spectre v2 */ +enum spectre_v2_mitigation_cmd { + SPECTRE_V2_CMD_NONE, + SPECTRE_V2_CMD_AUTO, + SPECTRE_V2_CMD_FORCE, + SPECTRE_V2_CMD_RETPOLINE, + SPECTRE_V2_CMD_RETPOLINE_GENERIC, + SPECTRE_V2_CMD_RETPOLINE_LFENCE, + SPECTRE_V2_CMD_EIBRS, + SPECTRE_V2_CMD_EIBRS_RETPOLINE, + SPECTRE_V2_CMD_EIBRS_LFENCE, + SPECTRE_V2_CMD_IBRS, +}; + +enum spectre_v2_user_cmd { + SPECTRE_V2_USER_CMD_NONE, + SPECTRE_V2_USER_CMD_AUTO, + SPECTRE_V2_USER_CMD_FORCE, + SPECTRE_V2_USER_CMD_PRCTL, + SPECTRE_V2_USER_CMD_PRCTL_IBPB, + SPECTRE_V2_USER_CMD_SECCOMP, + SPECTRE_V2_USER_CMD_SECCOMP_IBPB, +}; + +static const char * const spectre_v2_user_strings[] = { + [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", + [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", + [SPECTRE_V2_USER_STRICT_PREFERRED] = "User space: Mitigation: STIBP always-on protection", + [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl", + [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", +}; + +static const struct { + const char *option; + enum spectre_v2_user_cmd cmd; + bool secure; +} v2_user_options[] __initconst = { + { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, + { "off", SPECTRE_V2_USER_CMD_NONE, false }, + { "on", SPECTRE_V2_USER_CMD_FORCE, true }, + { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, + { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, + { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, + { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, +}; + +static void __init spec_v2_user_print_cond(const char *reason, bool secure) +{ + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) + pr_info("spectre_v2_user=%s forced on command line.\n", reason); +} + +static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; + +static enum spectre_v2_user_cmd __init +spectre_v2_parse_user_cmdline(void) +{ + char arg[20]; + int ret, i; + + switch (spectre_v2_cmd) { + case SPECTRE_V2_CMD_NONE: + return SPECTRE_V2_USER_CMD_NONE; + case SPECTRE_V2_CMD_FORCE: + return SPECTRE_V2_USER_CMD_FORCE; + default: + break; + } + + ret = cmdline_find_option(boot_command_line, "spectre_v2_user", + arg, sizeof(arg)); + if (ret < 0) + return SPECTRE_V2_USER_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { + if (match_option(arg, ret, v2_user_options[i].option)) { + spec_v2_user_print_cond(v2_user_options[i].option, + v2_user_options[i].secure); + return v2_user_options[i].cmd; + } + } + + pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg); + return SPECTRE_V2_USER_CMD_AUTO; +} + +static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) +{ + return mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE; +} + +static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) +{ + return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; +} + +static void __init +spectre_v2_user_select_mitigation(void) +{ + enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; + bool smt_possible = IS_ENABLED(CONFIG_SMP); + enum spectre_v2_user_cmd cmd; + + if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) + return; + + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || + cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + smt_possible = false; + + cmd = spectre_v2_parse_user_cmdline(); + switch (cmd) { + case SPECTRE_V2_USER_CMD_NONE: + goto set_mode; + case SPECTRE_V2_USER_CMD_FORCE: + mode = SPECTRE_V2_USER_STRICT; + break; + case SPECTRE_V2_USER_CMD_PRCTL: + case SPECTRE_V2_USER_CMD_PRCTL_IBPB: + mode = SPECTRE_V2_USER_PRCTL; + break; + case SPECTRE_V2_USER_CMD_AUTO: + case SPECTRE_V2_USER_CMD_SECCOMP: + case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: + if (IS_ENABLED(CONFIG_SECCOMP)) + mode = SPECTRE_V2_USER_SECCOMP; + else + mode = SPECTRE_V2_USER_PRCTL; + break; + } + + /* Initialize Indirect Branch Prediction Barrier */ + if (boot_cpu_has(X86_FEATURE_IBPB)) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB); + + spectre_v2_user_ibpb = mode; + switch (cmd) { + case SPECTRE_V2_USER_CMD_FORCE: + case SPECTRE_V2_USER_CMD_PRCTL_IBPB: + case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: + static_branch_enable(&switch_mm_always_ibpb); + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + break; + case SPECTRE_V2_USER_CMD_PRCTL: + case SPECTRE_V2_USER_CMD_AUTO: + case SPECTRE_V2_USER_CMD_SECCOMP: + static_branch_enable(&switch_mm_cond_ibpb); + break; + default: + break; + } + + pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", + static_key_enabled(&switch_mm_always_ibpb) ? + "always-on" : "conditional"); + } + + /* + * If no STIBP, enhanced IBRS is enabled, or SMT impossible, STIBP + * is not required. + * + * Enhanced IBRS also protects against cross-thread branch target + * injection in user-mode as the IBRS bit remains always set which + * implicitly enables cross-thread protections. However, in legacy IBRS + * mode, the IBRS bit is set only on kernel entry and cleared on return + * to userspace. This disables the implicit cross-thread protection, + * so allow for STIBP to be selected in that case. + */ + if (!boot_cpu_has(X86_FEATURE_STIBP) || + !smt_possible || + spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + return; + + /* + * At this point, an STIBP mode other than "off" has been set. + * If STIBP support is not being forced, check if STIBP always-on + * is preferred. + */ + if (mode != SPECTRE_V2_USER_STRICT && + boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) + mode = SPECTRE_V2_USER_STRICT_PREFERRED; + + if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { + if (mode != SPECTRE_V2_USER_STRICT && + mode != SPECTRE_V2_USER_STRICT_PREFERRED) + pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); + mode = SPECTRE_V2_USER_STRICT_PREFERRED; + } + + spectre_v2_user_stibp = mode; + +set_mode: + pr_info("%s\n", spectre_v2_user_strings[mode]); +} + +static const char * const spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", + [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", + [SPECTRE_V2_IBRS] = "Mitigation: IBRS", +}; + +static const struct { + const char *option; + enum spectre_v2_mitigation_cmd cmd; + bool secure; +} mitigation_options[] __initconst = { + { "off", SPECTRE_V2_CMD_NONE, false }, + { "on", SPECTRE_V2_CMD_FORCE, true }, + { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, + { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, + { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, + { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, + { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, + { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, + { "auto", SPECTRE_V2_CMD_AUTO, false }, + { "ibrs", SPECTRE_V2_CMD_IBRS, false }, +}; + +static void __init spec_v2_print_cond(const char *reason, bool secure) +{ + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) + pr_info("%s selected on command line.\n", reason); +} + +static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +{ + enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; + char arg[20]; + int ret, i; + + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || + cpu_mitigations_off()) + return SPECTRE_V2_CMD_NONE; + + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); + if (ret < 0) + return SPECTRE_V2_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { + if (!match_option(arg, ret, mitigation_options[i].option)) + continue; + cmd = mitigation_options[i].cmd; + break; + } + + if (i >= ARRAY_SIZE(mitigation_options)) { + pr_err("unknown option (%s). Switching to AUTO select\n", arg); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_RETPOLINE || + cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_EIBRS || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) { + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) { + pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + spec_v2_print_cond(mitigation_options[i].option, + mitigation_options[i].secure); + return cmd; +} + +static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) +{ + if (!IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("Kernel not compiled with retpoline; no mitigation available!"); + return SPECTRE_V2_NONE; + } + + return SPECTRE_V2_RETPOLINE; +} + +/* Disable in-kernel use of non-RSB RET predictors */ +static void __init spec_ctrl_disable_kernel_rrsba(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + if (ia32_cap & ARCH_CAP_RRSBA) { + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + } +} + +static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +{ + /* + * Similar to context switches, there are two types of RSB attacks + * after VM exit: + * + * 1) RSB underflow + * + * 2) Poisoned RSB entry + * + * When retpoline is enabled, both are mitigated by filling/clearing + * the RSB. + * + * When IBRS is enabled, while #1 would be mitigated by the IBRS branch + * prediction isolation protections, RSB still needs to be cleared + * because of #2. Note that SMEP provides no protection here, unlike + * user-space-poisoned RSB entries. + * + * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB + * bug is present then a LITE version of RSB protection is required, + * just a single call needs to retire before a RET is executed. + */ + switch (mode) { + case SPECTRE_V2_NONE: + return; + + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS: + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); + pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + } + return; + + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); + pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); + return; + } + + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); + dump_stack(); +} + +static void __init spectre_v2_select_mitigation(void) +{ + enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); + enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + + /* + * If the CPU is not affected and the command line mode is NONE or AUTO + * then nothing to do. + */ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + return; + + switch (cmd) { + case SPECTRE_V2_CMD_NONE: + return; + + case SPECTRE_V2_CMD_FORCE: + case SPECTRE_V2_CMD_AUTO: + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + mode = SPECTRE_V2_EIBRS; + break; + } + + if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && + boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_cmd != RETBLEED_CMD_OFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + mode = SPECTRE_V2_IBRS; + break; + } + + mode = spectre_v2_select_retpoline(); + break; + + case SPECTRE_V2_CMD_RETPOLINE_LFENCE: + pr_err(SPECTRE_V2_LFENCE_MSG); + mode = SPECTRE_V2_LFENCE; + break; + + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: + mode = SPECTRE_V2_RETPOLINE; + break; + + case SPECTRE_V2_CMD_RETPOLINE: + mode = spectre_v2_select_retpoline(); + break; + + case SPECTRE_V2_CMD_IBRS: + mode = SPECTRE_V2_IBRS; + break; + + case SPECTRE_V2_CMD_EIBRS: + mode = SPECTRE_V2_EIBRS; + break; + + case SPECTRE_V2_CMD_EIBRS_LFENCE: + mode = SPECTRE_V2_EIBRS_LFENCE; + break; + + case SPECTRE_V2_CMD_EIBRS_RETPOLINE: + mode = SPECTRE_V2_EIBRS_RETPOLINE; + break; + } + + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + + if (spectre_v2_in_ibrs_mode(mode)) { + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + update_spec_ctrl(x86_spec_ctrl_base); + } + + switch (mode) { + case SPECTRE_V2_NONE: + case SPECTRE_V2_EIBRS: + break; + + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS); + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) + pr_warn(SPECTRE_V2_IBRS_PERF_MSG); + break; + + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); + fallthrough; + + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_EIBRS_RETPOLINE: + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + break; + } + + /* + * Disable alternate RSB predictions in kernel when indirect CALLs and + * JMPs gets protection against BHI and Intramode-BTI, but RET + * prediction from a non-RSB predictor is still a risk. + */ + if (mode == SPECTRE_V2_EIBRS_LFENCE || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_RETPOLINE) + spec_ctrl_disable_kernel_rrsba(); + + spectre_v2_enabled = mode; + pr_info("%s\n", spectre_v2_strings[mode]); + + /* + * If Spectre v2 protection has been enabled, fill the RSB during a + * context switch. In general there are two types of RSB attacks + * across context switches, for which the CALLs/RETs may be unbalanced. + * + * 1) RSB underflow + * + * Some Intel parts have "bottomless RSB". When the RSB is empty, + * speculated return targets may come from the branch predictor, + * which could have a user-poisoned BTB or BHB entry. + * + * AMD has it even worse: *all* returns are speculated from the BTB, + * regardless of the state of the RSB. + * + * When IBRS or eIBRS is enabled, the "user -> kernel" attack + * scenario is mitigated by the IBRS branch prediction isolation + * properties, so the RSB buffer filling wouldn't be necessary to + * protect against this type of attack. + * + * The "user -> user" attack scenario is mitigated by RSB filling. + * + * 2) Poisoned RSB entry + * + * If the 'next' in-kernel return stack is shorter than 'prev', + * 'next' could be tricked into speculating with a user-poisoned RSB + * entry. + * + * The "user -> kernel" attack scenario is mitigated by SMEP and + * eIBRS. + * + * The "user -> user" scenario, also known as SpectreBHB, requires + * RSB clearing. + * + * So to mitigate all cases, unconditionally fill RSB on context + * switches. + * + * FIXME: Is this pointless for retbleed-affected AMD? + */ + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); + + spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + + /* + * Retpoline protects the kernel, but doesn't protect firmware. IBRS + * and Enhanced IBRS protect firmware too, so enable IBRS around + * firmware calls only when IBRS / Enhanced IBRS aren't otherwise + * enabled. + * + * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because + * the user might select retpoline on the kernel command line and if + * the CPU supports Enhanced IBRS, kernel might un-intentionally not + * enable IBRS around firmware calls. + */ + if (boot_cpu_has_bug(X86_BUG_RETBLEED) && + boot_cpu_has(X86_FEATURE_IBPB) && + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { + + if (retbleed_cmd != RETBLEED_CMD_IBPB) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); + pr_info("Enabling Speculation Barrier for firmware calls\n"); + } + + } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { + setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); + pr_info("Enabling Restricted Speculation for firmware calls\n"); + } + + /* Set up IBPB and STIBP depending on the general spectre V2 command */ + spectre_v2_cmd = cmd; +} + +static void update_stibp_msr(void * __unused) +{ + u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); + update_spec_ctrl(val); +} + +/* Update x86_spec_ctrl_base in case SMT state changed. */ +static void update_stibp_strict(void) +{ + u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; + + if (sched_smt_active()) + mask |= SPEC_CTRL_STIBP; + + if (mask == x86_spec_ctrl_base) + return; + + pr_info("Update user space SMT mitigation: STIBP %s\n", + mask & SPEC_CTRL_STIBP ? "always-on" : "off"); + x86_spec_ctrl_base = mask; + on_each_cpu(update_stibp_msr, NULL, 1); +} + +/* Update the static key controlling the evaluation of TIF_SPEC_IB */ +static void update_indir_branch_cond(void) +{ + if (sched_smt_active()) + static_branch_enable(&switch_to_cond_stibp); + else + static_branch_disable(&switch_to_cond_stibp); +} + +#undef pr_fmt +#define pr_fmt(fmt) fmt + +/* Update the static key controlling the MDS CPU buffer clear in idle */ +static void update_mds_branch_idle(void) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* + * Enable the idle clearing if SMT is active on CPUs which are + * affected only by MSBDS and not any other MDS variant. + * + * The other variants cannot be mitigated when SMT is enabled, so + * clearing the buffers on idle just to prevent the Store Buffer + * repartitioning leak would be a window dressing exercise. + */ + if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) + return; + + if (sched_smt_active()) { + static_branch_enable(&mds_idle_clear); + } else if (mmio_mitigation == MMIO_MITIGATION_OFF || + (ia32_cap & ARCH_CAP_FBSDP_NO)) { + static_branch_disable(&mds_idle_clear); + } +} + +#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" +#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" + +void cpu_bugs_smt_update(void) +{ + mutex_lock(&spec_ctrl_mutex); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + + switch (spectre_v2_user_stibp) { + case SPECTRE_V2_USER_NONE: + break; + case SPECTRE_V2_USER_STRICT: + case SPECTRE_V2_USER_STRICT_PREFERRED: + update_stibp_strict(); + break; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + update_indir_branch_cond(); + break; + } + + switch (mds_mitigation) { + case MDS_MITIGATION_FULL: + case MDS_MITIGATION_VMWERV: + if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + pr_warn_once(MDS_MSG_SMT); + update_mds_branch_idle(); + break; + case MDS_MITIGATION_OFF: + break; + } + + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + + switch (mmio_mitigation) { + case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(MMIO_MSG_SMT); + break; + case MMIO_MITIGATION_OFF: + break; + } + + mutex_unlock(&spec_ctrl_mutex); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Speculative Store Bypass: " fmt + +static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; + +/* The kernel command line selection */ +enum ssb_mitigation_cmd { + SPEC_STORE_BYPASS_CMD_NONE, + SPEC_STORE_BYPASS_CMD_AUTO, + SPEC_STORE_BYPASS_CMD_ON, + SPEC_STORE_BYPASS_CMD_PRCTL, + SPEC_STORE_BYPASS_CMD_SECCOMP, +}; + +static const char * const ssb_strings[] = { + [SPEC_STORE_BYPASS_NONE] = "Vulnerable", + [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled", + [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl", + [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", +}; + +static const struct { + const char *option; + enum ssb_mitigation_cmd cmd; +} ssb_mitigation_options[] __initconst = { + { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ + { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ + { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ + { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ + { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ +}; + +static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) +{ + enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO; + char arg[20]; + int ret, i; + + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || + cpu_mitigations_off()) { + return SPEC_STORE_BYPASS_CMD_NONE; + } else { + ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", + arg, sizeof(arg)); + if (ret < 0) + return SPEC_STORE_BYPASS_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { + if (!match_option(arg, ret, ssb_mitigation_options[i].option)) + continue; + + cmd = ssb_mitigation_options[i].cmd; + break; + } + + if (i >= ARRAY_SIZE(ssb_mitigation_options)) { + pr_err("unknown option (%s). Switching to AUTO select\n", arg); + return SPEC_STORE_BYPASS_CMD_AUTO; + } + } + + return cmd; +} + +static enum ssb_mitigation __init __ssb_select_mitigation(void) +{ + enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; + enum ssb_mitigation_cmd cmd; + + if (!boot_cpu_has(X86_FEATURE_SSBD)) + return mode; + + cmd = ssb_parse_cmdline(); + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && + (cmd == SPEC_STORE_BYPASS_CMD_NONE || + cmd == SPEC_STORE_BYPASS_CMD_AUTO)) + return mode; + + switch (cmd) { + case SPEC_STORE_BYPASS_CMD_AUTO: + case SPEC_STORE_BYPASS_CMD_SECCOMP: + /* + * Choose prctl+seccomp as the default mode if seccomp is + * enabled. + */ + if (IS_ENABLED(CONFIG_SECCOMP)) + mode = SPEC_STORE_BYPASS_SECCOMP; + else + mode = SPEC_STORE_BYPASS_PRCTL; + break; + case SPEC_STORE_BYPASS_CMD_ON: + mode = SPEC_STORE_BYPASS_DISABLE; + break; + case SPEC_STORE_BYPASS_CMD_PRCTL: + mode = SPEC_STORE_BYPASS_PRCTL; + break; + case SPEC_STORE_BYPASS_CMD_NONE: + break; + } + + /* + * We have three CPU feature flags that are in play here: + * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. + * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass + * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation + */ + if (mode == SPEC_STORE_BYPASS_DISABLE) { + setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); + /* + * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may + * use a completely different MSR and bit dependent on family. + */ + if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && + !static_cpu_has(X86_FEATURE_AMD_SSBD)) { + x86_amd_ssb_disable(); + } else { + x86_spec_ctrl_base |= SPEC_CTRL_SSBD; + update_spec_ctrl(x86_spec_ctrl_base); + } + } + + return mode; +} + +static void ssb_select_mitigation(void) +{ + ssb_mode = __ssb_select_mitigation(); + + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + pr_info("%s\n", ssb_strings[ssb_mode]); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Speculation prctl: " fmt + +static void task_update_spec_tif(struct task_struct *tsk) +{ + /* Force the update of the real TIF bits */ + set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE); + + /* + * Immediately update the speculation control MSRs for the current + * task, but for a non-current task delay setting the CPU + * mitigation until it is scheduled next. + * + * This can only happen for SECCOMP mitigation. For PRCTL it's + * always the current task. + */ + if (tsk == current) + speculation_ctrl_update_current(); +} + +static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && + ssb_mode != SPEC_STORE_BYPASS_SECCOMP) + return -ENXIO; + + switch (ctrl) { + case PR_SPEC_ENABLE: + /* If speculation is force disabled, enable is not allowed */ + if (task_spec_ssb_force_disable(task)) + return -EPERM; + task_clear_spec_ssb_disable(task); + task_clear_spec_ssb_noexec(task); + task_update_spec_tif(task); + break; + case PR_SPEC_DISABLE: + task_set_spec_ssb_disable(task); + task_clear_spec_ssb_noexec(task); + task_update_spec_tif(task); + break; + case PR_SPEC_FORCE_DISABLE: + task_set_spec_ssb_disable(task); + task_set_spec_ssb_force_disable(task); + task_clear_spec_ssb_noexec(task); + task_update_spec_tif(task); + break; + case PR_SPEC_DISABLE_NOEXEC: + if (task_spec_ssb_force_disable(task)) + return -EPERM; + task_set_spec_ssb_disable(task); + task_set_spec_ssb_noexec(task); + task_update_spec_tif(task); + break; + default: + return -ERANGE; + } + return 0; +} + +static bool is_spec_ib_user_controlled(void) +{ + return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP; +} + +static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + switch (ctrl) { + case PR_SPEC_ENABLE: + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) + return 0; + + /* + * With strict mode for both IBPB and STIBP, the instruction + * code paths avoid checking this task flag and instead, + * unconditionally run the instruction. However, STIBP and IBPB + * are independent and either can be set to conditionally + * enabled regardless of the mode of the other. + * + * If either is set to conditional, allow the task flag to be + * updated, unless it was force-disabled by a previous prctl + * call. Currently, this is possible on an AMD CPU which has the + * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the + * kernel is booted with 'spectre_v2_user=seccomp', then + * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and + * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED. + */ + if (!is_spec_ib_user_controlled() || + task_spec_ib_force_disable(task)) + return -EPERM; + + task_clear_spec_ib_disable(task); + task_update_spec_tif(task); + break; + case PR_SPEC_DISABLE: + case PR_SPEC_FORCE_DISABLE: + /* + * Indirect branch speculation is always allowed when + * mitigation is force disabled. + */ + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) + return -EPERM; + + if (!is_spec_ib_user_controlled()) + return 0; + + task_set_spec_ib_disable(task); + if (ctrl == PR_SPEC_FORCE_DISABLE) + task_set_spec_ib_force_disable(task); + task_update_spec_tif(task); + if (task == current) + indirect_branch_prediction_barrier(); + break; + default: + return -ERANGE; + } + return 0; +} + +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_set(task, ctrl); + case PR_SPEC_INDIRECT_BRANCH: + return ib_prctl_set(task, ctrl); + default: + return -ENODEV; + } +} + +#ifdef CONFIG_SECCOMP +void arch_seccomp_spec_mitigate(struct task_struct *task) +{ + if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) + ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) + ib_prctl_set(task, PR_SPEC_FORCE_DISABLE); +} +#endif + +static int ssb_prctl_get(struct task_struct *task) +{ + switch (ssb_mode) { + case SPEC_STORE_BYPASS_DISABLE: + return PR_SPEC_DISABLE; + case SPEC_STORE_BYPASS_SECCOMP: + case SPEC_STORE_BYPASS_PRCTL: + if (task_spec_ssb_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ssb_noexec(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC; + if (task_spec_ssb_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + default: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + return PR_SPEC_ENABLE; + return PR_SPEC_NOT_AFFECTED; + } +} + +static int ib_prctl_get(struct task_struct *task) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return PR_SPEC_NOT_AFFECTED; + + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) + return PR_SPEC_ENABLE; + else if (is_spec_ib_user_controlled()) { + if (task_spec_ib_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ib_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) + return PR_SPEC_DISABLE; + else + return PR_SPEC_NOT_AFFECTED; +} + +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_get(task); + case PR_SPEC_INDIRECT_BRANCH: + return ib_prctl_get(task); + default: + return -ENODEV; + } +} + +void x86_spec_ctrl_setup_ap(void) +{ + if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + update_spec_ctrl(x86_spec_ctrl_base); + + if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) + x86_amd_ssb_disable(); +} + +bool itlb_multihit_kvm_mitigation; +EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); + +#undef pr_fmt +#define pr_fmt(fmt) "L1TF: " fmt + +/* Default mitigation for L1TF-affected CPUs */ +enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH; +#if IS_ENABLED(CONFIG_KVM_INTEL) +EXPORT_SYMBOL_GPL(l1tf_mitigation); +#endif +enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation); + +/* + * These CPUs all support 44bits physical address space internally in the + * cache but CPUID can report a smaller number of physical address bits. + * + * The L1TF mitigation uses the top most address bit for the inversion of + * non present PTEs. When the installed memory reaches into the top most + * address bit due to memory holes, which has been observed on machines + * which report 36bits physical address bits and have 32G RAM installed, + * then the mitigation range check in l1tf_select_mitigation() triggers. + * This is a false positive because the mitigation is still possible due to + * the fact that the cache uses 44bit internally. Use the cache bits + * instead of the reported physical bits and adjust them on the affected + * machines to 44bit if the reported bits are less than 44. + */ +static void override_cache_bits(struct cpuinfo_x86 *c) +{ + if (c->x86 != 6) + return; + + switch (c->x86_model) { + case INTEL_FAM6_NEHALEM: + case INTEL_FAM6_WESTMERE: + case INTEL_FAM6_SANDYBRIDGE: + case INTEL_FAM6_IVYBRIDGE: + case INTEL_FAM6_HASWELL: + case INTEL_FAM6_HASWELL_L: + case INTEL_FAM6_HASWELL_G: + case INTEL_FAM6_BROADWELL: + case INTEL_FAM6_BROADWELL_G: + case INTEL_FAM6_SKYLAKE_L: + case INTEL_FAM6_SKYLAKE: + case INTEL_FAM6_KABYLAKE_L: + case INTEL_FAM6_KABYLAKE: + if (c->x86_cache_bits < 44) + c->x86_cache_bits = 44; + break; + } +} + +static void __init l1tf_select_mitigation(void) +{ + u64 half_pa; + + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return; + + if (cpu_mitigations_off()) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + + override_cache_bits(&boot_cpu_data); + + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + case L1TF_MITIGATION_FLUSH_NOWARN: + case L1TF_MITIGATION_FLUSH: + break; + case L1TF_MITIGATION_FLUSH_NOSMT: + case L1TF_MITIGATION_FULL: + cpu_smt_disable(false); + break; + case L1TF_MITIGATION_FULL_FORCE: + cpu_smt_disable(true); + break; + } + +#if CONFIG_PGTABLE_LEVELS == 2 + pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); + return; +#endif + + half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; + if (l1tf_mitigation != L1TF_MITIGATION_OFF && + e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) { + pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); + pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n", + half_pa); + pr_info("However, doing so will make a part of your RAM unusable.\n"); + pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n"); + return; + } + + setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV); +} + +static int __init l1tf_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (!strcmp(str, "flush,nowarn")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN; + else if (!strcmp(str, "flush")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH; + else if (!strcmp(str, "flush,nosmt")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + else if (!strcmp(str, "full")) + l1tf_mitigation = L1TF_MITIGATION_FULL; + else if (!strcmp(str, "full,force")) + l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE; + + return 0; +} +early_param("l1tf", l1tf_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt + +enum srso_mitigation { + SRSO_MITIGATION_NONE, + SRSO_MITIGATION_MICROCODE, + SRSO_MITIGATION_SAFE_RET, + SRSO_MITIGATION_IBPB, + SRSO_MITIGATION_IBPB_ON_VMEXIT, +}; + +enum srso_mitigation_cmd { + SRSO_CMD_OFF, + SRSO_CMD_MICROCODE, + SRSO_CMD_SAFE_RET, + SRSO_CMD_IBPB, + SRSO_CMD_IBPB_ON_VMEXIT, +}; + +static const char * const srso_strings[] = { + [SRSO_MITIGATION_NONE] = "Vulnerable", + [SRSO_MITIGATION_MICROCODE] = "Mitigation: microcode", + [SRSO_MITIGATION_SAFE_RET] = "Mitigation: safe RET", + [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", + [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" +}; + +static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; +static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET; + +static int __init srso_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + srso_cmd = SRSO_CMD_OFF; + else if (!strcmp(str, "microcode")) + srso_cmd = SRSO_CMD_MICROCODE; + else if (!strcmp(str, "safe-ret")) + srso_cmd = SRSO_CMD_SAFE_RET; + else if (!strcmp(str, "ibpb")) + srso_cmd = SRSO_CMD_IBPB; + else if (!strcmp(str, "ibpb-vmexit")) + srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT; + else + pr_err("Ignoring unknown SRSO option (%s).", str); + + return 0; +} +early_param("spec_rstack_overflow", srso_parse_cmdline); + +#define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options." + +static void __init srso_select_mitigation(void) +{ + bool has_microcode; + + if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) + goto pred_cmd; + + /* + * The first check is for the kernel running as a guest in order + * for guests to verify whether IBPB is a viable mitigation. + */ + has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) || cpu_has_ibpb_brtype_microcode(); + if (!has_microcode) { + pr_warn("IBPB-extending microcode not applied!\n"); + pr_warn(SRSO_NOTICE); + } else { + /* + * Enable the synthetic (even if in a real CPUID leaf) + * flags for guests. + */ + setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); + + /* + * Zen1/2 with SMT off aren't vulnerable after the right + * IBPB microcode has been applied. + */ + if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { + setup_force_cpu_cap(X86_FEATURE_SRSO_NO); + return; + } + } + + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { + if (has_microcode) { + pr_err("Retbleed IBPB mitigation enabled, using same for SRSO\n"); + srso_mitigation = SRSO_MITIGATION_IBPB; + goto pred_cmd; + } + } + + switch (srso_cmd) { + case SRSO_CMD_OFF: + goto pred_cmd; + + case SRSO_CMD_MICROCODE: + if (has_microcode) { + srso_mitigation = SRSO_MITIGATION_MICROCODE; + pr_warn(SRSO_NOTICE); + } + break; + + case SRSO_CMD_SAFE_RET: + if (IS_ENABLED(CONFIG_CPU_SRSO)) { + /* + * Enable the return thunk for generated code + * like ftrace, static_call, etc. + */ + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (boot_cpu_data.x86 == 0x19) { + setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); + x86_return_thunk = srso_alias_return_thunk; + } else { + setup_force_cpu_cap(X86_FEATURE_SRSO); + x86_return_thunk = srso_return_thunk; + } + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + } else { + pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + goto pred_cmd; + } + break; + + case SRSO_CMD_IBPB: + if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + if (has_microcode) { + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + srso_mitigation = SRSO_MITIGATION_IBPB; + } + } else { + pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + goto pred_cmd; + } + break; + + case SRSO_CMD_IBPB_ON_VMEXIT: + if (IS_ENABLED(CONFIG_CPU_SRSO)) { + if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; + } + } else { + pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + goto pred_cmd; + } + break; + + default: + break; + } + + pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode")); + +pred_cmd: + if ((!boot_cpu_has_bug(X86_BUG_SRSO) || srso_cmd == SRSO_CMD_OFF) && + boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; +} + +#undef pr_fmt +#define pr_fmt(fmt) fmt + +#ifdef CONFIG_SYSFS + +#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" + +#if IS_ENABLED(CONFIG_KVM_INTEL) +static const char * const l1tf_vmx_states[] = { + [VMENTER_L1D_FLUSH_AUTO] = "auto", + [VMENTER_L1D_FLUSH_NEVER] = "vulnerable", + [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes", + [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes", + [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled", + [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary" +}; + +static ssize_t l1tf_show_state(char *buf) +{ + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); + + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || + (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && + sched_smt_active())) { + return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation]); + } + + return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + +static ssize_t itlb_multihit_show_state(char *buf) +{ + if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || + !boot_cpu_has(X86_FEATURE_VMX)) + return sprintf(buf, "KVM: Mitigation: VMX unsupported\n"); + else if (!(cr4_read_shadow() & X86_CR4_VMXE)) + return sprintf(buf, "KVM: Mitigation: VMX disabled\n"); + else if (itlb_multihit_kvm_mitigation) + return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); + else + return sprintf(buf, "KVM: Vulnerable\n"); +} +#else +static ssize_t l1tf_show_state(char *buf) +{ + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); +} + +static ssize_t itlb_multihit_show_state(char *buf) +{ + return sprintf(buf, "Processor vulnerable\n"); +} +#endif + +static ssize_t mds_show_state(char *buf) +{ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sprintf(buf, "%s; SMT Host state unknown\n", + mds_strings[mds_mitigation]); + } + + if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : + sched_smt_active() ? "mitigated" : "disabled")); + } + + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + +static ssize_t tsx_async_abort_show_state(char *buf) +{ + if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || + (taa_mitigation == TAA_MITIGATION_OFF)) + return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sprintf(buf, "%s; SMT Host state unknown\n", + taa_strings[taa_mitigation]); + } + + return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + +static ssize_t mmio_stale_data_show_state(char *buf) +{ + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return sysfs_emit(buf, "Unknown: No mitigations\n"); + + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + mmio_strings[mmio_mitigation]); + } + + return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + +static char *stibp_state(void) +{ + if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) + return ""; + + switch (spectre_v2_user_stibp) { + case SPECTRE_V2_USER_NONE: + return ", STIBP: disabled"; + case SPECTRE_V2_USER_STRICT: + return ", STIBP: forced"; + case SPECTRE_V2_USER_STRICT_PREFERRED: + return ", STIBP: always-on"; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + if (static_key_enabled(&switch_to_cond_stibp)) + return ", STIBP: conditional"; + } + return ""; +} + +static char *ibpb_state(void) +{ + if (boot_cpu_has(X86_FEATURE_IBPB)) { + if (static_key_enabled(&switch_mm_always_ibpb)) + return ", IBPB: always-on"; + if (static_key_enabled(&switch_mm_cond_ibpb)) + return ", IBPB: conditional"; + return ", IBPB: disabled"; + } + return ""; +} + +static char *pbrsb_eibrs_state(void) +{ + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || + boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) + return ", PBRSB-eIBRS: SW sequence"; + else + return ", PBRSB-eIBRS: Vulnerable"; + } else { + return ", PBRSB-eIBRS: Not affected"; + } +} + +static ssize_t spectre_v2_show_state(char *buf) +{ + if (spectre_v2_enabled == SPECTRE_V2_LFENCE) + return sprintf(buf, "Vulnerable: LFENCE\n"); + + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); + + return sprintf(buf, "%s%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + pbrsb_eibrs_state(), + spectre_v2_module_string()); +} + +static ssize_t srbds_show_state(char *buf) +{ + return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); +} + +static ssize_t retbleed_show_state(char *buf) +{ + if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return sprintf(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n"); + + return sprintf(buf, "%s; SMT %s\n", + retbleed_strings[retbleed_mitigation], + !sched_smt_active() ? "disabled" : + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ? + "enabled with STIBP protection" : "vulnerable"); + } + + return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]); +} + +static ssize_t gds_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]); +} + +static ssize_t srso_show_state(char *buf) +{ + if (boot_cpu_has(X86_FEATURE_SRSO_NO)) + return sysfs_emit(buf, "Mitigation: SMT disabled\n"); + + return sysfs_emit(buf, "%s%s\n", + srso_strings[srso_mitigation], + boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) ? "" : ", no microcode"); +} + +static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, + char *buf, unsigned int bug) +{ + if (!boot_cpu_has_bug(bug)) + return sprintf(buf, "Not affected\n"); + + switch (bug) { + case X86_BUG_CPU_MELTDOWN: + if (boot_cpu_has(X86_FEATURE_PTI)) + return sprintf(buf, "Mitigation: PTI\n"); + + if (hypervisor_is_type(X86_HYPER_XEN_PV)) + return sprintf(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n"); + + break; + + case X86_BUG_SPECTRE_V1: + return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); + + case X86_BUG_SPECTRE_V2: + return spectre_v2_show_state(buf); + + case X86_BUG_SPEC_STORE_BYPASS: + return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + + case X86_BUG_L1TF: + if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) + return l1tf_show_state(buf); + break; + + case X86_BUG_MDS: + return mds_show_state(buf); + + case X86_BUG_TAA: + return tsx_async_abort_show_state(buf); + + case X86_BUG_ITLB_MULTIHIT: + return itlb_multihit_show_state(buf); + + case X86_BUG_SRBDS: + return srbds_show_state(buf); + + case X86_BUG_MMIO_STALE_DATA: + case X86_BUG_MMIO_UNKNOWN: + return mmio_stale_data_show_state(buf); + + case X86_BUG_RETBLEED: + return retbleed_show_state(buf); + + case X86_BUG_GDS: + return gds_show_state(buf); + + case X86_BUG_SRSO: + return srso_show_state(buf); + + default: + break; + } + + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN); +} + +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2); +} + +ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS); +} + +ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); +} + +ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_MDS); +} + +ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_TAA); +} + +ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); +} + +ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); +} + +ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); + else + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); +} + +ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); +} + +ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_GDS); +} + +ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRSO); +} +#endif diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c new file mode 100644 index 000000000..b458b0fd9 --- /dev/null +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -0,0 +1,1035 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Routines to identify caches on Intel CPU. + * + * Changes: + * Venkatesh Pallipadi : Adding cache identification through cpuid(4) + * Ashok Raj : Work with CPU hotplug infrastructure. + * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "cpu.h" + +#define LVL_1_INST 1 +#define LVL_1_DATA 2 +#define LVL_2 3 +#define LVL_3 4 +#define LVL_TRACE 5 + +struct _cache_table { + unsigned char descriptor; + char cache_type; + short size; +}; + +#define MB(x) ((x) * 1024) + +/* All the cache descriptor types we care about (no TLB or + trace cache entries) */ + +static const struct _cache_table cache_table[] = +{ + { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ + { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ + { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */ + { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ + { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ + { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ + { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ + { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ + { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ + { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ + { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */ + { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */ + { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */ + { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */ + { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ + { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ + { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ + { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */ + { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ + { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ + { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ + { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ + { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ + { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */ + { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */ + { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */ + { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */ + { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ + { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ + { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ + { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ + { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ + { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ + { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ + { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ + { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */ + { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */ + { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ + { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */ + { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ + { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */ + { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */ + { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */ + { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */ + { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */ + { 0x00, 0, 0} +}; + + +enum _cache_type { + CTYPE_NULL = 0, + CTYPE_DATA = 1, + CTYPE_INST = 2, + CTYPE_UNIFIED = 3 +}; + +union _cpuid4_leaf_eax { + struct { + enum _cache_type type:5; + unsigned int level:3; + unsigned int is_self_initializing:1; + unsigned int is_fully_associative:1; + unsigned int reserved:4; + unsigned int num_threads_sharing:12; + unsigned int num_cores_on_die:6; + } split; + u32 full; +}; + +union _cpuid4_leaf_ebx { + struct { + unsigned int coherency_line_size:12; + unsigned int physical_line_partition:10; + unsigned int ways_of_associativity:10; + } split; + u32 full; +}; + +union _cpuid4_leaf_ecx { + struct { + unsigned int number_of_sets:32; + } split; + u32 full; +}; + +struct _cpuid4_info_regs { + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + unsigned int id; + unsigned long size; + struct amd_northbridge *nb; +}; + +static unsigned short num_cache_leaves; + +/* AMD doesn't have CPUID4. Emulate it here to report the same + information to the user. This makes some assumptions about the machine: + L2 not shared, no SMT etc. that is currently true on AMD CPUs. + + In theory the TLBs could be reported as fake type (they are in "dummy"). + Maybe later */ +union l1_cache { + struct { + unsigned line_size:8; + unsigned lines_per_tag:8; + unsigned assoc:8; + unsigned size_in_kb:8; + }; + unsigned val; +}; + +union l2_cache { + struct { + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned size_in_kb:16; + }; + unsigned val; +}; + +union l3_cache { + struct { + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned res:2; + unsigned size_encoded:14; + }; + unsigned val; +}; + +static const unsigned short assocs[] = { + [1] = 1, + [2] = 2, + [4] = 4, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, + [0xc] = 64, + [0xd] = 96, + [0xe] = 128, + [0xf] = 0xffff /* fully associative - no way to show this currently */ +}; + +static const unsigned char levels[] = { 1, 1, 2, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; + +static const enum cache_type cache_type_map[] = { + [CTYPE_NULL] = CACHE_TYPE_NOCACHE, + [CTYPE_DATA] = CACHE_TYPE_DATA, + [CTYPE_INST] = CACHE_TYPE_INST, + [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, +}; + +static void +amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, + union _cpuid4_leaf_ecx *ecx) +{ + unsigned dummy; + unsigned line_size, lines_per_tag, assoc, size_in_kb; + union l1_cache l1i, l1d; + union l2_cache l2; + union l3_cache l3; + union l1_cache *l1 = &l1d; + + eax->full = 0; + ebx->full = 0; + ecx->full = 0; + + cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); + cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); + + switch (leaf) { + case 1: + l1 = &l1i; + fallthrough; + case 0: + if (!l1->val) + return; + assoc = assocs[l1->assoc]; + line_size = l1->line_size; + lines_per_tag = l1->lines_per_tag; + size_in_kb = l1->size_in_kb; + break; + case 2: + if (!l2.val) + return; + assoc = assocs[l2.assoc]; + line_size = l2.line_size; + lines_per_tag = l2.lines_per_tag; + /* cpu_data has errata corrections for K7 applied */ + size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); + break; + case 3: + if (!l3.val) + return; + assoc = assocs[l3.assoc]; + line_size = l3.line_size; + lines_per_tag = l3.lines_per_tag; + size_in_kb = l3.size_encoded * 512; + if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { + size_in_kb = size_in_kb >> 1; + assoc = assoc >> 1; + } + break; + default: + return; + } + + eax->split.is_self_initializing = 1; + eax->split.type = types[leaf]; + eax->split.level = levels[leaf]; + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; + + + if (assoc == 0xffff) + eax->split.is_fully_associative = 1; + ebx->split.coherency_line_size = line_size - 1; + ebx->split.ways_of_associativity = assoc - 1; + ebx->split.physical_line_partition = lines_per_tag - 1; + ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / + (ebx->split.ways_of_associativity + 1) - 1; +} + +#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) + +/* + * L3 cache descriptors + */ +static void amd_calc_l3_indices(struct amd_northbridge *nb) +{ + struct amd_l3_cache *l3 = &nb->l3_cache; + unsigned int sc0, sc1, sc2, sc3; + u32 val = 0; + + pci_read_config_dword(nb->misc, 0x1C4, &val); + + /* calculate subcache sizes */ + l3->subcaches[0] = sc0 = !(val & BIT(0)); + l3->subcaches[1] = sc1 = !(val & BIT(4)); + + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); + } + + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); + l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; +} + +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. + */ +static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + +static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf, + unsigned int slot) +{ + int index; + struct amd_northbridge *nb = this_leaf->priv; + + index = amd_get_l3_disable_slot(nb, slot); + if (index >= 0) + return sprintf(buf, "%d\n", index); + + return sprintf(buf, "FREE\n"); +} + +#define SHOW_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ + return show_cache_disable(this_leaf, buf, slot); \ +} +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, + unsigned slot, unsigned long idx) +{ + int i; + + idx |= BIT(30); + + /* + * disable index in all 4 subcaches + */ + for (i = 0; i < 4; i++) { + u32 reg = idx | (i << 20); + + if (!nb->l3_cache.subcaches[i]) + continue; + + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + + /* + * We need to WBINVD on a core on the node containing the L3 + * cache which indices we disable therefore a simple wbinvd() + * is not sufficient. + */ + wbinvd_on_cpu(cpu); + + reg |= BIT(31); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + } +} + +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, + unsigned slot, unsigned long index) +{ + int ret = 0; + + /* check if @slot is already used or the index is already disabled */ + ret = amd_get_l3_disable_slot(nb, slot); + if (ret >= 0) + return -EEXIST; + + if (index > nb->l3_cache.indices) + return -EINVAL; + + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(nb, !slot)) + return -EEXIST; + + amd_l3_disable_index(nb, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct cacheinfo *this_leaf, + const char *buf, size_t count, + unsigned int slot) +{ + unsigned long val = 0; + int cpu, err = 0; + struct amd_northbridge *nb = this_leaf->priv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cpu = cpumask_first(&this_leaf->shared_cpu_map); + + if (kstrtoul(buf, 10, &val) < 0) + return -EINVAL; + + err = amd_set_l3_disable_slot(nb, cpu, slot, val); + if (err) { + if (err == -EEXIST) + pr_warn("L3 slot %d in use/index already disabled!\n", + slot); + return err; + } + return count; +} + +#define STORE_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ + return store_cache_disable(this_leaf, buf, count, slot); \ +} +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static ssize_t subcaches_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + int cpu = cpumask_first(&this_leaf->shared_cpu_map); + + return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t subcaches_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + int cpu = cpumask_first(&this_leaf->shared_cpu_map); + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (kstrtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; +} + +static DEVICE_ATTR_RW(cache_disable_0); +static DEVICE_ATTR_RW(cache_disable_1); +static DEVICE_ATTR_RW(subcaches); + +static umode_t +cache_private_attrs_is_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct device *dev = kobj_to_dev(kobj); + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + umode_t mode = attr->mode; + + if (!this_leaf->priv) + return 0; + + if ((attr == &dev_attr_subcaches.attr) && + amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return mode; + + if ((attr == &dev_attr_cache_disable_0.attr || + attr == &dev_attr_cache_disable_1.attr) && + amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return mode; + + return 0; +} + +static struct attribute_group cache_private_group = { + .is_visible = cache_private_attrs_is_visible, +}; + +static void init_amd_l3_attrs(void) +{ + int n = 1; + static struct attribute **amd_l3_attrs; + + if (amd_l3_attrs) /* already initialized */ + return; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); + if (!amd_l3_attrs) + return; + + n = 0; + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; + amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; + } + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + amd_l3_attrs[n++] = &dev_attr_subcaches.attr; + + cache_private_group.attrs = amd_l3_attrs; +} + +const struct attribute_group * +cache_get_priv_group(struct cacheinfo *this_leaf) +{ + struct amd_northbridge *nb = this_leaf->priv; + + if (this_leaf->level < 3 || !nb) + return NULL; + + if (nb && nb->l3_cache.indices) + init_amd_l3_attrs(); + + return &cache_private_group; +} + +static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +{ + int node; + + /* only for L3, and not in virtualized environments */ + if (index < 3) + return; + + node = amd_get_nb_id(smp_processor_id()); + this_leaf->nb = node_to_amd_nb(node); + if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) + amd_calc_l3_indices(this_leaf->nb); +} +#else +#define amd_init_l3_cache(x, y) +#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ + +static int +cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) +{ + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + unsigned edx; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) + cpuid_count(0x8000001d, index, &eax.full, + &ebx.full, &ecx.full, &edx); + else + amd_cpuid4(index, &eax, &ebx, &ecx); + amd_init_l3_cache(this_leaf, index); + } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + cpuid_count(0x8000001d, index, &eax.full, + &ebx.full, &ecx.full, &edx); + amd_init_l3_cache(this_leaf, index); + } else { + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); + } + + if (eax.split.type == CTYPE_NULL) + return -EIO; /* better error ? */ + + this_leaf->eax = eax; + this_leaf->ebx = ebx; + this_leaf->ecx = ecx; + this_leaf->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); + return 0; +} + +static int find_num_cache_leaves(struct cpuinfo_x86 *c) +{ + unsigned int eax, ebx, ecx, edx, op; + union _cpuid4_leaf_eax cache_eax; + int i = -1; + + if (c->x86_vendor == X86_VENDOR_AMD || + c->x86_vendor == X86_VENDOR_HYGON) + op = 0x8000001d; + else + op = 4; + + do { + ++i; + /* Do cpuid(op) loop to find out num_cache_leaves */ + cpuid_count(op, i, &eax, &ebx, &ecx, &edx); + cache_eax.full = eax; + } while (cache_eax.split.type != CTYPE_NULL); + return i; +} + +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) +{ + /* + * We may have multiple LLCs if L3 caches exist, so check if we + * have an L3 cache by looking at the L3 cache CPUID leaf. + */ + if (!cpuid_edx(0x80000006)) + return; + + if (c->x86 < 0x17) { + /* LLC is at the node level. */ + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { + /* + * LLC is at the core complex level. + * Core complex ID is ApicId[3] for these processors. + */ + per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + } else { + /* + * LLC ID is calculated from the number of threads sharing the + * cache. + * */ + u32 eax, ebx, ecx, edx, num_sharing_cache = 0; + u32 llc_index = find_num_cache_leaves(c) - 1; + + cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx); + if (eax) + num_sharing_cache = ((eax >> 14) & 0xfff) + 1; + + if (num_sharing_cache) { + int bits = get_count_order(num_sharing_cache); + + per_cpu(cpu_llc_id, cpu) = c->apicid >> bits; + } + } +} + +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu) +{ + /* + * We may have multiple LLCs if L3 caches exist, so check if we + * have an L3 cache by looking at the L3 cache CPUID leaf. + */ + if (!cpuid_edx(0x80000006)) + return; + + /* + * LLC is at the core complex level. + * Core complex ID is ApicId[3] for these processors. + */ + per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; +} + +void init_amd_cacheinfo(struct cpuinfo_x86 *c) +{ + + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + num_cache_leaves = find_num_cache_leaves(c); + } else if (c->extended_cpuid_level >= 0x80000006) { + if (cpuid_edx(0x80000006) & 0xf000) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + } +} + +void init_hygon_cacheinfo(struct cpuinfo_x86 *c) +{ + num_cache_leaves = find_num_cache_leaves(c); +} + +void init_intel_cacheinfo(struct cpuinfo_x86 *c) +{ + /* Cache sizes */ + unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; + unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ + unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ + unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; +#ifdef CONFIG_SMP + unsigned int cpu = c->cpu_index; +#endif + + if (c->cpuid_level > 3) { + static int is_initialized; + + if (is_initialized == 0) { + /* Init num_cache_leaves from boot CPU */ + num_cache_leaves = find_num_cache_leaves(c); + is_initialized++; + } + + /* + * Whenever possible use cpuid(4), deterministic cache + * parameters cpuid leaf to find the cache details + */ + for (i = 0; i < num_cache_leaves; i++) { + struct _cpuid4_info_regs this_leaf = {}; + int retval; + + retval = cpuid4_cache_lookup_regs(i, &this_leaf); + if (retval < 0) + continue; + + switch (this_leaf.eax.split.level) { + case 1: + if (this_leaf.eax.split.type == CTYPE_DATA) + new_l1d = this_leaf.size/1024; + else if (this_leaf.eax.split.type == CTYPE_INST) + new_l1i = this_leaf.size/1024; + break; + case 2: + new_l2 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l2_id = c->apicid & ~((1 << index_msb) - 1); + break; + case 3: + new_l3 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l3_id = c->apicid & ~((1 << index_msb) - 1); + break; + default: + break; + } + } + } + /* + * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for + * trace cache + */ + if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) { + /* supports eax=2 call */ + int j, n; + unsigned int regs[4]; + unsigned char *dp = (unsigned char *)regs; + int only_trace = 0; + + if (num_cache_leaves != 0 && c->x86 == 15) + only_trace = 1; + + /* Number of times to iterate */ + n = cpuid_eax(2) & 0xFF; + + for (i = 0 ; i < n ; i++) { + cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); + + /* If bit 31 is set, this is an unknown format */ + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; + + /* Byte 0 is level count, not a descriptor */ + for (j = 1 ; j < 16 ; j++) { + unsigned char des = dp[j]; + unsigned char k = 0; + + /* look up this descriptor in the table */ + while (cache_table[k].descriptor != 0) { + if (cache_table[k].descriptor == des) { + if (only_trace && cache_table[k].cache_type != LVL_TRACE) + break; + switch (cache_table[k].cache_type) { + case LVL_1_INST: + l1i += cache_table[k].size; + break; + case LVL_1_DATA: + l1d += cache_table[k].size; + break; + case LVL_2: + l2 += cache_table[k].size; + break; + case LVL_3: + l3 += cache_table[k].size; + break; + case LVL_TRACE: + trace += cache_table[k].size; + break; + } + + break; + } + + k++; + } + } + } + } + + if (new_l1d) + l1d = new_l1d; + + if (new_l1i) + l1i = new_l1i; + + if (new_l2) { + l2 = new_l2; +#ifdef CONFIG_SMP + per_cpu(cpu_llc_id, cpu) = l2_id; +#endif + } + + if (new_l3) { + l3 = new_l3; +#ifdef CONFIG_SMP + per_cpu(cpu_llc_id, cpu) = l3_id; +#endif + } + +#ifdef CONFIG_SMP + /* + * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in + * turns means that the only possibility is SMT (as indicated in + * cpuid1). Since cpuid2 doesn't specify shared caches, and we know + * that SMT shares all caches, we can unconditionally set cpu_llc_id to + * c->phys_proc_id. + */ + if (per_cpu(cpu_llc_id, cpu) == BAD_APICID) + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; +#endif + + c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); + + if (!l2) + cpu_detect_cache_sizes(c); +} + +static int __cache_amd_cpumap_setup(unsigned int cpu, int index, + struct _cpuid4_info_regs *base) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf; + int i, sibling; + + /* + * For L3, always use the pre-calculated cpu_llc_shared_mask + * to derive shared_cpu_map. + */ + if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + this_leaf = this_cpu_ci->info_list + index; + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { + if (!cpu_online(sibling)) + continue; + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); + } + } + } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + unsigned int apicid, nshared, first, last; + + nshared = base->eax.split.num_threads_sharing + 1; + apicid = cpu_data(cpu).apicid; + first = apicid - (apicid % nshared); + last = first + nshared - 1; + + for_each_online_cpu(i) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + + apicid = cpu_data(i).apicid; + if ((apicid < first) || (apicid > last)) + continue; + + this_leaf = this_cpu_ci->info_list + index; + + for_each_online_cpu(sibling) { + apicid = cpu_data(sibling).apicid; + if ((apicid < first) || (apicid > last)) + continue; + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); + } + } + } else + return 0; + + return 1; +} + +static void __cache_cpumap_setup(unsigned int cpu, int index, + struct _cpuid4_info_regs *base) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf, *sibling_leaf; + unsigned long num_threads_sharing; + int index_msb, i; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86_vendor == X86_VENDOR_AMD || + c->x86_vendor == X86_VENDOR_HYGON) { + if (__cache_amd_cpumap_setup(cpu, index, base)) + return; + } + + this_leaf = this_cpu_ci->info_list + index; + num_threads_sharing = 1 + base->eax.split.num_threads_sharing; + + cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); + if (num_threads_sharing == 1) + return; + + index_msb = get_count_order(num_threads_sharing); + + for_each_online_cpu(i) + if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) { + struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); + + if (i == cpu || !sib_cpu_ci->info_list) + continue;/* skip if itself or no cacheinfo */ + sibling_leaf = sib_cpu_ci->info_list + index; + cpumask_set_cpu(i, &this_leaf->shared_cpu_map); + cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map); + } +} + +static void ci_leaf_init(struct cacheinfo *this_leaf, + struct _cpuid4_info_regs *base) +{ + this_leaf->id = base->id; + this_leaf->attributes = CACHE_ID; + this_leaf->level = base->eax.split.level; + this_leaf->type = cache_type_map[base->eax.split.type]; + this_leaf->coherency_line_size = + base->ebx.split.coherency_line_size + 1; + this_leaf->ways_of_associativity = + base->ebx.split.ways_of_associativity + 1; + this_leaf->size = base->size; + this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1; + this_leaf->physical_line_partition = + base->ebx.split.physical_line_partition + 1; + this_leaf->priv = base->nb; +} + +int init_cache_level(unsigned int cpu) +{ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + + if (!num_cache_leaves) + return -ENOENT; + if (!this_cpu_ci) + return -EINVAL; + this_cpu_ci->num_levels = 3; + this_cpu_ci->num_leaves = num_cache_leaves; + return 0; +} + +/* + * The max shared threads number comes from CPUID.4:EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. + */ +static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + id4_regs->id = c->apicid >> index_msb; +} + +int populate_cache_leaves(unsigned int cpu) +{ + unsigned int idx, ret; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf = this_cpu_ci->info_list; + struct _cpuid4_info_regs id4_regs = {}; + + for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { + ret = cpuid4_cache_lookup_regs(idx, &id4_regs); + if (ret) + return ret; + get_cache_id(cpu, &id4_regs); + ci_leaf_init(this_leaf++, &id4_regs); + __cache_cpumap_setup(cpu, idx, &id4_regs); + } + this_cpu_ci->cpu_map_populated = true; + + return 0; +} diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c new file mode 100644 index 000000000..345f7d905 --- /dev/null +++ b/arch/x86/kernel/cpu/centaur.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include +#include +#include +#include +#include + +#include "cpu.h" + +#define ACE_PRESENT (1 << 6) +#define ACE_ENABLED (1 << 7) +#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */ + +#define RNG_PRESENT (1 << 2) +#define RNG_ENABLED (1 << 3) +#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ + +static void init_c3(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + /* Test for Centaur Extended Feature Flags presence */ + if (cpuid_eax(0xC0000000) >= 0xC0000001) { + u32 tmp = cpuid_edx(0xC0000001); + + /* enable ACE unit, if present and disabled */ + if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) { + rdmsr(MSR_VIA_FCR, lo, hi); + lo |= ACE_FCR; /* enable ACE unit */ + wrmsr(MSR_VIA_FCR, lo, hi); + pr_info("CPU: Enabled ACE h/w crypto\n"); + } + + /* enable RNG unit, if present and disabled */ + if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) { + rdmsr(MSR_VIA_RNG, lo, hi); + lo |= RNG_ENABLE; /* enable RNG unit */ + wrmsr(MSR_VIA_RNG, lo, hi); + pr_info("CPU: Enabled h/w RNG\n"); + } + + /* store Centaur Extended Feature Flags as + * word 5 of the CPU capability bit array + */ + c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001); + } +#ifdef CONFIG_X86_32 + /* Cyrix III family needs CX8 & PGE explicitly enabled. */ + if (c->x86_model >= 6 && c->x86_model <= 13) { + rdmsr(MSR_VIA_FCR, lo, hi); + lo |= (1<<1 | 1<<7); + wrmsr(MSR_VIA_FCR, lo, hi); + set_cpu_cap(c, X86_FEATURE_CX8); + } + + /* Before Nehemiah, the C3's had 3dNOW! */ + if (c->x86_model >= 6 && c->x86_model < 9) + set_cpu_cap(c, X86_FEATURE_3DNOW); +#endif + if (c->x86 == 0x6 && c->x86_model >= 0xf) { + c->x86_cache_alignment = c->x86_clflush_size * 2; + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } + + if (c->x86 >= 7) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +} + +enum { + ECX8 = 1<<1, + EIERRINT = 1<<2, + DPM = 1<<3, + DMCE = 1<<4, + DSTPCLK = 1<<5, + ELINEAR = 1<<6, + DSMC = 1<<7, + DTLOCK = 1<<8, + EDCTLB = 1<<8, + EMMX = 1<<9, + DPDC = 1<<11, + EBRPRED = 1<<12, + DIC = 1<<13, + DDC = 1<<14, + DNA = 1<<15, + ERETSTK = 1<<16, + E2MMX = 1<<19, + EAMD3D = 1<<20, +}; + +static void early_init_centaur(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + /* Emulate MTRRs using Centaur's MCR. */ + if (c->x86 == 5) + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); +#endif + if ((c->x86 == 6 && c->x86_model >= 0xf) || + (c->x86 >= 7)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#endif + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } +} + +static void init_centaur(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + char *name; + u32 fcr_set = 0; + u32 fcr_clr = 0; + u32 lo, hi, newlo; + u32 aa, bb, cc, dd; + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); +#endif + early_init_centaur(c); + init_intel_cacheinfo(c); + detect_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + + if (c->cpuid_level > 9) { + unsigned int eax = cpuid_eax(10); + + /* + * Check for version and the number of counters + * Version(eax[7:0]) can't be 0; + * Counters(eax[15:8]) should be greater than 1; + */ + if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + +#ifdef CONFIG_X86_32 + if (c->x86 == 5) { + switch (c->x86_model) { + case 4: + name = "C6"; + fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK; + fcr_clr = DPDC; + pr_notice("Disabling bugged TSC.\n"); + clear_cpu_cap(c, X86_FEATURE_TSC); + break; + case 8: + switch (c->x86_stepping) { + default: + name = "2"; + break; + case 7 ... 9: + name = "2A"; + break; + case 10 ... 15: + name = "2B"; + break; + } + fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK| + E2MMX|EAMD3D; + fcr_clr = DPDC; + break; + case 9: + name = "3"; + fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK| + E2MMX|EAMD3D; + fcr_clr = DPDC; + break; + default: + name = "??"; + } + + rdmsr(MSR_IDT_FCR1, lo, hi); + newlo = (lo|fcr_set) & (~fcr_clr); + + if (newlo != lo) { + pr_info("Centaur FCR was 0x%X now 0x%X\n", + lo, newlo); + wrmsr(MSR_IDT_FCR1, newlo, hi); + } else { + pr_info("Centaur FCR is 0x%X\n", lo); + } + /* Emulate MTRRs using Centaur's MCR. */ + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); + /* Report CX8 */ + set_cpu_cap(c, X86_FEATURE_CX8); + /* Set 3DNow! on Winchip 2 and above. */ + if (c->x86_model >= 8) + set_cpu_cap(c, X86_FEATURE_3DNOW); + /* See if we can find out some more. */ + if (cpuid_eax(0x80000000) >= 0x80000005) { + /* Yes, we can. */ + cpuid(0x80000005, &aa, &bb, &cc, &dd); + /* Add L1 data and code cache sizes. */ + c->x86_cache_size = (cc>>24)+(dd>>24); + } + sprintf(c->x86_model_id, "WinChip %s", name); + } +#endif + if (c->x86 == 6 || c->x86 >= 7) + init_c3(c); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +#endif + + init_ia32_feat_ctl(c); +} + +#ifdef CONFIG_X86_32 +static unsigned int +centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + /* VIA C3 CPUs (670-68F) need further shifting. */ + if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) + size >>= 8; + + /* + * There's also an erratum in Nehemiah stepping 1, which + * returns '65KB' instead of '64KB' + * - Note, it seems this may only be in engineering samples. + */ + if ((c->x86 == 6) && (c->x86_model == 9) && + (c->x86_stepping == 1) && (size == 65)) + size -= 1; + return size; +} +#endif + +static const struct cpu_dev centaur_cpu_dev = { + .c_vendor = "Centaur", + .c_ident = { "CentaurHauls" }, + .c_early_init = early_init_centaur, + .c_init = init_centaur, +#ifdef CONFIG_X86_32 + .legacy_cache_size = centaur_size_cache, +#endif + .c_x86_vendor = X86_VENDOR_CENTAUR, +}; + +cpu_dev_register(centaur_cpu_dev); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c new file mode 100644 index 000000000..4ecc6072e --- /dev/null +++ b/arch/x86/kernel/cpu/common.c @@ -0,0 +1,2285 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpu.h" + +u32 elf_hwcap2 __read_mostly; + +/* all of these masks are initialized in setup_cpu_local_masks() */ +cpumask_var_t cpu_initialized_mask; +cpumask_var_t cpu_callout_mask; +cpumask_var_t cpu_callin_mask; + +/* representing cpus for which sibling maps can be computed */ +cpumask_var_t cpu_sibling_setup_mask; + +/* Number of siblings per CPU package */ +int smp_num_siblings = 1; +EXPORT_SYMBOL(smp_num_siblings); + +/* Last level cache ID of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; + +/* correctly size the local cpu masks */ +void __init setup_cpu_local_masks(void) +{ + alloc_bootmem_cpumask_var(&cpu_initialized_mask); + alloc_bootmem_cpumask_var(&cpu_callin_mask); + alloc_bootmem_cpumask_var(&cpu_callout_mask); + alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); +} + +static void default_init(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + cpu_detect_cache_sizes(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu = &default_cpu; + +DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +#ifdef CONFIG_X86_64 + /* + * We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + * + * TLS descriptors are currently at a different place compared to i386. + * Hopefully nobody expects them at a fixed place (Wine?) + */ + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), +#else + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + /* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + /* 16-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. + */ + /* 32-bit code */ + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + /* 16-bit code */ + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), + /* data */ + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), + + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + GDT_STACK_CANARY_INIT +#endif +} }; +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); + +#ifdef CONFIG_X86_64 +static int __init x86_nopcid_setup(char *s) +{ + /* nopcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_PCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + pr_info("nopcid: PCID feature disabled\n"); + return 0; +} +early_param("nopcid", x86_nopcid_setup); +#endif + +static int __init x86_noinvpcid_setup(char *s) +{ + /* noinvpcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_INVPCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_INVPCID); + pr_info("noinvpcid: INVPCID feature disabled\n"); + return 0; +} +early_param("noinvpcid", x86_noinvpcid_setup); + +#ifdef CONFIG_X86_32 +static int cachesize_override = -1; +static int disable_x86_serial_nr = 1; + +static int __init cachesize_setup(char *str) +{ + get_option(&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + +static int __init x86_sep_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_SEP); + return 1; +} +__setup("nosep", x86_sep_setup); + +/* Standard macro to see if a specific flag is changeable */ +static inline int flag_is_changeable_p(u32 flag) +{ + u32 f1, f2; + + /* + * Cyrix and IDT cpus allow disabling of CPUID + * so the code below may return different results + * when it is executed before and after enabling + * the CPUID. Add "volatile" to not allow gcc to + * optimize the subsequent calls to this function. + */ + asm volatile ("pushfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "movl %0, %1 \n\t" + "xorl %2, %0 \n\t" + "pushl %0 \n\t" + "popfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "popfl \n\t" + + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + + return ((f1^f2) & flag) != 0; +} + +/* Probe for the CPUID instruction */ +int have_cpuid_p(void) +{ + return flag_is_changeable_p(X86_EFLAGS_ID); +} + +static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ + unsigned long lo, hi; + + if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr) + return; + + /* Disable processor serial number: */ + + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + + pr_notice("CPU serial number disabled.\n"); + clear_cpu_cap(c, X86_FEATURE_PN); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); +} + +static int __init x86_serial_nr_setup(char *s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); +#else +static inline int flag_is_changeable_p(u32 flag) +{ + return 1; +} +static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ +} +#endif + +static __init int setup_disable_smep(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_SMEP); + return 1; +} +__setup("nosmep", setup_disable_smep); + +static __always_inline void setup_smep(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_SMEP)) + cr4_set_bits(X86_CR4_SMEP); +} + +static __init int setup_disable_smap(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_SMAP); + return 1; +} +__setup("nosmap", setup_disable_smap); + +static __always_inline void setup_smap(struct cpuinfo_x86 *c) +{ + unsigned long eflags = native_save_fl(); + + /* This should have been cleared long ago */ + BUG_ON(eflags & X86_EFLAGS_AC); + + if (cpu_has(c, X86_FEATURE_SMAP)) { +#ifdef CONFIG_X86_SMAP + cr4_set_bits(X86_CR4_SMAP); +#else + clear_cpu_cap(c, X86_FEATURE_SMAP); + cr4_clear_bits(X86_CR4_SMAP); +#endif + } +} + +static __always_inline void setup_umip(struct cpuinfo_x86 *c) +{ + /* Check the boot processor, plus build option for UMIP. */ + if (!cpu_feature_enabled(X86_FEATURE_UMIP)) + goto out; + + /* Check the current processor's cpuid bits. */ + if (!cpu_has(c, X86_FEATURE_UMIP)) + goto out; + + cr4_set_bits(X86_CR4_UMIP); + + pr_info_once("x86/cpu: User Mode Instruction Prevention (UMIP) activated\n"); + + return; + +out: + /* + * Make sure UMIP is disabled in case it was enabled in a + * previous boot (e.g., via kexec). + */ + cr4_clear_bits(X86_CR4_UMIP); +} + +/* These bits should not change their value after CPU init is finished. */ +static const unsigned long cr4_pinned_mask = + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | X86_CR4_FSGSBASE; +static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); +static unsigned long cr4_pinned_bits __ro_after_init; + +void native_write_cr0(unsigned long val) +{ + unsigned long bits_missing = 0; + +set_register: + asm volatile("mov %0,%%cr0": "+r" (val) : : "memory"); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) { + bits_missing = X86_CR0_WP; + val |= bits_missing; + goto set_register; + } + /* Warn after we've set the missing bits. */ + WARN_ONCE(bits_missing, "CR0 WP bit went missing!?\n"); + } +} +EXPORT_SYMBOL(native_write_cr0); + +void native_write_cr4(unsigned long val) +{ + unsigned long bits_changed = 0; + +set_register: + asm volatile("mov %0,%%cr4": "+r" (val) : : "memory"); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) { + bits_changed = (val & cr4_pinned_mask) ^ cr4_pinned_bits; + val = (val & ~cr4_pinned_mask) | cr4_pinned_bits; + goto set_register; + } + /* Warn after we've corrected the changed bits. */ + WARN_ONCE(bits_changed, "pinned CR4 bits changed: 0x%lx!?\n", + bits_changed); + } +} +#if IS_MODULE(CONFIG_LKDTM) +EXPORT_SYMBOL_GPL(native_write_cr4); +#endif + +void cr4_update_irqsoff(unsigned long set, unsigned long clear) +{ + unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + + lockdep_assert_irqs_disabled(); + + newval = (cr4 & ~clear) | set; + if (newval != cr4) { + this_cpu_write(cpu_tlbstate.cr4, newval); + __write_cr4(newval); + } +} +EXPORT_SYMBOL(cr4_update_irqsoff); + +/* Read the CR4 shadow. */ +unsigned long cr4_read_shadow(void) +{ + return this_cpu_read(cpu_tlbstate.cr4); +} +EXPORT_SYMBOL_GPL(cr4_read_shadow); + +void cr4_init(void) +{ + unsigned long cr4 = __read_cr4(); + + if (boot_cpu_has(X86_FEATURE_PCID)) + cr4 |= X86_CR4_PCIDE; + if (static_branch_likely(&cr_pinning)) + cr4 = (cr4 & ~cr4_pinned_mask) | cr4_pinned_bits; + + __write_cr4(cr4); + + /* Initialize cr4 shadow for this CPU. */ + this_cpu_write(cpu_tlbstate.cr4, cr4); +} + +/* + * Once CPU feature detection is finished (and boot params have been + * parsed), record any of the sensitive CR bits that are set, and + * enable CR pinning. + */ +static void __init setup_cr_pinning(void) +{ + cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & cr4_pinned_mask; + static_key_enable(&cr_pinning.key); +} + +static __init int x86_nofsgsbase_setup(char *arg) +{ + /* Require an exact match without trailing characters. */ + if (strlen(arg)) + return 0; + + /* Do not emit a message if the feature is not present. */ + if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); + pr_info("FSGSBASE disabled via kernel command line\n"); + return 1; +} +__setup("nofsgsbase", x86_nofsgsbase_setup); + +/* + * Protection Keys are not available in 32-bit mode. + */ +static bool pku_disabled; + +static __always_inline void setup_pku(struct cpuinfo_x86 *c) +{ + /* check the boot processor, plus compile options for PKU: */ + if (!cpu_feature_enabled(X86_FEATURE_PKU)) + return; + /* checks the actual processor's cpuid bits: */ + if (!cpu_has(c, X86_FEATURE_PKU)) + return; + if (pku_disabled) + return; + + cr4_set_bits(X86_CR4_PKE); + /* + * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE + * cpuid bit to be set. We need to ensure that we + * update that bit in this CPU's "cpu_info". + */ + set_cpu_cap(c, X86_FEATURE_OSPKE); +} + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +static __init int setup_disable_pku(char *arg) +{ + /* + * Do not clear the X86_FEATURE_PKU bit. All of the + * runtime checks are against OSPKE so clearing the + * bit does nothing. + * + * This way, we will see "pku" in cpuinfo, but not + * "ospke", which is exactly what we want. It shows + * that the CPU has PKU, but the OS has not enabled it. + * This happens to be exactly how a system would look + * if we disabled the config option. + */ + pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n"); + pku_disabled = true; + return 1; +} +__setup("nopku", setup_disable_pku); +#endif /* CONFIG_X86_64 */ + +/* + * Some CPU features depend on higher CPUID levels, which may not always + * be available due to CPUID level capping or broken virtualization + * software. Add those features to this table to auto-disable them. + */ +struct cpuid_dependent_feature { + u32 feature; + u32 level; +}; + +static const struct cpuid_dependent_feature +cpuid_dependent_features[] = { + { X86_FEATURE_MWAIT, 0x00000005 }, + { X86_FEATURE_DCA, 0x00000009 }, + { X86_FEATURE_XSAVE, 0x0000000d }, + { 0, 0 } +}; + +static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) +{ + const struct cpuid_dependent_feature *df; + + for (df = cpuid_dependent_features; df->feature; df++) { + + if (!cpu_has(c, df->feature)) + continue; + /* + * Note: cpuid_level is set to -1 if unavailable, but + * extended_extended_level is set to 0 if unavailable + * and the legitimate extended levels are all negative + * when signed; hence the weird messing around with + * signs here... + */ + if (!((s32)df->level < 0 ? + (u32)df->level > (u32)c->extended_cpuid_level : + (s32)df->level > (s32)c->cpuid_level)) + continue; + + clear_cpu_cap(c, df->feature); + if (!warn) + continue; + + pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", + x86_cap_flag(df->feature), df->level); + } +} + +/* + * Naming convention should be: [()] + * This table only is used unless init_() below doesn't set it; + * in particular, if CPUID levels 0x80000002..4 are supported, this + * isn't used + */ + +/* Look up CPU names by table lookup. */ +static const char *table_lookup_model(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + const struct legacy_cpu_model_info *info; + + if (c->x86_model >= 16) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->legacy_models; + + while (info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } +#endif + return NULL; /* Not found */ +} + +/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */ +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); +__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); + +void load_percpu_segment(int cpu) +{ +#ifdef CONFIG_X86_32 + loadsegment(fs, __KERNEL_PERCPU); +#else + __loadsegment_simple(gs, 0); + wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); +#endif + load_stack_canary_segment(); +} + +#ifdef CONFIG_X86_32 +/* The 32-bit entry code needs to find cpu_entry_area. */ +DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); +#endif + +/* Load the original GDT from the per-cpu structure */ +void load_direct_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_rw(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} +EXPORT_SYMBOL_GPL(load_direct_gdt); + +/* Load a fixmap remapping of the per-cpu GDT */ +void load_fixmap_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_ro(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} +EXPORT_SYMBOL_GPL(load_fixmap_gdt); + +/* + * Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. + */ +void switch_to_new_gdt(int cpu) +{ + /* Load the original GDT */ + load_direct_gdt(cpu); + /* Reload the per-cpu base */ + load_percpu_segment(cpu); +} + +static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; + +static void get_model_name(struct cpuinfo_x86 *c) +{ + unsigned int *v; + char *p, *q, *s; + + if (c->extended_cpuid_level < 0x80000004) + return; + + v = (unsigned int *)c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + + /* Trim whitespace */ + p = q = s = &c->x86_model_id[0]; + + while (*p == ' ') + p++; + + while (*p) { + /* Note the last non-whitespace index */ + if (!isspace(*p)) + s = q; + + *q++ = *p++; + } + + *(s + 1) = '\0'; +} + +void detect_num_cpu_cores(struct cpuinfo_x86 *c) +{ + unsigned int eax, ebx, ecx, edx; + + c->x86_max_cores = 1; + if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4) + return; + + cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); + if (eax & 0x1f) + c->x86_max_cores = (eax >> 26) + 1; +} + +void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) +{ + unsigned int n, dummy, ebx, ecx, edx, l2size; + + n = c->extended_cpuid_level; + + if (n >= 0x80000005) { + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); + c->x86_cache_size = (ecx>>24) + (edx>>24); +#ifdef CONFIG_X86_64 + /* On K8 L1 TLB is inclusive, so don't count it */ + c->x86_tlbsize = 0; +#endif + } + + if (n < 0x80000006) /* Some chips just has a large L1. */ + return; + + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); + l2size = ecx >> 16; + +#ifdef CONFIG_X86_64 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); +#else + /* do processor-specific cache resizing */ + if (this_cpu->legacy_cache_size) + l2size = this_cpu->legacy_cache_size(c, l2size); + + /* Allow user to override all this if necessary. */ + if (cachesize_override != -1) + l2size = cachesize_override; + + if (l2size == 0) + return; /* Again, no L2 cache is possible */ +#endif + + c->x86_cache_size = l2size; +} + +u16 __read_mostly tlb_lli_4k[NR_INFO]; +u16 __read_mostly tlb_lli_2m[NR_INFO]; +u16 __read_mostly tlb_lli_4m[NR_INFO]; +u16 __read_mostly tlb_lld_4k[NR_INFO]; +u16 __read_mostly tlb_lld_2m[NR_INFO]; +u16 __read_mostly tlb_lld_4m[NR_INFO]; +u16 __read_mostly tlb_lld_1g[NR_INFO]; + +static void cpu_detect_tlb(struct cpuinfo_x86 *c) +{ + if (this_cpu->c_detect_tlb) + this_cpu->c_detect_tlb(c); + + pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n", + tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], + tlb_lli_4m[ENTRIES]); + + pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", + tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], + tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); +} + +int detect_ht_early(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + u32 eax, ebx, ecx, edx; + + if (!cpu_has(c, X86_FEATURE_HT)) + return -1; + + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + return -1; + + if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) + return -1; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + smp_num_siblings = (ebx & 0xff0000) >> 16; + if (smp_num_siblings == 1) + pr_info_once("CPU0: Hyper-Threading is disabled\n"); +#endif + return 0; +} + +void detect_ht(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + int index_msb, core_bits; + + if (detect_ht_early(c) < 0) + return; + + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings); + + core_bits = get_count_order(c->x86_max_cores); + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & + ((1 << core_bits) - 1); +#endif +} + +static void get_cpu_vendor(struct cpuinfo_x86 *c) +{ + char *v = c->x86_vendor_id; + int i; + + for (i = 0; i < X86_VENDOR_NUM; i++) { + if (!cpu_devs[i]) + break; + + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { + + this_cpu = cpu_devs[i]; + c->x86_vendor = this_cpu->c_x86_vendor; + return; + } + } + + pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); + + c->x86_vendor = X86_VENDOR_UNKNOWN; + this_cpu = &default_cpu; +} + +void cpu_detect(struct cpuinfo_x86 *c) +{ + /* Get vendor name */ + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); + + c->x86 = 4; + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 junk, tfms, cap0, misc; + + cpuid(0x00000001, &tfms, &misc, &junk, &cap0); + c->x86 = x86_family(tfms); + c->x86_model = x86_model(tfms); + c->x86_stepping = x86_stepping(tfms); + + if (cap0 & (1<<19)) { + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + c->x86_cache_alignment = c->x86_clflush_size; + } + } +} + +static void apply_forced_caps(struct cpuinfo_x86 *c) +{ + int i; + + for (i = 0; i < NCAPINTS + NBUGINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } +} + +static void init_speculation_control(struct cpuinfo_x86 *c) +{ + /* + * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support, + * and they also have a different bit for STIBP support. Also, + * a hypervisor might have set the individual AMD bits even on + * Intel CPUs, for finer-grained selection of what's available. + */ + if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { + set_cpu_cap(c, X86_FEATURE_IBRS); + set_cpu_cap(c, X86_FEATURE_IBPB); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } + + if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) + set_cpu_cap(c, X86_FEATURE_STIBP); + + if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) || + cpu_has(c, X86_FEATURE_VIRT_SSBD)) + set_cpu_cap(c, X86_FEATURE_SSBD); + + if (cpu_has(c, X86_FEATURE_AMD_IBRS)) { + set_cpu_cap(c, X86_FEATURE_IBRS); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } + + if (cpu_has(c, X86_FEATURE_AMD_IBPB)) + set_cpu_cap(c, X86_FEATURE_IBPB); + + if (cpu_has(c, X86_FEATURE_AMD_STIBP)) { + set_cpu_cap(c, X86_FEATURE_STIBP); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } + + if (cpu_has(c, X86_FEATURE_AMD_SSBD)) { + set_cpu_cap(c, X86_FEATURE_SSBD); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD); + } +} + +void get_cpu_cap(struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_1_ECX] = ecx; + c->x86_capability[CPUID_1_EDX] = edx; + } + + /* Thermal and Power Management Leaf: level 0x00000006 (eax) */ + if (c->cpuid_level >= 0x00000006) + c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); + + /* Additional Intel-defined flags: level 0x00000007 */ + if (c->cpuid_level >= 0x00000007) { + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_7_0_EBX] = ebx; + c->x86_capability[CPUID_7_ECX] = ecx; + c->x86_capability[CPUID_7_EDX] = edx; + + /* Check valid sub-leaf index before accessing it */ + if (eax >= 1) { + cpuid_count(0x00000007, 1, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_7_1_EAX] = eax; + } + } + + /* Extended state features: level 0x0000000d */ + if (c->cpuid_level >= 0x0000000d) { + cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_D_1_EAX] = eax; + } + + /* AMD-defined flags: level 0x80000001 */ + eax = cpuid_eax(0x80000000); + c->extended_cpuid_level = eax; + + if ((eax & 0xffff0000) == 0x80000000) { + if (eax >= 0x80000001) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_8000_0001_ECX] = ecx; + c->x86_capability[CPUID_8000_0001_EDX] = edx; + } + } + + if (c->extended_cpuid_level >= 0x80000007) { + cpuid(0x80000007, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_8000_0007_EBX] = ebx; + c->x86_power = edx; + } + + if (c->extended_cpuid_level >= 0x80000008) { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_8000_0008_EBX] = ebx; + } + + if (c->extended_cpuid_level >= 0x8000000a) + c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + + if (c->extended_cpuid_level >= 0x8000001f) + c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f); + + if (c->extended_cpuid_level >= 0x80000021) + c->x86_capability[CPUID_8000_0021_EAX] = cpuid_eax(0x80000021); + + init_scattered_cpuid_features(c); + init_speculation_control(c); + + /* + * Clear/Set all flags overridden by options, after probe. + * This needs to happen each time we re-probe, which may happen + * several times during CPU initialization. + */ + apply_forced_caps(c); +} + +void get_cpu_address_sizes(struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + + if (c->extended_cpuid_level >= 0x80000008) { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } +#ifdef CONFIG_X86_32 + else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; +#endif + c->x86_cache_bits = c->x86_phys_bits; +} + +static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + int i; + + /* + * First of all, decide if this is a 486 or higher + * It's a 486 if we can modify the AC flag + */ + if (flag_is_changeable_p(X86_EFLAGS_AC)) + c->x86 = 4; + else + c->x86 = 3; + + for (i = 0; i < X86_VENDOR_NUM; i++) + if (cpu_devs[i] && cpu_devs[i]->c_identify) { + c->x86_vendor_id[0] = 0; + cpu_devs[i]->c_identify(c); + if (c->x86_vendor_id[0]) { + get_cpu_vendor(c); + break; + } + } +#endif +} + +#define NO_SPECULATION BIT(0) +#define NO_MELTDOWN BIT(1) +#define NO_SSB BIT(2) +#define NO_L1TF BIT(3) +#define NO_MDS BIT(4) +#define MSBDS_ONLY BIT(5) +#define NO_SWAPGS BIT(6) +#define NO_ITLB_MULTIHIT BIT(7) +#define NO_SPECTRE_V2 BIT(8) +#define NO_MMIO BIT(9) +#define NO_EIBRS_PBRSB BIT(10) + +#define VULNWL(vendor, family, model, whitelist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) + +#define VULNWL_INTEL(model, whitelist) \ + VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) + +#define VULNWL_AMD(family, whitelist) \ + VULNWL(AMD, family, X86_MODEL_ANY, whitelist) + +#define VULNWL_HYGON(family, whitelist) \ + VULNWL(HYGON, family, X86_MODEL_ANY, whitelist) + +static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { + VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), + + /* Intel Family 6 */ + VULNWL_INTEL(TIGERLAKE, NO_MMIO), + VULNWL_INTEL(TIGERLAKE_L, NO_MMIO), + VULNWL_INTEL(ALDERLAKE, NO_MMIO), + VULNWL_INTEL(ALDERLAKE_L, NO_MMIO), + + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(CORE_YONAH, NO_SSB), + + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + + /* + * Technically, swapgs isn't serializing on AMD (despite it previously + * being documented as such in the APM). But according to AMD, %gs is + * updated non-speculatively, and the issuing of %gs-relative memory + * operands will be blocked until the %gs update completes, which is + * good enough for our purposes. + */ + + VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), + + /* AMD Family 0xf - 0x12 */ + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + + /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + + /* Zhaoxin Family 7 */ + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + {} +}; + +#define VULNBL(vendor, family, model, blacklist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) + +#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, steppings, \ + X86_FEATURE_ANY, issues) + +#define VULNBL_AMD(family, blacklist) \ + VULNBL(AMD, family, X86_MODEL_ANY, blacklist) + +#define VULNBL_HYGON(family, blacklist) \ + VULNBL(HYGON, family, X86_MODEL_ANY, blacklist) + +#define SRBDS BIT(0) +/* CPU is affected by X86_BUG_MMIO_STALE_DATA */ +#define MMIO BIT(1) +/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */ +#define MMIO_SBDS BIT(2) +/* CPU is affected by RETbleed, speculating where you would not expect it */ +#define RETBLEED BIT(3) +/* CPU is affected by SMT (cross-thread) return predictions */ +#define SMT_RSB BIT(4) +/* CPU is affected by SRSO */ +#define SRSO BIT(5) +/* CPU is affected by GDS */ +#define GDS BIT(6) + +static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { + VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS), + VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS), + VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + + VULNBL_AMD(0x15, RETBLEED), + VULNBL_AMD(0x16, RETBLEED), + VULNBL_AMD(0x17, RETBLEED | SRSO), + VULNBL_HYGON(0x18, RETBLEED), + VULNBL_AMD(0x19, SRSO), + {} +}; + +static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which) +{ + const struct x86_cpu_id *m = x86_match_cpu(table); + + return m && !!(m->driver_data & which); +} + +u64 x86_read_arch_cap_msr(void) +{ + u64 ia32_cap = 0; + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + return ia32_cap; +} + +static bool arch_cap_mmio_immune(u64 ia32_cap) +{ + return (ia32_cap & ARCH_CAP_FBSDP_NO && + ia32_cap & ARCH_CAP_PSDP_NO && + ia32_cap & ARCH_CAP_SBDR_SSDP_NO); +} + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ + if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && + !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); + + if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) + return; + + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + + if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + + if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && + !(ia32_cap & ARCH_CAP_SSB_NO) && + !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + + if (ia32_cap & ARCH_CAP_IBRS_ALL) + setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + + if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && + !(ia32_cap & ARCH_CAP_MDS_NO)) { + setup_force_cpu_bug(X86_BUG_MDS); + if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) + setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); + } + + if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS)) + setup_force_cpu_bug(X86_BUG_SWAPGS); + + /* + * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when: + * - TSX is supported or + * - TSX_CTRL is present + * + * TSX_CTRL check is needed for cases when TSX could be disabled before + * the kernel boot e.g. kexec. + * TSX_CTRL check alone is not sufficient for cases when the microcode + * update is not present or running as guest that don't get TSX_CTRL. + */ + if (!(ia32_cap & ARCH_CAP_TAA_NO) && + (cpu_has(c, X86_FEATURE_RTM) || + (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + setup_force_cpu_bug(X86_BUG_TAA); + + /* + * SRBDS affects CPUs which support RDRAND or RDSEED and are listed + * in the vulnerability blacklist. + * + * Some of the implications and mitigation of Shared Buffers Data + * Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as + * SRBDS. + */ + if ((cpu_has(c, X86_FEATURE_RDRAND) || + cpu_has(c, X86_FEATURE_RDSEED)) && + cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS)) + setup_force_cpu_bug(X86_BUG_SRBDS); + + /* + * Processor MMIO Stale Data bug enumeration + * + * Affected CPU list is generally enough to enumerate the vulnerability, + * but for virtualization case check for ARCH_CAP MSR bits also, VMM may + * not want the guest to enumerate the bug. + * + * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, + * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. + */ + if (!arch_cap_mmio_immune(ia32_cap)) { + if (cpu_matches(cpu_vuln_blacklist, MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); + } + + if (!cpu_has(c, X86_FEATURE_BTC_NO)) { + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + setup_force_cpu_bug(X86_BUG_RETBLEED); + } + + if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && + !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && + !(ia32_cap & ARCH_CAP_PBRSB_NO)) + setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + + /* + * Check if CPU is vulnerable to GDS. If running in a virtual machine on + * an affected processor, the VMM may have disabled the use of GATHER by + * disabling AVX2. The only way to do this in HW is to clear XCR0[2], + * which means that AVX will be disabled. + */ + if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) && + boot_cpu_has(X86_FEATURE_AVX)) + setup_force_cpu_bug(X86_BUG_GDS); + + if (!cpu_has(c, X86_FEATURE_SRSO_NO)) { + if (cpu_matches(cpu_vuln_blacklist, SRSO)) + setup_force_cpu_bug(X86_BUG_SRSO); + } + + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) + return; + + /* Rogue Data Cache Load? No! */ + if (ia32_cap & ARCH_CAP_RDCL_NO) + return; + + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + + if (cpu_matches(cpu_vuln_whitelist, NO_L1TF)) + return; + + setup_force_cpu_bug(X86_BUG_L1TF); +} + +/* + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit + * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). + */ +static void detect_nopl(void) +{ +#ifdef CONFIG_X86_32 + setup_clear_cpu_cap(X86_FEATURE_NOPL); +#else + setup_force_cpu_cap(X86_FEATURE_NOPL); +#endif +} + +/* + * We parse cpu parameters early because fpu__init_system() is executed + * before parse_early_param(). + */ +static void __init cpu_parse_early_param(void) +{ + char arg[128]; + char *argptr = arg; + int arglen, res, bit; + +#ifdef CONFIG_X86_32 + if (cmdline_find_option_bool(boot_command_line, "no387")) +#ifdef CONFIG_MATH_EMULATION + setup_clear_cpu_cap(X86_FEATURE_FPU); +#else + pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); +#endif + + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) + setup_clear_cpu_cap(X86_FEATURE_FXSR); +#endif + + if (cmdline_find_option_bool(boot_command_line, "noxsave")) + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); + if (arglen <= 0) + return; + + pr_info("Clearing CPUID bits:"); + do { + res = get_option(&argptr, &bit); + if (res == 0 || res == 3) + break; + + /* If the argument was too long, the last bit may be cut off */ + if (res == 1 && arglen >= sizeof(arg)) + break; + + if (bit >= 0 && bit < NCAPINTS * 32) { + pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + setup_clear_cpu_cap(bit); + } + } while (res == 2); + pr_cont("\n"); +} + +/* + * Do minimum CPU detection early. + * Fields really needed: vendor, cpuid_level, family, model, mask, + * cache alignment. + * The others are not touched to avoid unwanted side effects. + * + * WARNING: this function is only called on the boot CPU. Don't add code + * here that is supposed to run on all CPUs. + */ +static void __init early_identify_cpu(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; +#else + c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; + + memset(&c->x86_capability, 0, sizeof(c->x86_capability)); + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + + /* cyrix could have cpuid enabled via c_identify()*/ + if (have_cpuid_p()) { + cpu_detect(c); + get_cpu_vendor(c); + get_cpu_cap(c); + get_cpu_address_sizes(c); + setup_force_cpu_cap(X86_FEATURE_CPUID); + cpu_parse_early_param(); + + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); + + c->cpu_index = 0; + filter_cpuid_features(c, false); + + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); + } else { + setup_clear_cpu_cap(X86_FEATURE_CPUID); + } + + setup_force_cpu_cap(X86_FEATURE_ALWAYS); + + cpu_set_bug_bits(c); + + cpu_set_core_cap_bits(c); + +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif + + /* + * Later in the boot process pgtable_l5_enabled() relies on + * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not + * enabled by this point we need to clear the feature bit to avoid + * false-positives at the later stage. + * + * pgtable_l5_enabled() can be false here for several reasons: + * - 5-level paging is disabled compile-time; + * - it's 32-bit kernel; + * - machine doesn't support 5-level paging; + * - user specified 'no5lvl' in kernel command line. + */ + if (!pgtable_l5_enabled()) + setup_clear_cpu_cap(X86_FEATURE_LA57); + + detect_nopl(); +} + +void __init early_cpu_init(void) +{ + const struct cpu_dev *const *cdev; + int count = 0; + +#ifdef CONFIG_PROCESSOR_SELECT + pr_info("KERNEL supported cpus:\n"); +#endif + + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { + const struct cpu_dev *cpudev = *cdev; + + if (count >= X86_VENDOR_NUM) + break; + cpu_devs[count] = cpudev; + count++; + +#ifdef CONFIG_PROCESSOR_SELECT + { + unsigned int j; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + pr_info(" %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); + } + } +#endif + } + early_identify_cpu(&boot_cpu_data); +} + +static bool detect_null_seg_behavior(void) +{ + /* + * Empirically, writing zero to a segment selector on AMD does + * not clear the base, whereas writing zero to a segment + * selector on Intel does clear the base. Intel's behavior + * allows slightly faster context switches in the common case + * where GS is unused by the prev and next threads. + * + * Since neither vendor documents this anywhere that I can see, + * detect it directly instead of hardcoding the choice by + * vendor. + * + * I've designated AMD's behavior as the "bug" because it's + * counterintuitive and less friendly. + */ + + unsigned long old_base, tmp; + rdmsrl(MSR_FS_BASE, old_base); + wrmsrl(MSR_FS_BASE, 1); + loadsegment(fs, 0); + rdmsrl(MSR_FS_BASE, tmp); + wrmsrl(MSR_FS_BASE, old_base); + return tmp == 0; +} + +void check_null_seg_clears_base(struct cpuinfo_x86 *c) +{ + /* BUG_NULL_SEG is only relevant with 64bit userspace */ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */ + if (c->extended_cpuid_level >= 0x80000021 && + cpuid_eax(0x80000021) & BIT(6)) + return; + + /* + * CPUID bit above wasn't set. If this kernel is still running + * as a HV guest, then the HV has decided not to advertize + * that CPUID bit for whatever reason. For example, one + * member of the migration pool might be vulnerable. Which + * means, the bug is present: set the BUG flag and return. + */ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) { + set_cpu_bug(c, X86_BUG_NULL_SEG); + return; + } + + /* + * Zen2 CPUs also have this behaviour, but no CPUID bit. + * 0x18 is the respective family for Hygon. + */ + if ((c->x86 == 0x17 || c->x86 == 0x18) && + detect_null_seg_behavior()) + return; + + /* All the remaining ones are affected */ + set_cpu_bug(c, X86_BUG_NULL_SEG); +} + +static void generic_identify(struct cpuinfo_x86 *c) +{ + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + + /* cyrix could have cpuid enabled via c_identify()*/ + if (!have_cpuid_p()) + return; + + cpu_detect(c); + + get_cpu_vendor(c); + + get_cpu_cap(c); + + get_cpu_address_sizes(c); + + if (c->cpuid_level >= 0x00000001) { + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; +#ifdef CONFIG_X86_32 +# ifdef CONFIG_SMP + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); +# else + c->apicid = c->initial_apicid; +# endif +#endif + c->phys_proc_id = c->initial_apicid; + } + + get_model_name(c); /* Default name */ + + /* + * ESPFIX is a strange bug. All real CPUs have it. Paravirt + * systems that run Linux at CPL > 0 may or may not have the + * issue, but, even if they have the issue, there's absolutely + * nothing we can do about it because we can't use the real IRET + * instruction. + * + * NB: For the time being, only 32-bit kernels support + * X86_BUG_ESPFIX as such. 64-bit kernels directly choose + * whether to apply espfix using paravirt hooks. If any + * non-paravirt system ever shows up that does *not* have the + * ESPFIX issue, we can change this. + */ +#ifdef CONFIG_X86_32 + set_cpu_bug(c, X86_BUG_ESPFIX); +#endif +} + +/* + * Validate that ACPI/mptables have the same information about the + * effective APIC id and update the package map. + */ +static void validate_apic_and_package_id(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int apicid, cpu = smp_processor_id(); + + apicid = apic->cpu_present_to_apicid(cpu); + + if (apicid != c->apicid) { + pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n", + cpu, apicid, c->initial_apicid); + } + BUG_ON(topology_update_package_map(c->phys_proc_id, cpu)); + BUG_ON(topology_update_die_map(c->cpu_die_id, cpu)); +#else + c->logical_proc_id = 0; +#endif +} + +/* + * This does the hard work of actually picking apart the CPU stuff... + */ +static void identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + + c->loops_per_jiffy = loops_per_jiffy; + c->x86_cache_size = 0; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->x86_model = c->x86_stepping = 0; /* So far unknown... */ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_model_id[0] = '\0'; /* Unset */ + c->x86_max_cores = 1; + c->x86_coreid_bits = 0; + c->cu_id = 0xff; +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; +#else + c->cpuid_level = -1; /* CPUID not detected */ + c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; + memset(&c->x86_capability, 0, sizeof(c->x86_capability)); +#ifdef CONFIG_X86_VMX_FEATURE_NAMES + memset(&c->vmx_capability, 0, sizeof(c->vmx_capability)); +#endif + + generic_identify(c); + + if (this_cpu->c_identify) + this_cpu->c_identify(c); + + /* Clear/Set all flags overridden by options, after probe */ + apply_forced_caps(c); + +#ifdef CONFIG_X86_64 + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); +#endif + + /* + * Vendor-specific initialization. In this section we + * canonicalize the feature flags, meaning if there are + * features a certain CPU supports which CPUID doesn't + * tell us, CPUID claiming incorrect flags, or other bugs, + * we handle them here. + * + * At the end of this section, c->x86_capability better + * indicate the features this CPU genuinely supports! + */ + if (this_cpu->c_init) + this_cpu->c_init(c); + + /* Disable the PN if appropriate */ + squash_the_stupid_serial_number(c); + + /* Set up SMEP/SMAP/UMIP */ + setup_smep(c); + setup_smap(c); + setup_umip(c); + + /* Enable FSGSBASE instructions if available. */ + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { + cr4_set_bits(X86_CR4_FSGSBASE); + elf_hwcap2 |= HWCAP2_FSGSBASE; + } + + /* + * The vendor-specific functions might have changed features. + * Now we do "generic changes." + */ + + /* Filter out anything that depends on CPUID levels we don't have */ + filter_cpuid_features(c, true); + + /* If the model name is still unset, do table lookup. */ + if (!c->x86_model_id[0]) { + const char *p; + p = table_lookup_model(c); + if (p) + strcpy(c->x86_model_id, p); + else + /* Last resort... */ + sprintf(c->x86_model_id, "%02x/%02x", + c->x86, c->x86_model); + } + +#ifdef CONFIG_X86_64 + detect_ht(c); +#endif + + x86_init_rdrand(c); + setup_pku(c); + + /* + * Clear/Set all flags overridden by options, need do it + * before following smp all cpus cap AND. + */ + apply_forced_caps(c); + + /* + * On SMP, boot_cpu_data holds the common feature set between + * all CPUs; so make sure that we indicate which features are + * common between the CPUs. The first time this routine gets + * executed, c == &boot_cpu_data. + */ + if (c != &boot_cpu_data) { + /* AND the already accumulated flags with these */ + for (i = 0; i < NCAPINTS; i++) + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + + /* OR, i.e. replicate the bug flags */ + for (i = NCAPINTS; i < NCAPINTS + NBUGINTS; i++) + c->x86_capability[i] |= boot_cpu_data.x86_capability[i]; + } + + /* Init Machine Check Exception if available. */ + mcheck_cpu_init(c); + + select_idle_routine(c); + +#ifdef CONFIG_NUMA + numa_add_cpu(smp_processor_id()); +#endif +} + +/* + * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions + * on 32-bit kernels: + */ +#ifdef CONFIG_X86_32 +void enable_sep_cpu(void) +{ + struct tss_struct *tss; + int cpu; + + if (!boot_cpu_has(X86_FEATURE_SEP)) + return; + + cpu = get_cpu(); + tss = &per_cpu(cpu_tss_rw, cpu); + + /* + * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- + * see the big comment in struct x86_hw_tss's definition. + */ + + tss->x86_tss.ss1 = __KERNEL_CS; + wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); + + put_cpu(); +} +#endif + +void __init identify_boot_cpu(void) +{ + identify_cpu(&boot_cpu_data); +#ifdef CONFIG_X86_32 + sysenter_setup(); + enable_sep_cpu(); +#endif + cpu_detect_tlb(&boot_cpu_data); + setup_cr_pinning(); + + tsx_init(); +} + +void identify_secondary_cpu(struct cpuinfo_x86 *c) +{ + BUG_ON(c == &boot_cpu_data); + identify_cpu(c); +#ifdef CONFIG_X86_32 + enable_sep_cpu(); +#endif + mtrr_ap_init(); + validate_apic_and_package_id(c); + x86_spec_ctrl_setup_ap(); + update_srbds_msr(); + if (boot_cpu_has_bug(X86_BUG_GDS)) + update_gds_msr(); +} + +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); + setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT); + return 1; +} +__setup("noclflush", setup_noclflush); + +void print_cpu_info(struct cpuinfo_x86 *c) +{ + const char *vendor = NULL; + + if (c->x86_vendor < X86_VENDOR_NUM) { + vendor = this_cpu->c_vendor; + } else { + if (c->cpuid_level >= 0) + vendor = c->x86_vendor_id; + } + + if (vendor && !strstr(c->x86_model_id, vendor)) + pr_cont("%s ", vendor); + + if (c->x86_model_id[0]) + pr_cont("%s", c->x86_model_id); + else + pr_cont("%d86", c->x86); + + pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model); + + if (c->x86_stepping || c->cpuid_level >= 0) + pr_cont(", stepping: 0x%x)\n", c->x86_stepping); + else + pr_cont(")\n"); +} + +/* + * clearcpuid= was already parsed in fpu__init_parse_early_param. + * But we need to keep a dummy __setup around otherwise it would + * show up as an environment variable for init. + */ +static __init int setup_clearcpuid(char *arg) +{ + return 1; +} +__setup("clearcpuid=", setup_clearcpuid); + +#ifdef CONFIG_X86_64 +DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, + fixed_percpu_data) __aligned(PAGE_SIZE) __visible; +EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); + +/* + * The following percpu variables are hot. Align current_task to + * cacheline size such that they fall in the same cacheline. + */ +DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = + &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); +DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; + +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); + +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + +#ifdef CONFIG_IA32_EMULATION + wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); + /* + * This only works on Intel CPUs. + * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. + * This does not cause SYSENTER to jump to the wrong location, because + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). + */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); +#else + wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); +#endif + + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| + X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); +} + +#else /* CONFIG_X86_64 */ + +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); + +/* + * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find + * the top of the kernel stack. Use an extra percpu variable to track the + * top of the kernel stack directly. + */ +DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = + (unsigned long)&init_thread_union + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); + +#ifdef CONFIG_STACKPROTECTOR +DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); +#endif + +#endif /* CONFIG_X86_64 */ + +/* + * Clear all 6 debug registers: + */ +static void clear_all_debug_regs(void) +{ + int i; + + for (i = 0; i < 8; i++) { + /* Ignore db4, db5 */ + if ((i == 4) || (i == 5)) + continue; + + set_debugreg(0, i); + } +} + +#ifdef CONFIG_KGDB +/* + * Restore debug regs if using kgdbwait and you have a kernel debugger + * connection established. + */ +static void dbg_restore_debug_regs(void) +{ + if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break)) + arch_kgdb_ops.correct_hw_break(); +} +#else /* ! CONFIG_KGDB */ +#define dbg_restore_debug_regs() +#endif /* ! CONFIG_KGDB */ + +static void wait_for_master_cpu(int cpu) +{ +#ifdef CONFIG_SMP + /* + * wait for ACK from master CPU before continuing + * with AP initialization + */ + WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)); + while (!cpumask_test_cpu(cpu, cpu_callout_mask)) + cpu_relax(); +#endif +} + +#ifdef CONFIG_X86_64 +static inline void setup_getcpu(int cpu) +{ + unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); + struct desc_struct d = { }; + + if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID)) + write_rdtscp_aux(cpudata); + + /* Store CPU and node number in limit. */ + d.limit0 = cpudata; + d.limit1 = cpudata >> 16; + + d.type = 5; /* RO data, expand down, accessed */ + d.dpl = 3; /* Visible to user code */ + d.s = 1; /* Not a system segment */ + d.p = 1; /* Present */ + d.d = 1; /* 32-bit */ + + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S); +} + +static inline void ucode_cpu_init(int cpu) +{ + if (cpu) + load_ucode_ap(); +} + +static inline void tss_setup_ist(struct tss_struct *tss) +{ + /* Set up the per-CPU TSS IST stacks */ + tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); + tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); + tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); + tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); + /* Only mapped when SEV-ES is active */ + tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC); +} + +#else /* CONFIG_X86_64 */ + +static inline void setup_getcpu(int cpu) { } + +static inline void ucode_cpu_init(int cpu) +{ + show_ucode_info_early(); +} + +static inline void tss_setup_ist(struct tss_struct *tss) { } + +#endif /* !CONFIG_X86_64 */ + +static inline void tss_setup_io_bitmap(struct tss_struct *tss) +{ + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; + +#ifdef CONFIG_X86_IOPL_IOPERM + tss->io_bitmap.prev_max = 0; + tss->io_bitmap.prev_sequence = 0; + memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap)); + /* + * Invalidate the extra array entry past the end of the all + * permission bitmap as required by the hardware. + */ + tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL; +#endif +} + +/* + * Setup everything needed to handle exceptions from the IDT, including the IST + * exceptions which use paranoid_entry(). + */ +void cpu_init_exception_handling(void) +{ + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + int cpu = raw_smp_processor_id(); + + /* paranoid_entry() gets the CPU number from the GDT */ + setup_getcpu(cpu); + + /* IST vectors need TSS to be set up. */ + tss_setup_ist(tss); + tss_setup_io_bitmap(tss); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + + load_TR_desc(); + + /* Finally load the IDT */ + load_current_idt(); +} + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT. We + * reload it nevertheless, this function acts as a 'CPU state barrier', + * nothing should get across. + */ +void cpu_init(void) +{ + struct task_struct *cur = current; + int cpu = raw_smp_processor_id(); + + wait_for_master_cpu(cpu); + + ucode_cpu_init(cpu); + +#ifdef CONFIG_NUMA + if (this_cpu_read(numa_node) == 0 && + early_cpu_to_node(cpu) != NUMA_NO_NODE) + set_numa_node(early_cpu_to_node(cpu)); +#endif + pr_debug("Initializing CPU#%d\n", cpu); + + if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) || + boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + switch_to_new_gdt(cpu); + + if (IS_ENABLED(CONFIG_X86_64)) { + loadsegment(fs, 0); + memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + x2apic_setup(); + } + + mmgrab(&init_mm); + cur->active_mm = &init_mm; + BUG_ON(cur->mm); + initialize_tlbstate_and_flush(); + enter_lazy_tlb(&init_mm, cur); + + /* + * sp0 points to the entry trampoline stack regardless of what task + * is running. + */ + load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); + + load_mm_ldt(&init_mm); + + clear_all_debug_regs(); + dbg_restore_debug_regs(); + + doublefault_init_cpu_tss(); + + if (is_uv_system()) + uv_cpu_init(); + + load_fixmap_gdt(cpu); +} + +#ifdef CONFIG_SMP +void cpu_init_secondary(void) +{ + /* + * Relies on the BP having set-up the IDT tables, which are loaded + * on this CPU in cpu_init_exception_handling(). + */ + cpu_init_exception_handling(); + cpu_init(); + fpu__init_cpu(); +} +#endif + +#ifdef CONFIG_MICROCODE_LATE_LOADING +/** + * store_cpu_caps() - Store a snapshot of CPU capabilities + * @curr_info: Pointer where to store it + * + * Returns: None + */ +void store_cpu_caps(struct cpuinfo_x86 *curr_info) +{ + /* Reload CPUID max function as it might've changed. */ + curr_info->cpuid_level = cpuid_eax(0); + + /* Copy all capability leafs and pick up the synthetic ones. */ + memcpy(&curr_info->x86_capability, &boot_cpu_data.x86_capability, + sizeof(curr_info->x86_capability)); + + /* Get the hardware CPUID leafs */ + get_cpu_cap(curr_info); +} + +/** + * microcode_check() - Check if any CPU capabilities changed after an update. + * @prev_info: CPU capabilities stored before an update. + * + * The microcode loader calls this upon late microcode load to recheck features, + * only when microcode has been updated. Caller holds microcode_mutex and CPU + * hotplug lock. + * + * Return: None + */ +void microcode_check(struct cpuinfo_x86 *prev_info) +{ + struct cpuinfo_x86 curr_info; + + perf_check_microcode(); + + amd_check_microcode(); + + store_cpu_caps(&curr_info); + + if (!memcmp(&prev_info->x86_capability, &curr_info.x86_capability, + sizeof(prev_info->x86_capability))) + return; + + pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n"); + pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); +} +#endif + +/* + * Invoked from core CPU hotplug code after hotplug operations + */ +void arch_smt_update(void) +{ + /* Handle the speculative execution misfeatures */ + cpu_bugs_smt_update(); + /* Check whether IPI broadcasting can be enabled */ + apic_smt_update(); +} + +void __init arch_cpu_finalize_init(void) +{ + identify_boot_cpu(); + + /* + * identify_boot_cpu() initialized SMT support information, let the + * core code know. + */ + cpu_smt_check_topology(); + + if (!IS_ENABLED(CONFIG_SMP)) { + pr_info("CPU: "); + print_cpu_info(&boot_cpu_data); + } + + cpu_select_mitigations(); + + arch_smt_update(); + + if (IS_ENABLED(CONFIG_X86_32)) { + /* + * Check whether this is a real i386 which is not longer + * supported and fixup the utsname. + */ + if (boot_cpu_data.x86 < 4) + panic("Kernel requires i486+ for 'invlpg' and other features"); + + init_utsname()->machine[1] = + '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); + } + + /* + * Must be before alternatives because it might set or clear + * feature bits. + */ + fpu__init_system(); + fpu__init_cpu(); + + alternative_instructions(); + + if (IS_ENABLED(CONFIG_X86_64)) { + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); + } else { + fpu__init_check_bugs(); + } + + /* + * This needs to be called before any devices perform DMA + * operations that might use the SWIOTLB bounce buffers. It will + * mark the bounce buffers as decrypted so that their usage will + * not cause "plain-text" data to be decrypted when accessed. It + * must be called after late_time_init() so that Hyper-V x86/x64 + * hypercalls work when the SWIOTLB bounce buffers are decrypted. + */ + mem_encrypt_init(); +} diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h new file mode 100644 index 000000000..66e19380a --- /dev/null +++ b/arch/x86/kernel/cpu/cpu.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_X86_CPU_H +#define ARCH_X86_CPU_H + +/* attempt to consolidate cpu attributes */ +struct cpu_dev { + const char *c_vendor; + + /* some have two possibilities for cpuid string */ + const char *c_ident[2]; + + void (*c_early_init)(struct cpuinfo_x86 *); + void (*c_bsp_init)(struct cpuinfo_x86 *); + void (*c_init)(struct cpuinfo_x86 *); + void (*c_identify)(struct cpuinfo_x86 *); + void (*c_detect_tlb)(struct cpuinfo_x86 *); + int c_x86_vendor; +#ifdef CONFIG_X86_32 + /* Optional vendor specific routine to obtain the cache size. */ + unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *, + unsigned int); + + /* Family/stepping-based lookup table for model names. */ + struct legacy_cpu_model_info { + int family; + const char *model_names[16]; + } legacy_models[5]; +#endif +}; + +struct _tlb_table { + unsigned char descriptor; + char tlb_type; + unsigned int entries; + /* unsigned int ways; */ + char info[128]; +}; + +#define cpu_dev_register(cpu_devX) \ + static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ + __section(".x86_cpu_dev.init") = \ + &cpu_devX; + +extern const struct cpu_dev *const __x86_cpu_dev_start[], + *const __x86_cpu_dev_end[]; + +#ifdef CONFIG_CPU_SUP_INTEL +enum tsx_ctrl_states { + TSX_CTRL_ENABLE, + TSX_CTRL_DISABLE, + TSX_CTRL_NOT_SUPPORTED, +}; + +extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; + +extern void __init tsx_init(void); +extern void tsx_enable(void); +extern void tsx_disable(void); +#else +static inline void tsx_init(void) { } +#endif /* CONFIG_CPU_SUP_INTEL */ + +extern void init_spectral_chicken(struct cpuinfo_x86 *c); + +extern void get_cpu_cap(struct cpuinfo_x86 *c); +extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); +extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); +extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c); + +extern void detect_num_cpu_cores(struct cpuinfo_x86 *c); +extern int detect_extended_topology_early(struct cpuinfo_x86 *c); +extern int detect_extended_topology(struct cpuinfo_x86 *c); +extern int detect_ht_early(struct cpuinfo_x86 *c); +extern void detect_ht(struct cpuinfo_x86 *c); +extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); + +unsigned int aperfmperf_get_khz(int cpu); +void cpu_select_mitigations(void); + +extern void x86_spec_ctrl_setup_ap(void); +extern void update_srbds_msr(void); +extern void update_gds_msr(void); + +extern u64 x86_read_arch_cap_msr(void); + +#endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c new file mode 100644 index 000000000..d50224199 --- /dev/null +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -0,0 +1,133 @@ +/* Declare dependencies between CPUIDs */ +#include +#include +#include +#include + +struct cpuid_dep { + unsigned int feature; + unsigned int depends; +}; + +/* + * Table of CPUID features that depend on others. + * + * This only includes dependencies that can be usefully disabled, not + * features part of the base set (like FPU). + * + * Note this all is not __init / __initdata because it can be + * called from cpu hotplug. It shouldn't do anything in this case, + * but it's difficult to tell that to the init reference checker. + */ +static const struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_FXSR, X86_FEATURE_FPU }, + { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, + { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, + { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, + { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, + { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, + { X86_FEATURE_MMX, X86_FEATURE_FXSR }, + { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, + { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XSAVE, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM2, X86_FEATURE_XMM }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, + { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, + { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, + { X86_FEATURE_AES, X86_FEATURE_XMM2 }, + { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, + { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VP2INTERSECT, X86_FEATURE_AVX512VL }, + { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, + { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, + {} +}; + +static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) +{ + /* + * Note: This could use the non atomic __*_bit() variants, but the + * rest of the cpufeature code uses atomics as well, so keep it for + * consistency. Cleanup all of it separately. + */ + if (!c) { + clear_cpu_cap(&boot_cpu_data, feature); + set_bit(feature, (unsigned long *)cpu_caps_cleared); + } else { + clear_bit(feature, (unsigned long *)c->x86_capability); + } +} + +/* Take the capabilities and the BUG bits into account */ +#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8) + +static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + DECLARE_BITMAP(disable, MAX_FEATURE_BITS); + const struct cpuid_dep *d; + bool changed; + + if (WARN_ON(feature >= MAX_FEATURE_BITS)) + return; + + clear_feature(c, feature); + + /* Collect all features to disable, handling dependencies */ + memset(disable, 0, sizeof(disable)); + __set_bit(feature, disable); + + /* Loop until we get a stable state. */ + do { + changed = false; + for (d = cpuid_deps; d->feature; d++) { + if (!test_bit(d->depends, disable)) + continue; + if (__test_and_set_bit(d->feature, disable)) + continue; + + changed = true; + clear_feature(c, d->feature); + } + } while (changed); +} + +void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + do_clear_cpu_cap(c, feature); +} + +void setup_clear_cpu_cap(unsigned int feature) +{ + do_clear_cpu_cap(NULL, feature); +} diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c new file mode 100644 index 000000000..1d9b8aaea --- /dev/null +++ b/arch/x86/kernel/cpu/cyrix.c @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpu.h" + +/* + * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU + */ +static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned char ccr2, ccr3; + + /* we test for DEVID by checking whether CCR3 is writable */ + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, ccr3 ^ 0x80); + getCx86(0xc0); /* dummy to change bus */ + + if (getCx86(CX86_CCR3) == ccr3) { /* no DEVID regs. */ + ccr2 = getCx86(CX86_CCR2); + setCx86(CX86_CCR2, ccr2 ^ 0x04); + getCx86(0xc0); /* dummy */ + + if (getCx86(CX86_CCR2) == ccr2) /* old Cx486SLC/DLC */ + *dir0 = 0xfd; + else { /* Cx486S A step */ + setCx86(CX86_CCR2, ccr2); + *dir0 = 0xfe; + } + } else { + setCx86(CX86_CCR3, ccr3); /* restore CCR3 */ + + /* read DIR0 and DIR1 CPU registers */ + *dir0 = getCx86(CX86_DIR0); + *dir1 = getCx86(CX86_DIR1); + } +} + +static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned long flags; + + local_irq_save(flags); + __do_cyrix_devid(dir0, dir1); + local_irq_restore(flags); +} +/* + * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in + * order to identify the Cyrix CPU model after we're out of setup.c + * + * Actually since bugs.h doesn't even reference this perhaps someone should + * fix the documentation ??? + */ +static unsigned char Cx86_dir0_msb = 0; + +static const char Cx86_model[][9] = { + "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", + "M II ", "Unknown" +}; +static const char Cx486_name[][5] = { + "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", + "SRx2", "DRx2" +}; +static const char Cx486S_name[][4] = { + "S", "S2", "Se", "S2e" +}; +static const char Cx486D_name[][4] = { + "DX", "DX2", "?", "?", "?", "DX4" +}; +static char Cx86_cb[] = "?.5x Core/Bus Clock"; +static const char cyrix_model_mult1[] = "12??43"; +static const char cyrix_model_mult2[] = "12233445"; + +/* + * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old + * BIOSes for compatibility with DOS games. This makes the udelay loop + * work correctly, and improves performance. + * + * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP + */ + +static void check_cx686_slop(struct cpuinfo_x86 *c) +{ + unsigned long flags; + + if (Cx86_dir0_msb == 3) { + unsigned char ccr3, ccr5; + + local_irq_save(flags); + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + ccr5 = getCx86(CX86_CCR5); + if (ccr5 & 2) + setCx86(CX86_CCR5, ccr5 & 0xfd); /* reset SLOP */ + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + local_irq_restore(flags); + + if (ccr5 & 2) { /* possible wrong calibration done */ + pr_info("Recalibrating delay loop with SLOP bit reset\n"); + calibrate_delay(); + c->loops_per_jiffy = loops_per_jiffy; + } + } +} + + +static void set_cx86_reorder(void) +{ + u8 ccr3; + + pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n"); + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + + /* Load/Store Serialize to mem access disable (=reorder it) */ + setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); + /* set load/store serialize from 1GB to 4GB */ + ccr3 |= 0xe0; + setCx86(CX86_CCR3, ccr3); +} + +static void set_cx86_memwb(void) +{ + pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); + + /* CCR2 bit 2: unlock NW bit */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); + /* set 'Not Write-through' */ + write_cr0(read_cr0() | X86_CR0_NW); + /* CCR2 bit 2: lock NW bit and set WT1 */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); +} + +/* + * Configure later MediaGX and/or Geode processor. + */ + +static void geode_configure(void) +{ + unsigned long flags; + u8 ccr3; + local_irq_save(flags); + + /* Suspend on halt power saving and enable #SUSP pin */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); + + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + + + /* FPU fast, DTE cache, Mem bypass */ + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + + set_cx86_memwb(); + set_cx86_reorder(); + + local_irq_restore(flags); +} + +static void early_init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir1 = 0; + + __do_cyrix_devid(&dir0, &dir1); + dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + + switch (dir0_msn) { + case 3: /* 6x86/6x86L */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + case 5: /* 6x86MX/M II */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + } +} + +static void init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; + char *buf = c->x86_model_id; + const char *p = NULL; + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); + + /* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */ + if (test_cpu_cap(c, 1*32+24)) { + clear_cpu_cap(c, 1*32+24); + set_cpu_cap(c, X86_FEATURE_CXMMX); + } + + do_cyrix_devid(&dir0, &dir1); + + check_cx686_slop(c); + + Cx86_dir0_msb = dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + dir0_lsn = dir0 & 0xf; /* model or clock multiplier */ + + /* common case step number/rev -- exceptions handled below */ + c->x86_model = (dir1 >> 4) + 1; + c->x86_stepping = dir1 & 0xf; + + /* Now cook; the original recipe is by Channing Corn, from Cyrix. + * We do the same thing for each generation: we work out + * the model, multiplier and stepping. Black magic included, + * to make the silicon step/rev numbers match the printed ones. + */ + + switch (dir0_msn) { + unsigned char tmp; + + case 0: /* Cx486SLC/DLC/SRx/DRx */ + p = Cx486_name[dir0_lsn & 7]; + break; + + case 1: /* Cx486S/DX/DX2/DX4 */ + p = (dir0_lsn & 8) ? Cx486D_name[dir0_lsn & 5] + : Cx486S_name[dir0_lsn & 3]; + break; + + case 2: /* 5x86 */ + Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5]; + p = Cx86_cb+2; + break; + + case 3: /* 6x86/6x86L */ + Cx86_cb[1] = ' '; + Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5]; + if (dir1 > 0x21) { /* 686L */ + Cx86_cb[0] = 'L'; + p = Cx86_cb; + (c->x86_model)++; + } else /* 686 */ + p = Cx86_cb+1; + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + /* 6x86's contain this bug */ + set_cpu_bug(c, X86_BUG_COMA); + break; + + case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */ + case 11: /* GX1 with inverted Device ID */ +#ifdef CONFIG_PCI + { + u32 vendor, device; + /* + * It isn't really a PCI quirk directly, but the cure is the + * same. The MediaGX has deep magic SMM stuff that handles the + * SB emulation. It throws away the fifo on disable_dma() which + * is wrong and ruins the audio. + * + * Bug2: VSA1 has a wrap bug so that using maximum sized DMA + * causes bad things. According to NatSemi VSA2 has another + * bug to do with 'hlt'. I've not seen any boards using VSA2 + * and X doesn't seem to support it either so who cares 8). + * VSA1 we work around however. + */ + + pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n"); + isa_dma_bridge_buggy = 2; + + /* We do this before the PCI layer is running. However we + are safe here as we know the bridge must be a Cyrix + companion and must be present */ + vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID); + device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID); + + /* + * The 5510/5520 companion chips have a funky PIT. + */ + if (vendor == PCI_VENDOR_ID_CYRIX && + (device == PCI_DEVICE_ID_CYRIX_5510 || + device == PCI_DEVICE_ID_CYRIX_5520)) + mark_tsc_unstable("cyrix 5510/5520 detected"); + } +#endif + c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */ + + /* GXm supports extended cpuid levels 'ala' AMD */ + if (c->cpuid_level == 2) { + /* Enable cxMMX extensions (GX1 Datasheet 54) */ + setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); + + /* + * GXm : 0x30 ... 0x5f GXm datasheet 51 + * GXlv: 0x6x GXlv datasheet 54 + * ? : 0x7x + * GX1 : 0x8x GX1 datasheet 56 + */ + if ((0x30 <= dir1 && dir1 <= 0x6f) || + (0x80 <= dir1 && dir1 <= 0x8f)) + geode_configure(); + return; + } else { /* MediaGX */ + Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; + p = Cx86_cb+2; + c->x86_model = (dir1 & 0x20) ? 1 : 2; + } + break; + + case 5: /* 6x86MX/M II */ + if (dir1 > 7) { + dir0_msn++; /* M II */ + /* Enable MMX extensions (App note 108) */ + setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); + } else { + /* A 6x86MX - it has the bug. */ + set_cpu_bug(c, X86_BUG_COMA); + } + tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0; + Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7]; + p = Cx86_cb+tmp; + if (((dir1 & 0x0f) > 4) || ((dir1 & 0xf0) == 0x20)) + (c->x86_model)++; + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + + case 0xf: /* Cyrix 486 without DEVID registers */ + switch (dir0_lsn) { + case 0xd: /* either a 486SLC or DLC w/o DEVID */ + dir0_msn = 0; + p = Cx486_name[!!boot_cpu_has(X86_FEATURE_FPU)]; + break; + + case 0xe: /* a 486S A step */ + dir0_msn = 0; + p = Cx486S_name[0]; + break; + } + break; + + default: /* unknown (shouldn't happen, we know everyone ;-) */ + dir0_msn = 7; + break; + } + strcpy(buf, Cx86_model[dir0_msn & 7]); + if (p) + strcat(buf, p); + return; +} + +/* + * Handle National Semiconductor branded processors + */ +static void init_nsc(struct cpuinfo_x86 *c) +{ + /* + * There may be GX1 processors in the wild that are branded + * NSC and not Cyrix. + * + * This function only handles the GX processor, and kicks every + * thing else to the Cyrix init function above - that should + * cover any processors that might have been branded differently + * after NSC acquired Cyrix. + * + * If this breaks your GX1 horribly, please e-mail + * info-linux@ldcmail.amd.com to tell us. + */ + + /* Handle the GX (Formally known as the GX2) */ + + if (c->x86 == 5 && c->x86_model == 5) + cpu_detect_cache_sizes(c); + else + init_cyrix(c); +} + +/* + * Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected + * by the fact that they preserve the flags across the division of 5/2. + * PII and PPro exhibit this behavior too, but they have cpuid available. + */ + +/* + * Perform the Cyrix 5/2 test. A Cyrix won't change + * the flags, while other 486 chips will. + */ +static inline int test_cyrix_52div(void) +{ + unsigned int test; + + __asm__ __volatile__( + "sahf\n\t" /* clear flags (%eax = 0x0005) */ + "div %b2\n\t" /* divide 5 by 2 */ + "lahf" /* store flags into %ah */ + : "=a" (test) + : "0" (5), "q" (2) + : "cc"); + + /* AH is 0x02 on Cyrix after the divide.. */ + return (unsigned char) (test >> 8) == 0x02; +} + +static void cyrix_identify(struct cpuinfo_x86 *c) +{ + /* Detect Cyrix with disabled CPUID */ + if (c->x86 == 4 && test_cyrix_52div()) { + unsigned char dir0, dir1; + + strcpy(c->x86_vendor_id, "CyrixInstead"); + c->x86_vendor = X86_VENDOR_CYRIX; + + /* Actually enable cpuid on the older cyrix */ + + /* Retrieve CPU revisions */ + + do_cyrix_devid(&dir0, &dir1); + + dir0 >>= 4; + + /* Check it is an affected model */ + + if (dir0 == 5 || dir0 == 3) { + unsigned char ccr3; + unsigned long flags; + pr_info("Enabling CPUID on Cyrix processor.\n"); + local_irq_save(flags); + ccr3 = getCx86(CX86_CCR3); + /* enable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); + /* enable cpuid */ + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); + /* disable MAPEN */ + setCx86(CX86_CCR3, ccr3); + local_irq_restore(flags); + } + } +} + +static const struct cpu_dev cyrix_cpu_dev = { + .c_vendor = "Cyrix", + .c_ident = { "CyrixInstead" }, + .c_early_init = early_init_cyrix, + .c_init = init_cyrix, + .c_identify = cyrix_identify, + .c_x86_vendor = X86_VENDOR_CYRIX, +}; + +cpu_dev_register(cyrix_cpu_dev); + +static const struct cpu_dev nsc_cpu_dev = { + .c_vendor = "NSC", + .c_ident = { "Geode by NSC" }, + .c_init = init_nsc, + .c_x86_vendor = X86_VENDOR_NSC, +}; + +cpu_dev_register(nsc_cpu_dev); diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c new file mode 100644 index 000000000..d75418512 --- /dev/null +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "x86/cpu: " fmt + +#ifdef CONFIG_X86_VMX_FEATURE_NAMES +enum vmx_feature_leafs { + MISC_FEATURES = 0, + PRIMARY_CTLS, + SECONDARY_CTLS, + NR_VMX_FEATURE_WORDS, +}; + +#define VMX_F(x) BIT(VMX_FEATURE_##x & 0x1f) + +static void init_vmx_capabilities(struct cpuinfo_x86 *c) +{ + u32 supported, funcs, ept, vpid, ign; + + BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS); + + /* + * The high bits contain the allowed-1 settings, i.e. features that can + * be turned on. The low bits contain the allowed-0 settings, i.e. + * features that can be turned off. Ignore the allowed-0 settings, + * if a feature can be turned on then it's supported. + * + * Use raw rdmsr() for primary processor controls and pin controls MSRs + * as they exist on any CPU that supports VMX, i.e. we want the WARN if + * the RDMSR faults. + */ + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, ign, supported); + c->vmx_capability[PRIMARY_CTLS] = supported; + + rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported); + c->vmx_capability[SECONDARY_CTLS] = supported; + + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported); + rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs); + + /* + * Except for EPT+VPID, which enumerates support for both in a single + * MSR, low for EPT, high for VPID. + */ + rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &ept, &vpid); + + /* Pin, EPT, VPID and VM-Func are merged into a single word. */ + WARN_ON_ONCE(supported >> 16); + WARN_ON_ONCE(funcs >> 4); + c->vmx_capability[MISC_FEATURES] = (supported & 0xffff) | + ((vpid & 0x1) << 16) | + ((funcs & 0xf) << 28); + + /* EPT bits are full on scattered and must be manually handled. */ + if (ept & VMX_EPT_EXECUTE_ONLY_BIT) + c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_EXECUTE_ONLY); + if (ept & VMX_EPT_AD_BIT) + c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD); + if (ept & VMX_EPT_1GB_PAGE_BIT) + c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB); + + /* Synthetic APIC features that are aggregates of multiple features. */ + if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) && + (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_APIC_ACCESSES))) + c->vmx_capability[MISC_FEATURES] |= VMX_F(FLEXPRIORITY); + + if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) && + (c->vmx_capability[SECONDARY_CTLS] & VMX_F(APIC_REGISTER_VIRT)) && + (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_INTR_DELIVERY)) && + (c->vmx_capability[MISC_FEATURES] & VMX_F(POSTED_INTR))) + c->vmx_capability[MISC_FEATURES] |= VMX_F(APICV); + + /* Set the synthetic cpufeatures to preserve /proc/cpuinfo's ABI. */ + if (c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) + set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + if (c->vmx_capability[MISC_FEATURES] & VMX_F(FLEXPRIORITY)) + set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + if (c->vmx_capability[MISC_FEATURES] & VMX_F(VIRTUAL_NMIS)) + set_cpu_cap(c, X86_FEATURE_VNMI); + if (c->vmx_capability[SECONDARY_CTLS] & VMX_F(EPT)) + set_cpu_cap(c, X86_FEATURE_EPT); + if (c->vmx_capability[MISC_FEATURES] & VMX_F(EPT_AD)) + set_cpu_cap(c, X86_FEATURE_EPT_AD); + if (c->vmx_capability[MISC_FEATURES] & VMX_F(VPID)) + set_cpu_cap(c, X86_FEATURE_VPID); +} +#endif /* CONFIG_X86_VMX_FEATURE_NAMES */ + +void init_ia32_feat_ctl(struct cpuinfo_x86 *c) +{ + bool tboot = tboot_enabled(); + u64 msr; + + if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { + clear_cpu_cap(c, X86_FEATURE_VMX); + return; + } + + if (msr & FEAT_CTL_LOCKED) + goto update_caps; + + /* + * Ignore whatever value BIOS left in the MSR to avoid enabling random + * features or faulting on the WRMSR. + */ + msr = FEAT_CTL_LOCKED; + + /* + * Enable VMX if and only if the kernel may do VMXON at some point, + * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector + * for the kernel, e.g. using VMX to hide malicious code. + */ + if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) { + msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; + + if (tboot) + msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; + } + + wrmsrl(MSR_IA32_FEAT_CTL, msr); + +update_caps: + set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL); + + if (!cpu_has(c, X86_FEATURE_VMX)) + return; + + if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) || + (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) { + if (IS_ENABLED(CONFIG_KVM_INTEL)) + pr_err_once("VMX (%s TXT) disabled by BIOS\n", + tboot ? "inside" : "outside"); + clear_cpu_cap(c, X86_FEATURE_VMX); + } else { +#ifdef CONFIG_X86_VMX_FEATURE_NAMES + init_vmx_capabilities(c); +#endif + } +} diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c new file mode 100644 index 000000000..3f5c00b15 --- /dev/null +++ b/arch/x86/kernel/cpu/hygon.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Hygon Processor Support for Linux + * + * Copyright (C) 2018 Chengdu Haiguang IC Design Co., Ltd. + * + * Author: Pu Wen + */ +#include + +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86_64 +# include +#endif + +#include "cpu.h" + +#define APICID_SOCKET_ID_BIT 6 + +/* + * nodes_per_socket: Stores the number of nodes per socket. + * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8] + */ +static u32 nodes_per_socket = 1; + +#ifdef CONFIG_NUMA +/* + * To workaround broken NUMA config. Read the comment in + * srat_detect_node(). + */ +static int nearby_node(int apicid) +{ + int i, node; + + for (i = apicid - 1; i >= 0; i--) { + node = __apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { + node = __apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + return first_node(node_online_map); /* Shouldn't happen */ +} +#endif + +static void hygon_get_topology_early(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_TOPOEXT)) + smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; +} + +/* + * Fixup core topology information for + * (1) Hygon multi-node processors + * Assumption: Number of cores in each internal node is the same. + * (2) Hygon processors supporting compute units + */ +static void hygon_get_topology(struct cpuinfo_x86 *c) +{ + int cpu = smp_processor_id(); + + /* get information required for multi-node processors */ + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + int err; + u32 eax, ebx, ecx, edx; + + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + + c->cpu_die_id = ecx & 0xff; + + c->cpu_core_id = ebx & 0xff; + + if (smp_num_siblings > 1) + c->x86_max_cores /= smp_num_siblings; + + /* + * In case leaf B is available, use it to derive + * topology information. + */ + err = detect_extended_topology(c); + if (!err) + c->x86_coreid_bits = get_count_order(c->x86_max_cores); + + /* + * Socket ID is ApicId[6] for the processors with model <= 0x3 + * when running on host. + */ + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3) + c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; + + cacheinfo_hygon_init_llc_id(c, cpu); + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + c->cpu_die_id = value & 7; + + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + } else + return; + + if (nodes_per_socket > 1) + set_cpu_cap(c, X86_FEATURE_AMD_DCM); +} + +/* + * On Hygon setup the lower bits of the APIC id distinguish the cores. + * Assumes number of cores is a power of two. + */ +static void hygon_detect_cmp(struct cpuinfo_x86 *c) +{ + unsigned int bits; + int cpu = smp_processor_id(); + + bits = c->x86_coreid_bits; + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + /* Convert the initial APIC ID into the socket ID */ + c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; +} + +static void srat_detect_node(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_NUMA + int cpu = smp_processor_id(); + int node; + unsigned int apicid = c->apicid; + + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE) + node = per_cpu(cpu_llc_id, cpu); + + /* + * On multi-fabric platform (e.g. Numascale NumaChip) a + * platform-specific handler needs to be called to fixup some + * IDs of the CPU. + */ + if (x86_cpuinit.fixup_cpu_id) + x86_cpuinit.fixup_cpu_id(c, node); + + if (!node_online(node)) { + /* + * Two possibilities here: + * + * - The CPU is missing memory and no node was created. In + * that case try picking one from a nearby CPU. + * + * - The APIC IDs differ from the HyperTransport node IDs. + * Assume they are all increased by a constant offset, but + * in the same order as the HT nodeids. If that doesn't + * result in a usable node fall back to the path for the + * previous case. + * + * This workaround operates directly on the mapping between + * APIC ID and NUMA node, assuming certain relationship + * between APIC ID, HT node ID and NUMA topology. As going + * through CPU mapping may alter the outcome, directly + * access __apicid_to_node[]. + */ + int ht_nodeid = c->initial_apicid; + + if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = __apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); +#endif +} + +static void early_init_hygon_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; + + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } + + c->x86_coreid_bits = bits; +#endif +} + +static void bsp_init_hygon(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + unsigned long long tseg; + + /* + * Split up direct mapping around the TSEG SMM area. + * Don't do it for gbpages because there seems very little + * benefit in doing so. + */ + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + unsigned long pfn = tseg >> PAGE_SHIFT; + + pr_debug("tseg: %010llx\n", tseg); + if (pfn_range_is_mapped(pfn, pfn + 1)) + set_memory_4k((unsigned long)__va(tseg), 1); + } +#endif + + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); + } + + if (cpu_has(c, X86_FEATURE_MWAITX)) + use_mwaitx_delay(); + + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + u32 ecx; + + ecx = cpuid_ecx(0x8000001e); + nodes_per_socket = ((ecx >> 8) & 7) + 1; + } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + nodes_per_socket = ((value >> 3) & 7) + 1; + } + + if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && + !boot_cpu_has(X86_FEATURE_VIRT_SSBD)) { + /* + * Try to cache the base value so further operations can + * avoid RMW. If that faults, do not enable SSBD. + */ + if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); + setup_force_cpu_cap(X86_FEATURE_SSBD); + x86_amd_ls_cfg_ssbd_mask = 1ULL << 10; + } + } +} + +static void early_init_hygon(struct cpuinfo_x86 *c) +{ + u32 dummy; + + early_init_hygon_mc(c); + + set_cpu_cap(c, X86_FEATURE_K8); + + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + + /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ + if (c->x86_power & BIT(12)) + set_cpu_cap(c, X86_FEATURE_ACC_POWER); + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSCALL32); +#endif + +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) + /* + * ApicID can always be treated as an 8-bit value for Hygon APIC So, we + * can safely set X86_FEATURE_EXTD_APICID unconditionally. + */ + if (boot_cpu_has(X86_FEATURE_APIC)) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); +#endif + + /* + * This is only needed to tell the kernel whether to use VMCALL + * and VMMCALL. VMMCALL is never executed except under virt, so + * we can set it unconditionally. + */ + set_cpu_cap(c, X86_FEATURE_VMMCALL); + + hygon_get_topology_early(c); +} + +static void init_hygon(struct cpuinfo_x86 *c) +{ + early_init_hygon(c); + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); + + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* get apicid instead of initial apic id from cpuid */ + c->apicid = hard_smp_processor_id(); + + /* + * XXX someone from Hygon needs to confirm this DTRT + * + init_spectral_chicken(c); + */ + + set_cpu_cap(c, X86_FEATURE_ZEN); + set_cpu_cap(c, X86_FEATURE_CPB); + + cpu_detect_cache_sizes(c); + + hygon_detect_cmp(c); + hygon_get_topology(c); + srat_detect_node(c); + + init_hygon_cacheinfo(c); + + if (cpu_has(c, X86_FEATURE_XMM2)) { + /* + * Use LFENCE for execution serialization. On families which + * don't have that MSR, LFENCE is already serializing. + * msr_set_bit() uses the safe accessors, too, even if the MSR + * is not present. + */ + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); + + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + } + + /* + * Hygon processors have APIC timer running in deep C states. + */ + set_cpu_cap(c, X86_FEATURE_ARAT); + + /* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */ + if (!cpu_has(c, X86_FEATURE_XENPV)) + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + + check_null_seg_clears_base(c); +} + +static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) +{ + u32 ebx, eax, ecx, edx; + u16 mask = 0xfff; + + if (c->extended_cpuid_level < 0x80000006) + return; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; + tlb_lli_4k[ENTRIES] = ebx & mask; + + /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!((eax >> 16) & mask)) + tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + else + tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + + /* a 4M entry uses two 2M entries */ + tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + + /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!(eax & mask)) { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + tlb_lli_2m[ENTRIES] = eax & 0xff; + } else + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; +} + +static const struct cpu_dev hygon_cpu_dev = { + .c_vendor = "Hygon", + .c_ident = { "HygonGenuine" }, + .c_early_init = early_init_hygon, + .c_detect_tlb = cpu_detect_tlb_hygon, + .c_bsp_init = bsp_init_hygon, + .c_init = init_hygon, + .c_x86_vendor = X86_VENDOR_HYGON, +}; + +cpu_dev_register(hygon_cpu_dev); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c new file mode 100644 index 000000000..553bfbfc3 --- /dev/null +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -0,0 +1,109 @@ +/* + * Common hypervisor code + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include +#include +#include + +static const __initconst struct hypervisor_x86 * const hypervisors[] = +{ +#ifdef CONFIG_XEN_PV + &x86_hyper_xen_pv, +#endif +#ifdef CONFIG_XEN_PVHVM + &x86_hyper_xen_hvm, +#endif + &x86_hyper_vmware, + &x86_hyper_ms_hyperv, +#ifdef CONFIG_KVM_GUEST + &x86_hyper_kvm, +#endif +#ifdef CONFIG_JAILHOUSE_GUEST + &x86_hyper_jailhouse, +#endif +#ifdef CONFIG_ACRN_GUEST + &x86_hyper_acrn, +#endif +}; + +enum x86_hypervisor_type x86_hyper_type; +EXPORT_SYMBOL(x86_hyper_type); + +bool __initdata nopv; +static __init int parse_nopv(char *arg) +{ + nopv = true; + return 0; +} +early_param("nopv", parse_nopv); + +static inline const struct hypervisor_x86 * __init +detect_hypervisor_vendor(void) +{ + const struct hypervisor_x86 *h = NULL, * const *p; + uint32_t pri, max_pri = 0; + + for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { + if (unlikely(nopv) && !(*p)->ignore_nopv) + continue; + + pri = (*p)->detect(); + if (pri > max_pri) { + max_pri = pri; + h = *p; + } + } + + if (h) + pr_info("Hypervisor detected: %s\n", h->name); + + return h; +} + +static void __init copy_array(const void *src, void *target, unsigned int size) +{ + unsigned int i, n = size / sizeof(void *); + const void * const *from = (const void * const *)src; + const void **to = (const void **)target; + + for (i = 0; i < n; i++) + if (from[i]) + to[i] = from[i]; +} + +void __init init_hypervisor_platform(void) +{ + const struct hypervisor_x86 *h; + + h = detect_hypervisor_vendor(); + + if (!h) + return; + + copy_array(&h->init, &x86_init.hyper, sizeof(h->init)); + copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime)); + + x86_hyper_type = h->type; + x86_init.hyper.init_platform(); +} diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c new file mode 100644 index 000000000..c6ad53e38 --- /dev/null +++ b/arch/x86/kernel/cpu/intel.c @@ -0,0 +1,1194 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +#include +#endif + +#include "cpu.h" + +#ifdef CONFIG_X86_LOCAL_APIC +#include +#include +#endif + +enum split_lock_detect_state { + sld_off = 0, + sld_warn, + sld_fatal, +}; + +/* + * Default to sld_off because most systems do not support split lock detection + * split_lock_setup() will switch this to sld_warn on systems that support + * split lock detect, unless there is a command line override. + */ +static enum split_lock_detect_state sld_state __ro_after_init = sld_off; +static u64 msr_test_ctrl_cache __ro_after_init; + +/* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. + */ +static bool cpu_model_supports_sld __ro_after_init; + +/* + * Processors which have self-snooping capability can handle conflicting + * memory type across CPUs by snooping its own cache. However, there exists + * CPU models in which having conflicting memory types still leads to + * unpredictable behavior, machine check errors, or hangs. Clear this + * feature to prevent its use on machines with known erratas. + */ +static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c) +{ + switch (c->x86_model) { + case INTEL_FAM6_CORE_YONAH: + case INTEL_FAM6_CORE2_MEROM: + case INTEL_FAM6_CORE2_MEROM_L: + case INTEL_FAM6_CORE2_PENRYN: + case INTEL_FAM6_CORE2_DUNNINGTON: + case INTEL_FAM6_NEHALEM: + case INTEL_FAM6_NEHALEM_G: + case INTEL_FAM6_NEHALEM_EP: + case INTEL_FAM6_NEHALEM_EX: + case INTEL_FAM6_WESTMERE: + case INTEL_FAM6_WESTMERE_EP: + case INTEL_FAM6_SANDYBRIDGE: + setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP); + } +} + +static bool ring3mwait_disabled __read_mostly; + +static int __init ring3mwait_disable(char *__unused) +{ + ring3mwait_disabled = true; + return 1; +} +__setup("ring3mwait=disable", ring3mwait_disable); + +static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) +{ + /* + * Ring 3 MONITOR/MWAIT feature cannot be detected without + * cpu model and family comparison. + */ + if (c->x86 != 6) + return; + switch (c->x86_model) { + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: + break; + default: + return; + } + + if (ring3mwait_disabled) + return; + + set_cpu_cap(c, X86_FEATURE_RING3MWAIT); + this_cpu_or(msr_misc_features_shadow, + 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT); + + if (c == &boot_cpu_data) + ELF_HWCAP2 |= HWCAP2_RING3MWAIT; +} + +/* + * Early microcode releases for the Spectre v2 mitigation were broken. + * Information taken from; + * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/03/microcode-update-guidance.pdf + * - https://kb.vmware.com/s/article/52345 + * - Microcode revisions observed in the wild + * - Release note from 20180108 microcode release + */ +struct sku_microcode { + u8 model; + u8 stepping; + u32 microcode; +}; +static const struct sku_microcode spectre_bad_microcodes[] = { + { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 }, + { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE, 0x09, 0x80 }, + { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 }, + { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, + { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, + { INTEL_FAM6_BROADWELL, 0x04, 0x28 }, + { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b }, + { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 }, + { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 }, + { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, + { INTEL_FAM6_HASWELL_L, 0x01, 0x21 }, + { INTEL_FAM6_HASWELL_G, 0x01, 0x18 }, + { INTEL_FAM6_HASWELL, 0x03, 0x23 }, + { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, + { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, + { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, + /* Observed in the wild */ + { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b }, + { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 }, +}; + +static bool bad_spectre_microcode(struct cpuinfo_x86 *c) +{ + int i; + + /* + * We know that the hypervisor lie to us on the microcode version so + * we may as well hope that it is running the correct version. + */ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) + return false; + + if (c->x86 != 6) + return false; + + for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { + if (c->x86_model == spectre_bad_microcodes[i].model && + c->x86_stepping == spectre_bad_microcodes[i].stepping) + return (c->microcode <= spectre_bad_microcodes[i].microcode); + } + return false; +} + +static void early_init_intel(struct cpuinfo_x86 *c) +{ + u64 misc_enable; + + /* Unmask CPUID levels if masked: */ + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) { + c->cpuid_level = cpuid_eax(0); + get_cpu_cap(c); + } + } + + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) + c->microcode = intel_get_microcode_revision(); + + /* Now if any of them are set, check the blacklist and clear the lot */ + if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || + cpu_has(c, X86_FEATURE_INTEL_STIBP) || + cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) || + cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) { + pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n"); + setup_clear_cpu_cap(X86_FEATURE_IBRS); + setup_clear_cpu_cap(X86_FEATURE_IBPB); + setup_clear_cpu_cap(X86_FEATURE_STIBP); + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL); + setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL); + setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP); + setup_clear_cpu_cap(X86_FEATURE_SSBD); + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD); + } + + /* + * Atom erratum AAE44/AAF40/AAG38/AAH41: + * + * A race condition between speculative fetches and invalidating + * a large page. This is worked around in microcode, but we + * need the microcode to have already been loaded... so if it is + * not, recommend a BIOS update and disable large pages. + */ + if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 && + c->microcode < 0x20e) { + pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n"); + clear_cpu_cap(c, X86_FEATURE_PSE); + } + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#else + /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ + if (c->x86 == 15 && c->x86_cache_alignment == 64) + c->x86_cache_alignment = 128; +#endif + + /* CPUID workaround for 0F33/0F34 CPU */ + if (c->x86 == 0xF && c->x86_model == 0x3 + && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4)) + c->x86_phys_bits = 36; + + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states. + * + * It is also reliable across cores and sockets. (but not across + * cabinets - we turn it off in that case explicitly.) + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + + /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ + if (c->x86 == 6) { + switch (c->x86_model) { + case INTEL_FAM6_ATOM_SALTWELL_MID: + case INTEL_FAM6_ATOM_SALTWELL_TABLET: + case INTEL_FAM6_ATOM_SILVERMONT_MID: + case INTEL_FAM6_ATOM_AIRMONT_NP: + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); + break; + default: + break; + } + } + + /* + * There is a known erratum on Pentium III and Core Solo + * and Core Duo CPUs. + * " Page with PAT set to WC while associated MTRR is UC + * may consolidate to UC " + * Because of this erratum, it is better to stick with + * setting WC in MTRR rather than using PAT on these CPUs. + * + * Enable PAT WC only on P4, Core 2 or later CPUs. + */ + if (c->x86 == 6 && c->x86_model < 15) + clear_cpu_cap(c, X86_FEATURE_PAT); + + /* + * If fast string is not enabled in IA32_MISC_ENABLE for any reason, + * clear the fast string and enhanced fast string CPU capabilities. + */ + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { + rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { + pr_info("Disabled fast string operations\n"); + setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); + setup_clear_cpu_cap(X86_FEATURE_ERMS); + } + } + + /* + * Intel Quark Core DevMan_001.pdf section 6.4.11 + * "The operating system also is required to invalidate (i.e., flush) + * the TLB when any changes are made to any of the page table entries. + * The operating system must reload CR3 to cause the TLB to be flushed" + * + * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h + * should be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE + * to be modified. + */ + if (c->x86 == 5 && c->x86_model == 9) { + pr_info("Disabling PGE capability bit\n"); + setup_clear_cpu_cap(X86_FEATURE_PGE); + } + + if (c->cpuid_level >= 0x00000001) { + u32 eax, ebx, ecx, edx; + + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + /* + * If HTT (EDX[28]) is set EBX[16:23] contain the number of + * apicids which are reserved per package. Store the resulting + * shift value for the package management code. + */ + if (edx & (1U << 28)) + c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); + } + + check_memory_type_self_snoop_errata(c); + + /* + * Get the number of SMT siblings early from the extended topology + * leaf, if available. Otherwise try the legacy SMT detection. + */ + if (detect_extended_topology_early(c) < 0) + detect_ht_early(c); +} + +static void bsp_init_intel(struct cpuinfo_x86 *c) +{ + resctrl_cpu_detect(c); +} + +#ifdef CONFIG_X86_32 +/* + * Early probe support logic for ppro memory erratum #50 + * + * This is called before we do cpu ident work + */ + +int ppro_with_ram_bug(void) +{ + /* Uses data from early_cpu_detect now */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == 1 && + boot_cpu_data.x86_stepping < 8) { + pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n"); + return 1; + } + return 0; +} + +static void intel_smp_check(struct cpuinfo_x86 *c) +{ + /* calling is from identify_secondary_cpu() ? */ + if (!c->cpu_index) + return; + + /* + * Mask B, Pentium, but not Pentium MMX + */ + if (c->x86 == 5 && + c->x86_stepping >= 1 && c->x86_stepping <= 4 && + c->x86_model <= 3) { + /* + * Remember we have B step Pentia with bugs + */ + WARN_ONCE(1, "WARNING: SMP operation may be unreliable" + "with B stepping processors.\n"); + } +} + +static int forcepae; +static int __init forcepae_setup(char *__unused) +{ + forcepae = 1; + return 1; +} +__setup("forcepae", forcepae_setup); + +static void intel_workarounds(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_F00F_BUG + /* + * All models of Pentium and Pentium with MMX technology CPUs + * have the F0 0F bug, which lets nonprivileged users lock up the + * system. Announce that the fault handler will be checking for it. + * The Quark is also family 5, but does not have the same bug. + */ + clear_cpu_bug(c, X86_BUG_F00F); + if (c->x86 == 5 && c->x86_model < 9) { + static int f00f_workaround_enabled; + + set_cpu_bug(c, X86_BUG_F00F); + if (!f00f_workaround_enabled) { + pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n"); + f00f_workaround_enabled = 1; + } + } +#endif + + /* + * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until + * model 3 mask 3 + */ + if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633) + clear_cpu_cap(c, X86_FEATURE_SEP); + + /* + * PAE CPUID issue: many Pentium M report no PAE but may have a + * functionally usable PAE implementation. + * Forcefully enable PAE if kernel parameter "forcepae" is present. + */ + if (forcepae) { + pr_warn("PAE forced!\n"); + set_cpu_cap(c, X86_FEATURE_PAE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); + } + + /* + * P4 Xeon erratum 037 workaround. + * Hardware prefetcher may cause stale data to be loaded into the cache. + */ + if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) { + if (msr_set_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) { + pr_info("CPU: C0 stepping P4 Xeon detected.\n"); + pr_info("CPU: Disabling hardware prefetching (Erratum 037)\n"); + } + } + + /* + * See if we have a good local APIC by checking for buggy Pentia, + * i.e. all B steppings and the C2 stepping of P54C when using their + * integrated APIC (see 11AP erratum in "Pentium Processor + * Specification Update"). + */ + if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 && + (c->x86_stepping < 0x6 || c->x86_stepping == 0xb)) + set_cpu_bug(c, X86_BUG_11AP); + + +#ifdef CONFIG_X86_INTEL_USERCOPY + /* + * Set up the preferred alignment for movsl bulk memory moves + */ + switch (c->x86) { + case 4: /* 486: untested */ + break; + case 5: /* Old Pentia: untested */ + break; + case 6: /* PII/PIII only like movsl with 8-byte alignment */ + movsl_mask.mask = 7; + break; + case 15: /* P4 is OK down to 8-byte alignment */ + movsl_mask.mask = 7; + break; + } +#endif + + intel_smp_check(c); +} +#else +static void intel_workarounds(struct cpuinfo_x86 *c) +{ +} +#endif + +static void srat_detect_node(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_NUMA + unsigned node; + int cpu = smp_processor_id(); + + /* Don't do the funky fallback heuristics the AMD version employs + for now. */ + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE || !node_online(node)) { + /* reuse the value from init_cpu_to_node() */ + node = cpu_to_node(cpu); + } + numa_set_node(cpu, node); +#endif +} + +#define MSR_IA32_TME_ACTIVATE 0x982 + +/* Helpers to access TME_ACTIVATE MSR */ +#define TME_ACTIVATE_LOCKED(x) (x & 0x1) +#define TME_ACTIVATE_ENABLED(x) (x & 0x2) + +#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ +#define TME_ACTIVATE_POLICY_AES_XTS_128 0 + +#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ + +#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ +#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 + +/* Values for mktme_status (SW only construct) */ +#define MKTME_ENABLED 0 +#define MKTME_DISABLED 1 +#define MKTME_UNINITIALIZED 2 +static int mktme_status = MKTME_UNINITIALIZED; + +static void detect_tme(struct cpuinfo_x86 *c) +{ + u64 tme_activate, tme_policy, tme_crypto_algs; + int keyid_bits = 0, nr_keyids = 0; + static u64 tme_activate_cpu0 = 0; + + rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + + if (mktme_status != MKTME_UNINITIALIZED) { + if (tme_activate != tme_activate_cpu0) { + /* Broken BIOS? */ + pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); + pr_err_once("x86/tme: MKTME is not usable\n"); + mktme_status = MKTME_DISABLED; + + /* Proceed. We may need to exclude bits from x86_phys_bits. */ + } + } else { + tme_activate_cpu0 = tme_activate; + } + + if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { + pr_info_once("x86/tme: not enabled by BIOS\n"); + mktme_status = MKTME_DISABLED; + return; + } + + if (mktme_status != MKTME_UNINITIALIZED) + goto detect_keyid_bits; + + pr_info("x86/tme: enabled by BIOS\n"); + + tme_policy = TME_ACTIVATE_POLICY(tme_activate); + if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) + pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); + + tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); + if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { + pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", + tme_crypto_algs); + mktme_status = MKTME_DISABLED; + } +detect_keyid_bits: + keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); + nr_keyids = (1UL << keyid_bits) - 1; + if (nr_keyids) { + pr_info_once("x86/mktme: enabled by BIOS\n"); + pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); + } else { + pr_info_once("x86/mktme: disabled by BIOS\n"); + } + + if (mktme_status == MKTME_UNINITIALIZED) { + /* MKTME is usable */ + mktme_status = MKTME_ENABLED; + } + + /* + * KeyID bits effectively lower the number of physical address + * bits. Update cpuinfo_x86::x86_phys_bits accordingly. + */ + c->x86_phys_bits -= keyid_bits; +} + +static void init_cpuid_fault(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) { + if (msr & MSR_PLATFORM_INFO_CPUID_FAULT) + set_cpu_cap(c, X86_FEATURE_CPUID_FAULT); + } +} + +static void init_intel_misc_features(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) + return; + + /* Clear all MISC features */ + this_cpu_write(msr_misc_features_shadow, 0); + + /* Check features and update capabilities and shadow control bits */ + init_cpuid_fault(c); + probe_xeon_phi_r3mwait(c); + + msr = this_cpu_read(msr_misc_features_shadow); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); +} + +static void split_lock_init(void); + +static void init_intel(struct cpuinfo_x86 *c) +{ + early_init_intel(c); + + intel_workarounds(c); + + /* + * Detect the extended topology information if available. This + * will reinitialise the initial_apicid which will be used + * in init_intel_cacheinfo() + */ + detect_extended_topology(c); + + if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { + /* + * let's use the legacy cpuid vector 0x1 and 0x4 for topology + * detection. + */ + detect_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + } + + init_intel_cacheinfo(c); + + if (c->cpuid_level > 9) { + unsigned eax = cpuid_eax(10); + /* Check for version and the number of counters */ + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + + if (cpu_has(c, X86_FEATURE_XMM2)) + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + + if (boot_cpu_has(X86_FEATURE_DS)) { + unsigned int l1, l2; + + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<11))) + set_cpu_cap(c, X86_FEATURE_BTS); + if (!(l1 & (1<<12))) + set_cpu_cap(c, X86_FEATURE_PEBS); + } + + if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) && + (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) + set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); + + if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) && + ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT))) + set_cpu_bug(c, X86_BUG_MONITOR); + +#ifdef CONFIG_X86_64 + if (c->x86 == 15) + c->x86_cache_alignment = c->x86_clflush_size * 2; + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#else + /* + * Names for the Pentium II/Celeron processors + * detectable only by also checking the cache size. + * Dixon is NOT a Celeron. + */ + if (c->x86 == 6) { + unsigned int l2 = c->x86_cache_size; + char *p = NULL; + + switch (c->x86_model) { + case 5: + if (l2 == 0) + p = "Celeron (Covington)"; + else if (l2 == 256) + p = "Mobile Pentium II (Dixon)"; + break; + + case 6: + if (l2 == 128) + p = "Celeron (Mendocino)"; + else if (c->x86_stepping == 0 || c->x86_stepping == 5) + p = "Celeron-A"; + break; + + case 8: + if (l2 == 128) + p = "Celeron (Coppermine)"; + break; + } + + if (p) + strcpy(c->x86_model_id, p); + } + + if (c->x86 == 15) + set_cpu_cap(c, X86_FEATURE_P4); + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_P3); +#endif + + /* Work around errata */ + srat_detect_node(c); + + init_ia32_feat_ctl(c); + + if (cpu_has(c, X86_FEATURE_TME)) + detect_tme(c); + + init_intel_misc_features(c); + + if (tsx_ctrl_state == TSX_CTRL_ENABLE) + tsx_enable(); + if (tsx_ctrl_state == TSX_CTRL_DISABLE) + tsx_disable(); + + split_lock_init(); +} + +#ifdef CONFIG_X86_32 +static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + /* + * Intel PIII Tualatin. This comes in two flavours. + * One has 256kb of cache, the other 512. We have no way + * to determine which, so we use a boottime override + * for the 512kb model, and assume 256 otherwise. + */ + if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0)) + size = 256; + + /* + * Intel Quark SoC X1000 contains a 4-way set associative + * 16K cache with a 16 byte cache line and 256 lines per tag + */ + if ((c->x86 == 5) && (c->x86_model == 9)) + size = 16; + return size; +} +#endif + +#define TLB_INST_4K 0x01 +#define TLB_INST_4M 0x02 +#define TLB_INST_2M_4M 0x03 + +#define TLB_INST_ALL 0x05 +#define TLB_INST_1G 0x06 + +#define TLB_DATA_4K 0x11 +#define TLB_DATA_4M 0x12 +#define TLB_DATA_2M_4M 0x13 +#define TLB_DATA_4K_4M 0x14 + +#define TLB_DATA_1G 0x16 + +#define TLB_DATA0_4K 0x21 +#define TLB_DATA0_4M 0x22 +#define TLB_DATA0_2M_4M 0x23 + +#define STLB_4K 0x41 +#define STLB_4K_2M 0x42 + +static const struct _tlb_table intel_tlb_table[] = { + { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, + { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, + { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, + { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, + { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, + { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, + { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" }, + { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, + { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" }, + { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" }, + { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" }, + { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" }, + { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, + { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" }, + { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" }, + { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" }, + { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" }, + { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, + { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, + { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, + { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, + { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, + { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, + { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" }, + { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" }, + { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, + { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, + { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, + { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" }, + { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, + { 0x00, 0, 0 } +}; + +static void intel_tlb_lookup(const unsigned char desc) +{ + unsigned char k; + if (desc == 0) + return; + + /* look up this descriptor in the table */ + for (k = 0; intel_tlb_table[k].descriptor != desc && + intel_tlb_table[k].descriptor != 0; k++) + ; + + if (intel_tlb_table[k].tlb_type == 0) + return; + + switch (intel_tlb_table[k].tlb_type) { + case STLB_4K: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case STLB_4K_2M: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_ALL: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_4K: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_4M: + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_2M_4M: + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4K: + case TLB_DATA0_4K: + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4M: + case TLB_DATA0_4M: + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_2M_4M: + case TLB_DATA0_2M_4M: + if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4K_4M: + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_1G: + if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; + break; + } +} + +static void intel_detect_tlb(struct cpuinfo_x86 *c) +{ + int i, j, n; + unsigned int regs[4]; + unsigned char *desc = (unsigned char *)regs; + + if (c->cpuid_level < 2) + return; + + /* Number of times to iterate */ + n = cpuid_eax(2) & 0xFF; + + for (i = 0 ; i < n ; i++) { + cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); + + /* If bit 31 is set, this is an unknown format */ + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; + + /* Byte 0 is level count, not a descriptor */ + for (j = 1 ; j < 16 ; j++) + intel_tlb_lookup(desc[j]); + } +} + +static const struct cpu_dev intel_cpu_dev = { + .c_vendor = "Intel", + .c_ident = { "GenuineIntel" }, +#ifdef CONFIG_X86_32 + .legacy_models = { + { .family = 4, .model_names = + { + [0] = "486 DX-25/33", + [1] = "486 DX-50", + [2] = "486 SX", + [3] = "486 DX/2", + [4] = "486 SL", + [5] = "486 SX/2", + [7] = "486 DX/2-WB", + [8] = "486 DX/4", + [9] = "486 DX/4-WB" + } + }, + { .family = 5, .model_names = + { + [0] = "Pentium 60/66 A-step", + [1] = "Pentium 60/66", + [2] = "Pentium 75 - 200", + [3] = "OverDrive PODP5V83", + [4] = "Pentium MMX", + [7] = "Mobile Pentium 75 - 200", + [8] = "Mobile Pentium MMX", + [9] = "Quark SoC X1000", + } + }, + { .family = 6, .model_names = + { + [0] = "Pentium Pro A-step", + [1] = "Pentium Pro", + [3] = "Pentium II (Klamath)", + [4] = "Pentium II (Deschutes)", + [5] = "Pentium II (Deschutes)", + [6] = "Mobile Pentium II", + [7] = "Pentium III (Katmai)", + [8] = "Pentium III (Coppermine)", + [10] = "Pentium III (Cascades)", + [11] = "Pentium III (Tualatin)", + } + }, + { .family = 15, .model_names = + { + [0] = "Pentium 4 (Unknown)", + [1] = "Pentium 4 (Willamette)", + [2] = "Pentium 4 (Northwood)", + [4] = "Pentium 4 (Foster)", + [5] = "Pentium 4 (Foster)", + } + }, + }, + .legacy_cache_size = intel_size_cache, +#endif + .c_detect_tlb = intel_detect_tlb, + .c_early_init = early_init_intel, + .c_bsp_init = bsp_init_intel, + .c_init = init_intel, + .c_x86_vendor = X86_VENDOR_INTEL, +}; + +cpu_dev_register(intel_cpu_dev); + +#undef pr_fmt +#define pr_fmt(fmt) "x86/split lock detection: " fmt + +static const struct { + const char *option; + enum split_lock_detect_state state; +} sld_options[] __initconst = { + { "off", sld_off }, + { "warn", sld_warn }, + { "fatal", sld_fatal }, +}; + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt); + + return len == arglen && !strncmp(arg, opt, len); +} + +static bool split_lock_verify_msr(bool on) +{ + u64 ctrl, tmp; + + if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + return false; + if (on) + ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + else + ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + return false; + rdmsrl(MSR_TEST_CTRL, tmp); + return ctrl == tmp; +} + +static void __init split_lock_setup(void) +{ + enum split_lock_detect_state state = sld_warn; + char arg[20]; + int i, ret; + + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + ret = cmdline_find_option(boot_command_line, "split_lock_detect", + arg, sizeof(arg)); + if (ret >= 0) { + for (i = 0; i < ARRAY_SIZE(sld_options); i++) { + if (match_option(arg, ret, sld_options[i].option)) { + state = sld_options[i].state; + break; + } + } + } + + switch (state) { + case sld_off: + pr_info("disabled\n"); + return; + case sld_warn: + pr_info("warning about user-space split_locks\n"); + break; + case sld_fatal: + pr_info("sending SIGBUS on user-space split_locks\n"); + break; + } + + rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + if (!split_lock_verify_msr(true)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + sld_state = state; + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); +} + +/* + * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking + * is not implemented as one thread could undo the setting of the other + * thread immediately after dropping the lock anyway. + */ +static void sld_update_msr(bool on) +{ + u64 test_ctrl_val = msr_test_ctrl_cache; + + if (on) + test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + + wrmsrl(MSR_TEST_CTRL, test_ctrl_val); +} + +static void split_lock_init(void) +{ + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); +} + +static void split_lock_warn(unsigned long ip) +{ + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + + /* + * Disable the split lock detection for this task so it can make + * progress and set TIF_SLD so the detection is re-enabled via + * switch_to_sld() when the task is scheduled out. + */ + sld_update_msr(false); + set_tsk_thread_flag(current, TIF_SLD); +} + +bool handle_guest_split_lock(unsigned long ip) +{ + if (sld_state == sld_warn) { + split_lock_warn(ip); + return true; + } + + pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", + current->comm, current->pid, + sld_state == sld_fatal ? "fatal" : "bogus", ip); + + current->thread.error_code = 0; + current->thread.trap_nr = X86_TRAP_AC; + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + return false; +} +EXPORT_SYMBOL_GPL(handle_guest_split_lock); + +bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) + return false; + split_lock_warn(regs->ip); + return true; +} + +/* + * This function is called only when switching between tasks with + * different split-lock detection modes. It sets the MSR for the + * mode of the new task. This is right most of the time, but since + * the MSR is shared by hyperthreads on a physical core there can + * be glitches when the two threads need different modes. + */ +void switch_to_sld(unsigned long tifn) +{ + sld_update_msr(!(tifn & _TIF_SLD)); +} + +/* + * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should + * only be trusted if it is confirmed that a CPU model implements a + * specific feature at a particular bit position. + * + * The possible driver data field values: + * + * - 0: CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. + * + * - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use + * bit 5 to enumerate the per-core split-lock detection feature. + */ +static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1), + {} +}; + +void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) +{ + const struct x86_cpu_id *m; + u64 ia32_core_caps; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + m = x86_match_cpu(split_lock_cpu_ids); + if (!m) + return; + + switch (m->driver_data) { + case 0: + break; + case 1: + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) + return; + rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + if (!(ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)) + return; + break; + default: + return; + } + + cpu_model_supports_sld = true; + split_lock_setup(); +} diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c new file mode 100644 index 000000000..f4dd73396 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Performance and Energy Bias Hint support. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Rafael J. Wysocki + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/** + * DOC: overview + * + * The Performance and Energy Bias Hint (EPB) allows software to specify its + * preference with respect to the power-performance tradeoffs present in the + * processor. Generally, the EPB is expected to be set by user space (directly + * via sysfs or with the help of the x86_energy_perf_policy tool), but there are + * two reasons for the kernel to update it. + * + * First, there are systems where the platform firmware resets the EPB during + * system-wide transitions from sleep states back into the working state + * effectively causing the previous EPB updates by user space to be lost. + * Thus the kernel needs to save the current EPB values for all CPUs during + * system-wide transitions to sleep states and restore them on the way back to + * the working state. That can be achieved by saving EPB for secondary CPUs + * when they are taken offline during transitions into system sleep states and + * for the boot CPU in a syscore suspend operation, so that it can be restored + * for the boot CPU in a syscore resume operation and for the other CPUs when + * they are brought back online. However, CPUs that are already offline when + * a system-wide PM transition is started are not taken offline again, but their + * EPB values may still be reset by the platform firmware during the transition, + * so in fact it is necessary to save the EPB of any CPU taken offline and to + * restore it when the given CPU goes back online at all times. + * + * Second, on many systems the initial EPB value coming from the platform + * firmware is 0 ('performance') and at least on some of them that is because + * the platform firmware does not initialize EPB at all with the assumption that + * the OS will do that anyway. That sometimes is problematic, as it may cause + * the system battery to drain too fast, for example, so it is better to adjust + * it on CPU bring-up and if the initial EPB value for a given CPU is 0, the + * kernel changes it to 6 ('normal'). + */ + +static DEFINE_PER_CPU(u8, saved_epb); + +#define EPB_MASK 0x0fULL +#define EPB_SAVED 0x10ULL +#define MAX_EPB EPB_MASK + +static int intel_epb_save(void) +{ + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + /* + * Ensure that saved_epb will always be nonzero after this write even if + * the EPB value read from the MSR is 0. + */ + this_cpu_write(saved_epb, (epb & EPB_MASK) | EPB_SAVED); + + return 0; +} + +static void intel_epb_restore(void) +{ + u64 val = this_cpu_read(saved_epb); + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if (val) { + val &= EPB_MASK; + } else { + /* + * Because intel_epb_save() has not run for the current CPU yet, + * it is going online for the first time, so if its EPB value is + * 0 ('performance') at this point, assume that it has not been + * initialized by the platform firmware and set it to 6 + * ('normal'). + */ + val = epb & EPB_MASK; + if (val == ENERGY_PERF_BIAS_PERFORMANCE) { + val = ENERGY_PERF_BIAS_NORMAL; + pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + } + } + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); +} + +static struct syscore_ops intel_epb_syscore_ops = { + .suspend = intel_epb_save, + .resume = intel_epb_restore, +}; + +static const char * const energy_perf_strings[] = { + "performance", + "balance-performance", + "normal", + "balance-power", + "power" +}; +static const u8 energ_perf_values[] = { + ENERGY_PERF_BIAS_PERFORMANCE, + ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, + ENERGY_PERF_BIAS_NORMAL, + ENERGY_PERF_BIAS_BALANCE_POWERSAVE, + ENERGY_PERF_BIAS_POWERSAVE +}; + +static ssize_t energy_perf_bias_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + unsigned int cpu = dev->id; + u64 epb; + int ret; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + return sprintf(buf, "%llu\n", epb); +} + +static ssize_t energy_perf_bias_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int cpu = dev->id; + u64 epb, val; + int ret; + + ret = __sysfs_match_string(energy_perf_strings, + ARRAY_SIZE(energy_perf_strings), buf); + if (ret >= 0) + val = energ_perf_values[ret]; + else if (kstrtou64(buf, 0, &val) || val > MAX_EPB) + return -EINVAL; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, + (epb & ~EPB_MASK) | val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR_RW(energy_perf_bias); + +static struct attribute *intel_epb_attrs[] = { + &dev_attr_energy_perf_bias.attr, + NULL +}; + +static const struct attribute_group intel_epb_attr_group = { + .name = power_group_name, + .attrs = intel_epb_attrs +}; + +static int intel_epb_online(unsigned int cpu) +{ + struct device *cpu_dev = get_cpu_device(cpu); + + intel_epb_restore(); + if (!cpuhp_tasks_frozen) + sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group); + + return 0; +} + +static int intel_epb_offline(unsigned int cpu) +{ + struct device *cpu_dev = get_cpu_device(cpu); + + if (!cpuhp_tasks_frozen) + sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group); + + intel_epb_save(); + return 0; +} + +static __init int intel_epb_init(void) +{ + int ret; + + if (!boot_cpu_has(X86_FEATURE_EPB)) + return -ENODEV; + + ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE, + "x86/intel/epb:online", intel_epb_online, + intel_epb_offline); + if (ret < 0) + goto err_out_online; + + register_syscore_ops(&intel_epb_syscore_ops); + return 0; + +err_out_online: + cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE); + return ret; +} +subsys_initcall(intel_epb_init); diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c new file mode 100644 index 000000000..0771a905b --- /dev/null +++ b/arch/x86/kernel/cpu/intel_pconfig.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel PCONFIG instruction support. + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Kirill A. Shutemov + */ + +#include +#include + +#define PCONFIG_CPUID 0x1b + +#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1) + +/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */ +enum { + PCONFIG_CPUID_SUBLEAF_INVALID = 0, + PCONFIG_CPUID_SUBLEAF_TARGETID = 1, +}; + +/* Bitmask of supported targets */ +static u64 targets_supported __read_mostly; + +int pconfig_target_supported(enum pconfig_target target) +{ + /* + * We would need to re-think the implementation once we get > 64 + * PCONFIG targets. Spec allows up to 2^32 targets. + */ + BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64); + + if (WARN_ON_ONCE(target >= 64)) + return 0; + return targets_supported & (1ULL << target); +} + +static int __init intel_pconfig_init(void) +{ + int subleaf; + + if (!boot_cpu_has(X86_FEATURE_PCONFIG)) + return 0; + + /* + * Scan subleafs of PCONFIG CPUID leaf. + * + * Subleafs of the same type need not to be consecutive. + * + * Stop on the first invalid subleaf type. All subleafs after the first + * invalid are invalid too. + */ + for (subleaf = 0; subleaf < INT_MAX; subleaf++) { + struct cpuid_regs regs; + + cpuid_count(PCONFIG_CPUID, subleaf, + ®s.eax, ®s.ebx, ®s.ecx, ®s.edx); + + switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) { + case PCONFIG_CPUID_SUBLEAF_INVALID: + /* Stop on the first invalid subleaf */ + goto out; + case PCONFIG_CPUID_SUBLEAF_TARGETID: + /* Mark supported PCONFIG targets */ + if (regs.ebx < 64) + targets_supported |= (1ULL << regs.ebx); + if (regs.ecx < 64) + targets_supported |= (1ULL << regs.ecx); + if (regs.edx < 64) + targets_supported |= (1ULL << regs.edx); + break; + default: + /* Unknown CPUID.PCONFIG subleaf: ignore */ + break; + } + } +out: + return 0; +} +arch_initcall(intel_pconfig_init); diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c new file mode 100644 index 000000000..ad6776081 --- /dev/null +++ b/arch/x86/kernel/cpu/match.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +/** + * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * @match: Pointer to array of x86_cpu_ids. Last entry terminated with + * {}. + * + * Return the entry if the current CPU matches the entries in the + * passed x86_cpu_id match table. Otherwise NULL. The match table + * contains vendor (X86_VENDOR_*), family, model and feature bits or + * respective wildcard entries. + * + * A typical table entry would be to match a specific CPU + * + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL, + * X86_FEATURE_ANY, NULL); + * + * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, + * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor) + * + * asm/cpu_device_id.h contains a set of useful macros which are shortcuts + * for various common selections. The above can be shortened to: + * + * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL); + * + * Arrays used to match for this should also be declared using + * MODULE_DEVICE_TABLE(x86cpu, ...) + * + * This always matches against the boot cpu, assuming models and features are + * consistent over all CPUs. + */ +const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) +{ + const struct x86_cpu_id *m; + struct cpuinfo_x86 *c = &boot_cpu_data; + + for (m = match; + m->vendor | m->family | m->model | m->steppings | m->feature; + m++) { + if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) + continue; + if (m->family != X86_FAMILY_ANY && c->x86 != m->family) + continue; + if (m->model != X86_MODEL_ANY && c->x86_model != m->model) + continue; + if (m->steppings != X86_STEPPING_ANY && + !(BIT(c->x86_stepping) & m->steppings)) + continue; + if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) + continue; + return m; + } + return NULL; +} +EXPORT_SYMBOL(x86_match_cpu); + +static const struct x86_cpu_desc * +x86_match_cpu_with_stepping(const struct x86_cpu_desc *match) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + const struct x86_cpu_desc *m; + + for (m = match; m->x86_family | m->x86_model; m++) { + if (c->x86_vendor != m->x86_vendor) + continue; + if (c->x86 != m->x86_family) + continue; + if (c->x86_model != m->x86_model) + continue; + if (c->x86_stepping != m->x86_stepping) + continue; + return m; + } + return NULL; +} + +bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table) +{ + const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table); + + if (!res || res->x86_microcode_rev > boot_cpu_data.microcode) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev); diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile new file mode 100644 index 000000000..9f020c994 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y = core.o severity.o genpool.o + +obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o +obj-$(CONFIG_X86_MCE_INTEL) += intel.o +obj-$(CONFIG_X86_MCE_AMD) += amd.o +obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o + +mce-inject-y := inject.o +obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o + +obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o + +obj-$(CONFIG_ACPI_APEI) += apei.o + +obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c new file mode 100644 index 000000000..cd8db6b9c --- /dev/null +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -0,0 +1,1541 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * (c) 2005-2016 Advanced Micro Devices, Inc. + * + * Written by Jacob Shin - AMD, Inc. + * Maintained by: Borislav Petkov + * + * All MC4_MISCi registers are shared between cores on a node. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define NR_BLOCKS 5 +#define THRESHOLD_MAX 0xFFF +#define INT_TYPE_APIC 0x00020000 +#define MASK_VALID_HI 0x80000000 +#define MASK_CNTP_HI 0x40000000 +#define MASK_LOCKED_HI 0x20000000 +#define MASK_LVTOFF_HI 0x00F00000 +#define MASK_COUNT_EN_HI 0x00080000 +#define MASK_INT_TYPE_HI 0x00060000 +#define MASK_OVERFLOW_HI 0x00010000 +#define MASK_ERR_COUNT_HI 0x00000FFF +#define MASK_BLKPTR_LO 0xFF000000 +#define MCG_XBLK_ADDR 0xC0000400 + +/* Deferred error settings */ +#define MSR_CU_DEF_ERR 0xC0000410 +#define MASK_DEF_LVTOFF 0x000000F0 +#define MASK_DEF_INT_TYPE 0x00000006 +#define DEF_LVT_OFF 0x2 +#define DEF_INT_TYPE_APIC 0x2 + +/* Scalable MCA: */ + +/* Threshold LVT offset is at MSR0xC0000410[15:12] */ +#define SMCA_THR_LVT_OFF 0xF000 + +static bool thresholding_irq_en; + +static const char * const th_names[] = { + "load_store", + "insn_fetch", + "combined_unit", + "decode_unit", + "northbridge", + "execution_unit", +}; + +static const char * const smca_umc_block_names[] = { + "dram_ecc", + "misc_umc" +}; + +struct smca_bank_name { + const char *name; /* Short name for sysfs */ + const char *long_name; /* Long name for pretty-printing */ +}; + +static struct smca_bank_name smca_names[] = { + [SMCA_LS] = { "load_store", "Load Store Unit" }, + [SMCA_LS_V2] = { "load_store", "Load Store Unit" }, + [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, + [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, + [SMCA_DE] = { "decode_unit", "Decode Unit" }, + [SMCA_RESERVED] = { "reserved", "Reserved" }, + [SMCA_EX] = { "execution_unit", "Execution Unit" }, + [SMCA_FP] = { "floating_point", "Floating Point Unit" }, + [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, + [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, + [SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, + [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, + [SMCA_UMC] = { "umc", "Unified Memory Controller" }, + [SMCA_PB] = { "param_block", "Parameter Block" }, + [SMCA_PSP] = { "psp", "Platform Security Processor" }, + [SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, + [SMCA_SMU] = { "smu", "System Management Unit" }, + [SMCA_SMU_V2] = { "smu", "System Management Unit" }, + [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, + [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, + [SMCA_PCIE] = { "pcie", "PCI Express Unit" }, +}; + +static const char *smca_get_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].name; +} + +const char *smca_get_long_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].long_name; +} +EXPORT_SYMBOL_GPL(smca_get_long_name); + +static enum smca_bank_types smca_get_bank_type(unsigned int bank) +{ + struct smca_bank *b; + + if (bank >= MAX_NR_BANKS) + return N_SMCA_BANK_TYPES; + + b = &smca_banks[bank]; + if (!b->hwid) + return N_SMCA_BANK_TYPES; + + return b->hwid->bank_type; +} + +static struct smca_hwid smca_hwid_mcatypes[] = { + /* { bank_type, hwid_mcatype } */ + + /* Reserved type */ + { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0) }, + + /* ZN Core (HWID=0xB0) MCA types */ + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) }, + { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) }, + { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) }, + { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) }, + { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) }, + /* HWID 0xB0 MCATYPE 0x4 is Reserved */ + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) }, + { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) }, + { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) }, + + /* Data Fabric MCA types */ + { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) }, + { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) }, + + /* Unified Memory Controller MCA type */ + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) }, + + /* Parameter Block MCA type */ + { SMCA_PB, HWID_MCATYPE(0x05, 0x0) }, + + /* Platform Security Processor MCA type */ + { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) }, + { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) }, + + /* System Management Unit MCA type */ + { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) }, + { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) }, + + /* Microprocessor 5 Unit MCA type */ + { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) }, + + /* Northbridge IO Unit MCA type */ + { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) }, + + /* PCI Express Unit MCA type */ + { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) }, +}; + +struct smca_bank smca_banks[MAX_NR_BANKS]; +EXPORT_SYMBOL_GPL(smca_banks); + +/* + * In SMCA enabled processors, we can have multiple banks for a given IP type. + * So to define a unique name for each bank, we use a temp c-string to append + * the MCA_IPID[InstanceId] to type's name in get_name(). + * + * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN + * is greater than 8 plus 1 (for underscore) plus length of longest type name. + */ +#define MAX_MCATYPE_NAME_LEN 30 +static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; + +static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); + +/* + * A list of the banks enabled on each logical CPU. Controls which respective + * descriptors to initialize later in mce_threshold_create_device(). + */ +static DEFINE_PER_CPU(u64, bank_map); + +/* Map of banks that have more than MCA_MISC0 available. */ +static DEFINE_PER_CPU(u64, smca_misc_banks_map); + +static void amd_threshold_interrupt(void); +static void amd_deferred_error_interrupt(void); + +static void default_deferred_error_interrupt(void) +{ + pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR); +} +void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; + +static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) +{ + u32 low, high; + + /* + * For SMCA enabled processors, BLKPTR field of the first MISC register + * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). + */ + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + return; + + if (!(low & MCI_CONFIG_MCAX)) + return; + + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high)) + return; + + if (low & MASK_BLKPTR_LO) + per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank); + +} + +static void smca_configure(unsigned int bank, unsigned int cpu) +{ + unsigned int i, hwid_mcatype; + struct smca_hwid *s_hwid; + u32 high, low; + u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + /* Set appropriate bits in MCA_CONFIG */ + if (!rdmsr_safe(smca_config, &low, &high)) { + /* + * OS is required to set the MCAX bit to acknowledge that it is + * now using the new MSR ranges and new registers under each + * bank. It also means that the OS will configure deferred + * errors in the new MCx_CONFIG register. If the bit is not set, + * uncorrectable errors will cause a system panic. + * + * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) + */ + high |= BIT(0); + + /* + * SMCA sets the Deferred Error Interrupt type per bank. + * + * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us + * if the DeferredIntType bit field is available. + * + * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the + * high portion of the MSR). OS should set this to 0x1 to enable + * APIC based interrupt. First, check that no interrupt has been + * set. + */ + if ((low & BIT(5)) && !((high >> 5) & 0x3)) + high |= BIT(5); + + wrmsr(smca_config, low, high); + } + + smca_set_misc_banks_map(bank, cpu); + + /* Return early if this bank was already initialized. */ + if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0) + return; + + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { + pr_warn("Failed to read MCA_IPID for bank %d\n", bank); + return; + } + + hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID, + (high & MCI_IPID_MCATYPE) >> 16); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { + s_hwid = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == s_hwid->hwid_mcatype) { + smca_banks[bank].hwid = s_hwid; + smca_banks[bank].id = low; + smca_banks[bank].sysfs_id = s_hwid->count++; + break; + } + } +} + +struct thresh_restart { + struct threshold_block *b; + int reset; + int set_lvt_off; + int lvt_off; + u16 old_limit; +}; + +static inline bool is_shared_bank(int bank) +{ + /* + * Scalable MCA provides for only one core to have access to the MSRs of + * a shared bank. + */ + if (mce_flags.smca) + return false; + + /* Bank 4 is for northbridge reporting and is thus shared */ + return (bank == 4); +} + +static const char *bank4_names(const struct threshold_block *b) +{ + switch (b->address) { + /* MSR4_MISC0 */ + case 0x00000413: + return "dram"; + + case 0xc0000408: + return "ht_links"; + + case 0xc0000409: + return "l3_cache"; + + default: + WARN(1, "Funny MSR: 0x%08x\n", b->address); + return ""; + } +}; + + +static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) +{ + /* + * bank 4 supports APIC LVT interrupts implicitly since forever. + */ + if (bank == 4) + return true; + + /* + * IntP: interrupt present; if this bit is set, the thresholding + * bank can generate APIC LVT interrupts + */ + return msr_high_bits & BIT(28); +} + +static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +{ + int msr = (hi & MASK_LVTOFF_HI) >> 20; + + if (apic < 0) { + pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, + b->bank, b->block, b->address, hi, lo); + return 0; + } + + if (apic != msr) { + /* + * On SMCA CPUs, LVT offset is programmed at a different MSR, and + * the BIOS provides the value. The original field where LVT offset + * was set is reserved. Return early here: + */ + if (mce_flags.smca) + return 0; + + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", + b->cpu, apic, b->bank, b->block, b->address, hi, lo); + return 0; + } + + return 1; +}; + +/* Reprogram MCx_MISC MSR behind this threshold bank. */ +static void threshold_restart_bank(void *_tr) +{ + struct thresh_restart *tr = _tr; + u32 hi, lo; + + /* sysfs write might race against an offline operation */ + if (!this_cpu_read(threshold_banks) && !tr->set_lvt_off) + return; + + rdmsr(tr->b->address, lo, hi); + + if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) + tr->reset = 1; /* limit cannot be lower than err count */ + + if (tr->reset) { /* reset err count and overflow bit */ + hi = + (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + (THRESHOLD_MAX - tr->b->threshold_limit); + } else if (tr->old_limit) { /* change limit w/o reset */ + int new_count = (hi & THRESHOLD_MAX) + + (tr->old_limit - tr->b->threshold_limit); + + hi = (hi & ~MASK_ERR_COUNT_HI) | + (new_count & THRESHOLD_MAX); + } + + /* clear IntType */ + hi &= ~MASK_INT_TYPE_HI; + + if (!tr->b->interrupt_capable) + goto done; + + if (tr->set_lvt_off) { + if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { + /* set new lvt offset */ + hi &= ~MASK_LVTOFF_HI; + hi |= tr->lvt_off << 20; + } + } + + if (tr->b->interrupt_enable) + hi |= INT_TYPE_APIC; + + done: + + hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, lo, hi); +} + +static void mce_threshold_block_init(struct threshold_block *b, int offset) +{ + struct thresh_restart tr = { + .b = b, + .set_lvt_off = 1, + .lvt_off = offset, + }; + + b->threshold_limit = THRESHOLD_MAX; + threshold_restart_bank(&tr); +}; + +static int setup_APIC_mce_threshold(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +static int setup_APIC_deferred_error(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) +{ + u32 low = 0, high = 0; + int def_offset = -1, def_new; + + if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) + return; + + def_new = (low & MASK_DEF_LVTOFF) >> 4; + if (!(low & MASK_DEF_LVTOFF)) { + pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); + def_new = DEF_LVT_OFF; + low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); + } + + def_offset = setup_APIC_deferred_error(def_offset, def_new); + if ((def_offset == def_new) && + (deferred_error_int_vector != amd_deferred_error_interrupt)) + deferred_error_int_vector = amd_deferred_error_interrupt; + + if (!mce_flags.smca) + low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + + wrmsr(MSR_CU_DEF_ERR, low, high); +} + +static u32 smca_get_block_address(unsigned int bank, unsigned int block, + unsigned int cpu) +{ + if (!block) + return MSR_AMD64_SMCA_MCx_MISC(bank); + + if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank))) + return 0; + + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); +} + +static u32 get_block_address(u32 current_addr, u32 low, u32 high, + unsigned int bank, unsigned int block, + unsigned int cpu) +{ + u32 addr = 0, offset = 0; + + if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) + return addr; + + if (mce_flags.smca) + return smca_get_block_address(bank, block, cpu); + + /* Fall back to method we used for older processors: */ + switch (block) { + case 0: + addr = mca_msr_reg(bank, MCA_MISC); + break; + case 1: + offset = ((low & MASK_BLKPTR_LO) >> 21); + if (offset) + addr = MCG_XBLK_ADDR + offset; + break; + default: + addr = ++current_addr; + } + return addr; +} + +static int +prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, + int offset, u32 misc_high) +{ + unsigned int cpu = smp_processor_id(); + u32 smca_low, smca_high; + struct threshold_block b; + int new; + + if (!block) + per_cpu(bank_map, cpu) |= BIT_ULL(bank); + + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = addr; + b.interrupt_capable = lvt_interrupt_supported(bank, misc_high); + + if (!b.interrupt_capable) + goto done; + + b.interrupt_enable = 1; + + if (!mce_flags.smca) { + new = (misc_high & MASK_LVTOFF_HI) >> 20; + goto set_offset; + } + + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + goto out; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + +set_offset: + offset = setup_APIC_mce_threshold(offset, new); + if (offset == new) + thresholding_irq_en = true; + +done: + mce_threshold_block_init(&b, offset); + +out: + return offset; +} + +bool amd_filter_mce(struct mce *m) +{ + enum smca_bank_types bank_type = smca_get_bank_type(m->bank); + struct cpuinfo_x86 *c = &boot_cpu_data; + + /* See Family 17h Models 10h-2Fh Erratum #1114. */ + if (c->x86 == 0x17 && + c->x86_model >= 0x10 && c->x86_model <= 0x2F && + bank_type == SMCA_IF && XEC(m->status, 0x3f) == 10) + return true; + + /* NB GART TLB error reporting is disabled by default. */ + if (c->x86 < 0x17) { + if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5) + return true; + } + + return false; +} + +/* + * Turn off thresholding banks for the following conditions: + * - MC4_MISC thresholding is not supported on Family 0x15. + * - Prevent possible spurious interrupts from the IF bank on Family 0x17 + * Models 0x10-0x2F due to Erratum #1114. + */ +static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) +{ + int i, num_msrs; + u64 hwcr; + bool need_toggle; + u32 msrs[NR_BLOCKS]; + + if (c->x86 == 0x15 && bank == 4) { + msrs[0] = 0x00000413; /* MC4_MISC0 */ + msrs[1] = 0xc0000408; /* MC4_MISC1 */ + num_msrs = 2; + } else if (c->x86 == 0x17 && + (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) { + + if (smca_get_bank_type(bank) != SMCA_IF) + return; + + msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank); + num_msrs = 1; + } else { + return; + } + + rdmsrl(MSR_K7_HWCR, hwcr); + + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + + /* Clear CntP bit safely */ + for (i = 0; i < num_msrs; i++) + msr_clear_bit(msrs[i], 62); + + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); +} + +/* cpu init entry point, called from mce.c with preempt off */ +void mce_amd_feature_init(struct cpuinfo_x86 *c) +{ + unsigned int bank, block, cpu = smp_processor_id(); + u32 low = 0, high = 0, address = 0; + int offset = -1; + + + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { + if (mce_flags.smca) + smca_configure(bank, cpu); + + disable_err_thresholding(c, bank); + + for (block = 0; block < NR_BLOCKS; ++block) { + address = get_block_address(address, low, high, bank, block, cpu); + if (!address) + break; + + if (rdmsr_safe(address, &low, &high)) + break; + + if (!(high & MASK_VALID_HI)) + continue; + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + continue; + + offset = prepare_threshold_block(bank, block, address, offset, high); + } + } + + if (mce_flags.succor) + deferred_error_interrupt_enable(c); +} + +int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) +{ + u64 dram_base_addr, dram_limit_addr, dram_hole_base; + /* We start from the normalized address */ + u64 ret_addr = norm_addr; + + u32 tmp; + + u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; + u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; + u8 intlv_addr_sel, intlv_addr_bit; + u8 num_intlv_bits, hashed_bit; + u8 lgcy_mmio_hole_en, base = 0; + u8 cs_mask, cs_id = 0; + bool hash_enabled = false; + + /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ + if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp)) + goto out_err; + + /* Remove HiAddrOffset from normalized address, if enabled: */ + if (tmp & BIT(0)) { + u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8; + + if (norm_addr >= hi_addr_offset) { + ret_addr -= hi_addr_offset; + base = 1; + } + } + + /* Read D18F0x110 (DramBaseAddress). */ + if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp)) + goto out_err; + + /* Check if address range is valid. */ + if (!(tmp & BIT(0))) { + pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", + __func__, tmp); + goto out_err; + } + + lgcy_mmio_hole_en = tmp & BIT(1); + intlv_num_chan = (tmp >> 4) & 0xF; + intlv_addr_sel = (tmp >> 8) & 0x7; + dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16; + + /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ + if (intlv_addr_sel > 3) { + pr_err("%s: Invalid interleave address select %d.\n", + __func__, intlv_addr_sel); + goto out_err; + } + + /* Read D18F0x114 (DramLimitAddress). */ + if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp)) + goto out_err; + + intlv_num_sockets = (tmp >> 8) & 0x1; + intlv_num_dies = (tmp >> 10) & 0x3; + dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); + + intlv_addr_bit = intlv_addr_sel + 8; + + /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ + switch (intlv_num_chan) { + case 0: intlv_num_chan = 0; break; + case 1: intlv_num_chan = 1; break; + case 3: intlv_num_chan = 2; break; + case 5: intlv_num_chan = 3; break; + case 7: intlv_num_chan = 4; break; + + case 8: intlv_num_chan = 1; + hash_enabled = true; + break; + default: + pr_err("%s: Invalid number of interleaved channels %d.\n", + __func__, intlv_num_chan); + goto out_err; + } + + num_intlv_bits = intlv_num_chan; + + if (intlv_num_dies > 2) { + pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", + __func__, intlv_num_dies); + goto out_err; + } + + num_intlv_bits += intlv_num_dies; + + /* Add a bit if sockets are interleaved. */ + num_intlv_bits += intlv_num_sockets; + + /* Assert num_intlv_bits <= 4 */ + if (num_intlv_bits > 4) { + pr_err("%s: Invalid interleave bits %d.\n", + __func__, num_intlv_bits); + goto out_err; + } + + if (num_intlv_bits > 0) { + u64 temp_addr_x, temp_addr_i, temp_addr_y; + u8 die_id_bit, sock_id_bit, cs_fabric_id; + + /* + * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. + * This is the fabric id for this coherent slave. Use + * umc/channel# as instance id of the coherent slave + * for FICAA. + */ + if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp)) + goto out_err; + + cs_fabric_id = (tmp >> 8) & 0xFF; + die_id_bit = 0; + + /* If interleaved over more than 1 channel: */ + if (intlv_num_chan) { + die_id_bit = intlv_num_chan; + cs_mask = (1 << die_id_bit) - 1; + cs_id = cs_fabric_id & cs_mask; + } + + sock_id_bit = die_id_bit; + + /* Read D18F1x208 (SystemFabricIdMask). */ + if (intlv_num_dies || intlv_num_sockets) + if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp)) + goto out_err; + + /* If interleaved over more than 1 die. */ + if (intlv_num_dies) { + sock_id_bit = die_id_bit + intlv_num_dies; + die_id_shift = (tmp >> 24) & 0xF; + die_id_mask = (tmp >> 8) & 0xFF; + + cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; + } + + /* If interleaved over more than 1 socket. */ + if (intlv_num_sockets) { + socket_id_shift = (tmp >> 28) & 0xF; + socket_id_mask = (tmp >> 16) & 0xFF; + + cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; + } + + /* + * The pre-interleaved address consists of XXXXXXIIIYYYYY + * where III is the ID for this CS, and XXXXXXYYYYY are the + * address bits from the post-interleaved address. + * "num_intlv_bits" has been calculated to tell us how many "I" + * bits there are. "intlv_addr_bit" tells us how many "Y" bits + * there are (where "I" starts). + */ + temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0); + temp_addr_i = (cs_id << intlv_addr_bit); + temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; + ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; + } + + /* Add dram base address */ + ret_addr += dram_base_addr; + + /* If legacy MMIO hole enabled */ + if (lgcy_mmio_hole_en) { + if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp)) + goto out_err; + + dram_hole_base = tmp & GENMASK(31, 24); + if (ret_addr >= dram_hole_base) + ret_addr += (BIT_ULL(32) - dram_hole_base); + } + + if (hash_enabled) { + /* Save some parentheses and grab ls-bit at the end. */ + hashed_bit = (ret_addr >> 12) ^ + (ret_addr >> 18) ^ + (ret_addr >> 21) ^ + (ret_addr >> 30) ^ + cs_id; + + hashed_bit &= BIT(0); + + if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0))) + ret_addr ^= BIT(intlv_addr_bit); + } + + /* Is calculated system address is above DRAM limit address? */ + if (ret_addr > dram_limit_addr) + goto out_err; + + *sys_addr = ret_addr; + return 0; + +out_err: + return -EINVAL; +} +EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); + +bool amd_mce_is_memory_error(struct mce *m) +{ + /* ErrCodeExt[20:16] */ + u8 xec = (m->status >> 16) & 0x1f; + + if (mce_flags.smca) + return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0; + + return m->bank == 4 && xec == 0x8; +} + +static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) +{ + struct mce m; + + mce_setup(&m); + + m.status = status; + m.misc = misc; + m.bank = bank; + m.tsc = rdtsc(); + + if (m.status & MCI_STATUS_ADDRV) { + m.addr = addr; + + /* + * Extract [55:] where lsb is the least significant + * *valid* bit of the address bits. + */ + if (mce_flags.smca) { + u8 lsb = (m.addr >> 56) & 0x3f; + + m.addr &= GENMASK_ULL(55, lsb); + } + } + + if (mce_flags.smca) { + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + + if (m.status & MCI_STATUS_SYNDV) + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + } + + mce_log(&m); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) +{ + trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); + trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); + ack_APIC_irq(); +} + +/* + * Returns true if the logged error is deferred. False, otherwise. + */ +static inline bool +_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) +{ + u64 status, addr = 0; + + rdmsrl(msr_stat, status); + if (!(status & MCI_STATUS_VAL)) + return false; + + if (status & MCI_STATUS_ADDRV) + rdmsrl(msr_addr, addr); + + __log_error(bank, status, addr, misc); + + wrmsrl(msr_stat, 0); + + return status & MCI_STATUS_DEFERRED; +} + +static bool _log_error_deferred(unsigned int bank, u32 misc) +{ + if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), + mca_msr_reg(bank, MCA_ADDR), misc)) + return false; + + /* + * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers. + * Return true here to avoid accessing these registers. + */ + if (!mce_flags.smca) + return true; + + /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */ + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + return true; +} + +/* + * We have three scenarios for checking for Deferred errors: + * + * 1) Non-SMCA systems check MCA_STATUS and log error if found. + * 2) SMCA systems check MCA_STATUS. If error is found then log it and also + * clear MCA_DESTAT. + * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and + * log it. + */ +static void log_error_deferred(unsigned int bank) +{ + if (_log_error_deferred(bank, 0)) + return; + + /* + * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check + * for a valid error. + */ + _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), + MSR_AMD64_SMCA_MCx_DEADDR(bank), 0); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) +{ + unsigned int bank; + + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) + log_error_deferred(bank); +} + +static void log_error_thresholding(unsigned int bank, u64 misc) +{ + _log_error_deferred(bank, misc); +} + +static void log_and_reset_block(struct threshold_block *block) +{ + struct thresh_restart tr; + u32 low = 0, high = 0; + + if (!block) + return; + + if (rdmsr_safe(block->address, &low, &high)) + return; + + if (!(high & MASK_OVERFLOW_HI)) + return; + + /* Log the MCE which caused the threshold event. */ + log_error_thresholding(block->bank, ((u64)high << 32) | low); + + /* Reset threshold block after logging error. */ + memset(&tr, 0, sizeof(tr)); + tr.b = block; + threshold_restart_bank(&tr); +} + +/* + * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt + * goes off when error_count reaches threshold_limit. + */ +static void amd_threshold_interrupt(void) +{ + struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; + struct threshold_bank **bp = this_cpu_read(threshold_banks); + unsigned int bank, cpu = smp_processor_id(); + + /* + * Validate that the threshold bank has been initialized already. The + * handler is installed at boot time, but on a hotplug event the + * interrupt might fire before the data has been initialized. + */ + if (!bp) + return; + + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { + if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) + continue; + + first_block = bp[bank]->blocks; + if (!first_block) + continue; + + /* + * The first block is also the head of the list. Check it first + * before iterating over the rest. + */ + log_and_reset_block(first_block); + list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + log_and_reset_block(block); + } +} + +/* + * Sysfs Interface + */ + +struct threshold_attr { + struct attribute attr; + ssize_t (*show) (struct threshold_block *, char *); + ssize_t (*store) (struct threshold_block *, const char *, size_t count); +}; + +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ +{ \ + return sprintf(buf, "%lu\n", (unsigned long) b->name); \ +} +SHOW_FIELDS(interrupt_enable) +SHOW_FIELDS(threshold_limit) + +static ssize_t +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) +{ + struct thresh_restart tr; + unsigned long new; + + if (!b->interrupt_capable) + return -EINVAL; + + if (kstrtoul(buf, 0, &new) < 0) + return -EINVAL; + + b->interrupt_enable = !!new; + + memset(&tr, 0, sizeof(tr)); + tr.b = b; + + if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + return -ENODEV; + + return size; +} + +static ssize_t +store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) +{ + struct thresh_restart tr; + unsigned long new; + + if (kstrtoul(buf, 0, &new) < 0) + return -EINVAL; + + if (new > THRESHOLD_MAX) + new = THRESHOLD_MAX; + if (new < 1) + new = 1; + + memset(&tr, 0, sizeof(tr)); + tr.old_limit = b->threshold_limit; + b->threshold_limit = new; + tr.b = b; + + if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + return -ENODEV; + + return size; +} + +static ssize_t show_error_count(struct threshold_block *b, char *buf) +{ + u32 lo, hi; + + /* CPU might be offline by now */ + if (rdmsr_on_cpu(b->cpu, b->address, &lo, &hi)) + return -ENODEV; + + return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - + (THRESHOLD_MAX - b->threshold_limit))); +} + +static struct threshold_attr error_count = { + .attr = {.name = __stringify(error_count), .mode = 0444 }, + .show = show_error_count, +}; + +#define RW_ATTR(val) \ +static struct threshold_attr val = { \ + .attr = {.name = __stringify(val), .mode = 0644 }, \ + .show = show_## val, \ + .store = store_## val, \ +}; + +RW_ATTR(interrupt_enable); +RW_ATTR(threshold_limit); + +static struct attribute *default_attrs[] = { + &threshold_limit.attr, + &error_count.attr, + NULL, /* possibly interrupt_enable if supported, see below */ + NULL, +}; + +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr) + +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct threshold_block *b = to_block(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + + ret = a->show ? a->show(b, buf) : -EIO; + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct threshold_block *b = to_block(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + + ret = a->store ? a->store(b, buf, count) : -EIO; + + return ret; +} + +static const struct sysfs_ops threshold_ops = { + .show = show, + .store = store, +}; + +static void threshold_block_release(struct kobject *kobj); + +static struct kobj_type threshold_ktype = { + .sysfs_ops = &threshold_ops, + .default_attrs = default_attrs, + .release = threshold_block_release, +}; + +static const char *get_name(unsigned int bank, struct threshold_block *b) +{ + enum smca_bank_types bank_type; + + if (!mce_flags.smca) { + if (b && bank == 4) + return bank4_names(b); + + return th_names[bank]; + } + + bank_type = smca_get_bank_type(bank); + if (bank_type >= N_SMCA_BANK_TYPES) + return NULL; + + if (b && bank_type == SMCA_UMC) { + if (b->block < ARRAY_SIZE(smca_umc_block_names)) + return smca_umc_block_names[b->block]; + return NULL; + } + + if (smca_banks[bank].hwid->count == 1) + return smca_get_name(bank_type); + + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, + "%s_%x", smca_get_name(bank_type), + smca_banks[bank].sysfs_id); + return buf_mcatype; +} + +static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb, + unsigned int bank, unsigned int block, + u32 address) +{ + struct threshold_block *b = NULL; + u32 low, high; + int err; + + if ((bank >= this_cpu_read(mce_num_banks)) || (block >= NR_BLOCKS)) + return 0; + + if (rdmsr_safe(address, &low, &high)) + return 0; + + if (!(high & MASK_VALID_HI)) { + if (block) + goto recurse; + else + return 0; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + goto recurse; + + b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); + if (!b) + return -ENOMEM; + + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->interrupt_capable = lvt_interrupt_supported(bank, high); + b->threshold_limit = THRESHOLD_MAX; + + if (b->interrupt_capable) { + threshold_ktype.default_attrs[2] = &interrupt_enable.attr; + b->interrupt_enable = 1; + } else { + threshold_ktype.default_attrs[2] = NULL; + } + + INIT_LIST_HEAD(&b->miscj); + + /* This is safe as @tb is not visible yet */ + if (tb->blocks) + list_add(&b->miscj, &tb->blocks->miscj); + else + tb->blocks = b; + + err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b)); + if (err) + goto out_free; +recurse: + address = get_block_address(address, low, high, bank, ++block, cpu); + if (!address) + return 0; + + err = allocate_threshold_blocks(cpu, tb, bank, block, address); + if (err) + goto out_free; + + if (b) + kobject_uevent(&b->kobj, KOBJ_ADD); + + return 0; + +out_free: + if (b) { + list_del(&b->miscj); + kobject_put(&b->kobj); + } + return err; +} + +static int __threshold_add_blocks(struct threshold_bank *b) +{ + struct list_head *head = &b->blocks->miscj; + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + int err = 0; + + err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); + if (err) + return err; + + list_for_each_entry_safe(pos, tmp, head, miscj) { + + err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); + if (err) { + list_for_each_entry_safe_reverse(pos, tmp, head, miscj) + kobject_del(&pos->kobj); + + return err; + } + } + return err; +} + +static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, + unsigned int bank) +{ + struct device *dev = this_cpu_read(mce_device); + struct amd_northbridge *nb = NULL; + struct threshold_bank *b = NULL; + const char *name = get_name(bank, NULL); + int err = 0; + + if (!dev) + return -ENODEV; + + if (is_shared_bank(bank)) { + nb = node_to_amd_nb(amd_get_nb_id(cpu)); + + /* threshold descriptor already initialized on this node? */ + if (nb && nb->bank4) { + /* yes, use it */ + b = nb->bank4; + err = kobject_add(b->kobj, &dev->kobj, name); + if (err) + goto out; + + bp[bank] = b; + refcount_inc(&b->cpus); + + err = __threshold_add_blocks(b); + + goto out; + } + } + + b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); + if (!b) { + err = -ENOMEM; + goto out; + } + + /* Associate the bank with the per-CPU MCE device */ + b->kobj = kobject_create_and_add(name, &dev->kobj); + if (!b->kobj) { + err = -EINVAL; + goto out_free; + } + + if (is_shared_bank(bank)) { + b->shared = 1; + refcount_set(&b->cpus, 1); + + /* nb is already initialized, see above */ + if (nb) { + WARN_ON(nb->bank4); + nb->bank4 = b; + } + } + + err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); + if (err) + goto out_kobj; + + bp[bank] = b; + return 0; + +out_kobj: + kobject_put(b->kobj); +out_free: + kfree(b); +out: + return err; +} + +static void threshold_block_release(struct kobject *kobj) +{ + kfree(to_block(kobj)); +} + +static void deallocate_threshold_blocks(struct threshold_bank *bank) +{ + struct threshold_block *pos, *tmp; + + list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) { + list_del(&pos->miscj); + kobject_put(&pos->kobj); + } + + kobject_put(&bank->blocks->kobj); +} + +static void __threshold_remove_blocks(struct threshold_bank *b) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + + kobject_del(b->kobj); + + list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) + kobject_del(&pos->kobj); +} + +static void threshold_remove_bank(struct threshold_bank *bank) +{ + struct amd_northbridge *nb; + + if (!bank->blocks) + goto out_free; + + if (!bank->shared) + goto out_dealloc; + + if (!refcount_dec_and_test(&bank->cpus)) { + __threshold_remove_blocks(bank); + return; + } else { + /* + * The last CPU on this node using the shared bank is going + * away, remove that bank now. + */ + nb = node_to_amd_nb(amd_get_nb_id(smp_processor_id())); + nb->bank4 = NULL; + } + +out_dealloc: + deallocate_threshold_blocks(bank); + +out_free: + kobject_put(bank->kobj); + kfree(bank); +} + +static void __threshold_remove_device(struct threshold_bank **bp) +{ + unsigned int bank, numbanks = this_cpu_read(mce_num_banks); + + for (bank = 0; bank < numbanks; bank++) { + if (!bp[bank]) + continue; + + threshold_remove_bank(bp[bank]); + bp[bank] = NULL; + } + kfree(bp); +} + +int mce_threshold_remove_device(unsigned int cpu) +{ + struct threshold_bank **bp = this_cpu_read(threshold_banks); + + if (!bp) + return 0; + + /* + * Clear the pointer before cleaning up, so that the interrupt won't + * touch anything of this. + */ + this_cpu_write(threshold_banks, NULL); + + __threshold_remove_device(bp); + return 0; +} + +/** + * mce_threshold_create_device - Create the per-CPU MCE threshold device + * @cpu: The plugged in CPU + * + * Create directories and files for all valid threshold banks. + * + * This is invoked from the CPU hotplug callback which was installed in + * mcheck_init_device(). The invocation happens in context of the hotplug + * thread running on @cpu. The callback is invoked on all CPUs which are + * online when the callback is installed or during a real hotplug event. + */ +int mce_threshold_create_device(unsigned int cpu) +{ + unsigned int numbanks, bank; + struct threshold_bank **bp; + int err; + + if (!mce_flags.amd_threshold) + return 0; + + bp = this_cpu_read(threshold_banks); + if (bp) + return 0; + + numbanks = this_cpu_read(mce_num_banks); + bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL); + if (!bp) + return -ENOMEM; + + for (bank = 0; bank < numbanks; ++bank) { + if (!(this_cpu_read(bank_map) & BIT_ULL(bank))) + continue; + err = threshold_create_bank(bp, cpu, bank); + if (err) { + __threshold_remove_device(bp); + return err; + } + } + this_cpu_write(threshold_banks, bp); + + if (thresholding_irq_en) + mce_threshold_vector = amd_threshold_interrupt; + return 0; +} diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c new file mode 100644 index 000000000..af8d37962 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Bridge between MCE and APEI + * + * On some machine, corrected memory errors are reported via APEI + * generic hardware error source (GHES) instead of corrected Machine + * Check. These corrected memory errors can be reported to user space + * through /dev/mcelog via faking a corrected Machine Check, so that + * the error memory page can be offlined by /sbin/mcelog if the error + * count for one page is beyond the threshold. + * + * For fatal MCE, save MCE record into persistent storage via ERST, so + * that the MCE record can be logged after reboot via ERST. + * + * Copyright 2010 Intel Corp. + * Author: Huang Ying + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) +{ + struct mce m; + + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) + return; + + mce_setup(&m); + m.bank = -1; + /* Fake a memory read error with unknown channel */ + m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + + if (severity >= GHES_SEV_RECOVERABLE) + m.status |= MCI_STATUS_UC; + + if (severity >= GHES_SEV_PANIC) { + m.status |= MCI_STATUS_PCC; + m.tsc = rdtsc(); + } + + m.addr = mem_err->physical_addr; + mce_log(&m); +} +EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); + +#define CPER_CREATOR_MCE \ + GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ + 0x64, 0x90, 0xb8, 0x9d) +#define CPER_SECTION_TYPE_MCE \ + GUID_INIT(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ + 0x04, 0x4a, 0x38, 0xfc) + +/* + * CPER specification (in UEFI specification 2.3 appendix N) requires + * byte-packed. + */ +struct cper_mce_record { + struct cper_record_header hdr; + struct cper_section_descriptor sec_hdr; + struct mce mce; +} __packed; + +int apei_write_mce(struct mce *m) +{ + struct cper_mce_record rcd; + + memset(&rcd, 0, sizeof(rcd)); + memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE); + rcd.hdr.revision = CPER_RECORD_REV; + rcd.hdr.signature_end = CPER_SIG_END; + rcd.hdr.section_count = 1; + rcd.hdr.error_severity = CPER_SEV_FATAL; + /* timestamp, platform_id, partition_id are all invalid */ + rcd.hdr.validation_bits = 0; + rcd.hdr.record_length = sizeof(rcd); + rcd.hdr.creator_id = CPER_CREATOR_MCE; + rcd.hdr.notification_type = CPER_NOTIFY_MCE; + rcd.hdr.record_id = cper_next_record_id(); + rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR; + + rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd; + rcd.sec_hdr.section_length = sizeof(rcd.mce); + rcd.sec_hdr.revision = CPER_SEC_REV; + /* fru_id and fru_text is invalid */ + rcd.sec_hdr.validation_bits = 0; + rcd.sec_hdr.flags = CPER_SEC_PRIMARY; + rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE; + rcd.sec_hdr.section_severity = CPER_SEV_FATAL; + + memcpy(&rcd.mce, m, sizeof(*m)); + + return erst_write(&rcd.hdr); +} + +ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + struct cper_mce_record rcd; + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + /* someone else has cleared the record, try next one */ + if (rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + /* try to skip other type records in storage */ + else if (rc != sizeof(rcd) || + !guid_equal(&rcd.hdr.creator_id, &CPER_CREATOR_MCE)) + goto retry; + memcpy(m, &rcd.mce, sizeof(*m)); + rc = sizeof(*m); +out: + erst_get_record_id_end(); + + return rc; +} + +/* Check whether there is record in ERST */ +int apei_check_mce(void) +{ + return erst_get_record_count(); +} + +int apei_clear_mce(u64 record_id) +{ + return erst_clear(record_id); +} diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c new file mode 100644 index 000000000..0b7c81389 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/core.c @@ -0,0 +1,2794 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Machine check handler. + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* sysfs synchronization */ +static DEFINE_MUTEX(mce_sysfs_mutex); + +#define CREATE_TRACE_POINTS +#include + +#define SPINUNIT 100 /* 100ns */ + +DEFINE_PER_CPU(unsigned, mce_exception_count); + +DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); + +struct mce_bank { + u64 ctl; /* subevents to enable */ + bool init; /* initialise bank? */ +}; +static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); + +#define ATTR_LEN 16 +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank_dev { + struct device_attribute attr; /* device attribute */ + char attrname[ATTR_LEN]; /* attribute name */ + u8 bank; /* bank number */ +}; +static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS]; + +struct mce_vendor_flags mce_flags __read_mostly; + +struct mca_config mca_cfg __read_mostly = { + .bootlog = -1, + /* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ + .tolerant = 1, + .monarch_timeout = -1 +}; + +static DEFINE_PER_CPU(struct mce, mces_seen); +static unsigned long mce_need_notify; +static int cpu_missing; + +/* + * MCA banks polled by the period polling timer for corrected events. + * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). + */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + +/* + * MCA banks controlled through firmware first for corrected errors. + * This is a global list of banks for which we won't enable CMCI and we + * won't poll. Firmware controls these banks and is responsible for + * reporting corrected errors through GHES. Uncorrected/recoverable + * errors are still notified through a machine check. + */ +mce_banks_t mce_banks_ce_disabled; + +static struct work_struct mce_work; +static struct irq_work mce_irq_work; + +static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); + +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); + +/* Do initial initialization of a struct mce */ +noinstr void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = m->extcpu = smp_processor_id(); + /* need the internal __ version to avoid deadlocks */ + m->time = __ktime_get_real_seconds(); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->cpuid = cpuid_eax(1); + m->socketid = cpu_data(m->extcpu).phys_proc_id; + m->apicid = cpu_data(m->extcpu).initial_apicid; + m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); + + if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) + m->ppin = __rdmsr(MSR_PPIN); + else if (this_cpu_has(X86_FEATURE_AMD_PPIN)) + m->ppin = __rdmsr(MSR_AMD_PPIN); + + m->microcode = boot_cpu_data.microcode; +} + +DEFINE_PER_CPU(struct mce, injectm); +EXPORT_PER_CPU_SYMBOL_GPL(injectm); + +void mce_log(struct mce *m) +{ + if (!mce_gen_pool_add(m)) + irq_work_queue(&mce_irq_work); +} +EXPORT_SYMBOL_GPL(mce_log); + +void mce_register_decode_chain(struct notifier_block *nb) +{ + if (WARN_ON(nb->priority < MCE_PRIO_LOWEST || + nb->priority > MCE_PRIO_HIGHEST)) + return; + + blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_register_decode_chain); + +void mce_unregister_decode_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); + +u32 mca_msr_reg(int bank, enum mca_msr reg) +{ + if (mce_flags.smca) { + switch (reg) { + case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank); + case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank); + case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank); + case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank); + } + } + + switch (reg) { + case MCA_CTL: return MSR_IA32_MCx_CTL(bank); + case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank); + case MCA_MISC: return MSR_IA32_MCx_MISC(bank); + case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank); + } + + return 0; +} + +static void __print_mce(struct mce *m) +{ + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", + m->extcpu, + (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), + m->mcgstatus, m->bank, m->status); + + if (m->ip) { + pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->ip); + + if (m->cs == __KERNEL_CS) + pr_cont("{%pS}", (void *)(unsigned long)m->ip); + pr_cont("\n"); + } + + pr_emerg(HW_ERR "TSC %llx ", m->tsc); + if (m->addr) + pr_cont("ADDR %llx ", m->addr); + if (m->misc) + pr_cont("MISC %llx ", m->misc); + if (m->ppin) + pr_cont("PPIN %llx ", m->ppin); + + if (mce_flags.smca) { + if (m->synd) + pr_cont("SYND %llx ", m->synd); + if (m->ipid) + pr_cont("IPID %llx ", m->ipid); + } + + pr_cont("\n"); + + /* + * Note this output is parsed by external tools and old fields + * should not be changed. + */ + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, + m->microcode); +} + +static void print_mce(struct mce *m) +{ + __print_mce(m); + + if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) + pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); +} + +#define PANIC_TIMEOUT 5 /* 5 seconds */ + +static atomic_t mce_panicked; + +static int fake_panic; +static atomic_t mce_fake_panicked; + +/* Panic in progress. Enable interrupts and wait for final IPI */ +static void wait_for_panic(void) +{ + long timeout = PANIC_TIMEOUT*USEC_PER_SEC; + + preempt_disable(); + local_irq_enable(); + while (timeout-- > 0) + udelay(1); + if (panic_timeout == 0) + panic_timeout = mca_cfg.panic_timeout; + panic("Panicing machine check CPU died"); +} + +static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) +{ + struct llist_node *pending; + struct mce_evt_llist *l; + int apei_err = 0; + + /* + * Allow instrumentation around external facilities usage. Not that it + * matters a whole lot since the machine is going to panic anyway. + */ + instrumentation_begin(); + + if (!fake_panic) { + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_inc_return(&mce_panicked) > 1) + wait_for_panic(); + barrier(); + + bust_spinlocks(1); + console_verbose(); + } else { + /* Don't log too much for fake panic */ + if (atomic_inc_return(&mce_fake_panicked) > 1) + goto out; + } + pending = mce_gen_pool_prepare_records(); + /* First print corrected ones that are still unlogged */ + llist_for_each_entry(l, pending, llnode) { + struct mce *m = &l->mce; + if (!(m->status & MCI_STATUS_UC)) { + print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } + } + /* Now print uncorrected but with the final one last */ + llist_for_each_entry(l, pending, llnode) { + struct mce *m = &l->mce; + if (!(m->status & MCI_STATUS_UC)) + continue; + if (!final || mce_cmp(m, final)) { + print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } + } + if (final) { + print_mce(final); + if (!apei_err) + apei_err = apei_write_mce(final); + } + if (cpu_missing) + pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); + if (exp) + pr_emerg(HW_ERR "Machine check: %s\n", exp); + if (!fake_panic) { + if (panic_timeout == 0) + panic_timeout = mca_cfg.panic_timeout; + panic(msg); + } else + pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); + +out: + instrumentation_end(); +} + +/* Support code for software error injection */ + +static int msr_to_offset(u32 msr) +{ + unsigned bank = __this_cpu_read(injectm.bank); + + if (msr == mca_cfg.rip_msr) + return offsetof(struct mce, ip); + if (msr == mca_msr_reg(bank, MCA_STATUS)) + return offsetof(struct mce, status); + if (msr == mca_msr_reg(bank, MCA_ADDR)) + return offsetof(struct mce, addr); + if (msr == mca_msr_reg(bank, MCA_MISC)) + return offsetof(struct mce, misc); + if (msr == MSR_IA32_MCG_STATUS) + return offsetof(struct mce, mcgstatus); + return -1; +} + +__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + + show_stack_regs(regs); + + panic("MCA architectural violation!\n"); + + while (true) + cpu_relax(); + + return true; +} + +/* MSR access wrappers used for error injection */ +static noinstr u64 mce_rdmsrl(u32 msr) +{ + DECLARE_ARGS(val, low, high); + + if (__this_cpu_read(injectm.finished)) { + int offset; + u64 ret; + + instrumentation_begin(); + + offset = msr_to_offset(msr); + if (offset < 0) + ret = 0; + else + ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); + + instrumentation_end(); + + return ret; + } + + /* + * RDMSR on MCA MSRs should not fault. If they do, this is very much an + * architectural violation and needs to be reported to hw vendor. Panic + * the box to not allow any further progress. + */ + asm volatile("1: rdmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault) + : EAX_EDX_RET(val, low, high) : "c" (msr)); + + + return EAX_EDX_VAL(val, low, high); +} + +__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + + show_stack_regs(regs); + + panic("MCA architectural violation!\n"); + + while (true) + cpu_relax(); + + return true; +} + +static noinstr void mce_wrmsrl(u32 msr, u64 v) +{ + u32 low, high; + + if (__this_cpu_read(injectm.finished)) { + int offset; + + instrumentation_begin(); + + offset = msr_to_offset(msr); + if (offset >= 0) + *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; + + instrumentation_end(); + + return; + } + + low = (u32)v; + high = (u32)(v >> 32); + + /* See comment in mce_rdmsrl() */ + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault) + : : "c" (msr), "a"(low), "d" (high) : "memory"); +} + +/* + * Collect all global (w.r.t. this processor) status about this machine + * check into our "mce" struct so that we can use it later to assess + * the severity of the problem as we read per-bank specific details. + */ +static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) +{ + mce_setup(m); + + m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (regs) { + /* + * Get the address of the instruction at the time of + * the machine check error. + */ + if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { + m->ip = regs->ip; + m->cs = regs->cs; + + /* + * When in VM86 mode make the cs look like ring 3 + * always. This is a lie, but it's better than passing + * the additional vm86 bit around everywhere. + */ + if (v8086_mode(regs)) + m->cs |= 3; + } + /* Use accurate RIP reporting if available. */ + if (mca_cfg.rip_msr) + m->ip = mce_rdmsrl(mca_cfg.rip_msr); + } +} + +int mce_available(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return 0; + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +static void mce_schedule_work(void) +{ + if (!mce_gen_pool_empty()) + schedule_work(&mce_work); +} + +static void mce_irq_work_cb(struct irq_work *entry) +{ + mce_schedule_work(); +} + +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses up to page granuality for now. + */ +int mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_ADDRV)) + return 0; + + /* Checks after this one are Intel/Zhaoxin-specific: */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) + return 1; + + if (!(m->status & MCI_STATUS_MISCV)) + return 0; + + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) + return 0; + + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(mce_usable_address); + +bool mce_is_memory_error(struct mce *m) +{ + switch (m->cpuvendor) { + case X86_VENDOR_AMD: + case X86_VENDOR_HYGON: + return amd_mce_is_memory_error(m); + + case X86_VENDOR_INTEL: + case X86_VENDOR_ZHAOXIN: + /* + * Intel SDM Volume 3B - 15.9.2 Compound Error Codes + * + * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for + * indicating a memory error. Bit 8 is used for indicating a + * cache hierarchy error. The combination of bit 2 and bit 3 + * is used for indicating a `generic' cache hierarchy error + * But we can't just blindly check the above bits, because if + * bit 11 is set, then it is a bus/interconnect error - and + * either way the above bits just gives more detail on what + * bus/interconnect error happened. Note that bit 12 can be + * ignored, as it's the "filter" bit. + */ + return (m->status & 0xef80) == BIT(7) || + (m->status & 0xef00) == BIT(8) || + (m->status & 0xeffc) == 0xc; + + default: + return false; + } +} +EXPORT_SYMBOL_GPL(mce_is_memory_error); + +static bool whole_page(struct mce *m) +{ + if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV)) + return true; + + return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT; +} + +bool mce_is_correctable(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) + return false; + + if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED) + return false; + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(mce_is_correctable); + +static int mce_early_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + /* Emit the trace record: */ + trace_mce_record(m); + + set_bit(0, &mce_need_notify); + + mce_notify_irq(); + + return NOTIFY_DONE; +} + +static struct notifier_block early_nb = { + .notifier_call = mce_early_notifier, + .priority = MCE_PRIO_EARLY, +}; + +static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned long pfn; + + if (!mce || !mce_usable_address(mce)) + return NOTIFY_DONE; + + if (mce->severity != MCE_AO_SEVERITY && + mce->severity != MCE_DEFERRED_SEVERITY) + return NOTIFY_DONE; + + pfn = mce->addr >> PAGE_SHIFT; + if (!memory_failure(pfn, 0)) { + set_mce_nospec(pfn, whole_page(mce)); + mce->kflags |= MCE_HANDLED_UC; + } + + return NOTIFY_OK; +} + +static struct notifier_block mce_uc_nb = { + .notifier_call = uc_decode_notifier, + .priority = MCE_PRIO_UC, +}; + +static int mce_default_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + if (mca_cfg.print_all || !m->kflags) + __print_mce(m); + + return NOTIFY_DONE; +} + +static struct notifier_block mce_default_nb = { + .notifier_call = mce_default_notifier, + /* lowest prio, we want it to run last. */ + .priority = MCE_PRIO_LOWEST, +}; + +/* + * Read ADDR and MISC registers. + */ +static noinstr void mce_read_aux(struct mce *m, int i) +{ + if (m->status & MCI_STATUS_MISCV) + m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); + + if (m->status & MCI_STATUS_ADDRV) { + m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR)); + + /* + * Mask the reported address by the reported granularity. + */ + if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) { + u8 shift = MCI_MISC_ADDR_LSB(m->misc); + m->addr >>= shift; + m->addr <<= shift; + } + + /* + * Extract [55:] where lsb is the least significant + * *valid* bit of the address bits. + */ + if (mce_flags.smca) { + u8 lsb = (m->addr >> 56) & 0x3f; + + m->addr &= GENMASK_ULL(55, lsb); + } + } + + if (mce_flags.smca) { + m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + + if (m->status & MCI_STATUS_SYNDV) + m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); + } +} + +DEFINE_PER_CPU(unsigned, mce_poll_count); + +/* + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + * + * Note: spec recommends to panic for fatal unsignalled + * errors here. However this would be quite problematic -- + * we would need to reimplement the Monarch handling and + * it would mess up the exclusion between exception handler + * and poll handler -- * so we skip this for now. + * These cases should not happen anyways, or only when the CPU + * is already totally * confused. In this case it's likely it will + * not fully execute the machine check handler either. + */ +bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + bool error_seen = false; + struct mce m; + int i; + + this_cpu_inc(mce_poll_count); + + mce_gather_info(&m, NULL); + + if (flags & MCP_TIMESTAMP) + m.tsc = rdtsc(); + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + if (!mce_banks[i].ctl || !test_bit(i, *b)) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + + barrier(); + m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + + /* If this entry is not valid, ignore it */ + if (!(m.status & MCI_STATUS_VAL)) + continue; + + /* + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. + */ + if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) + goto log_it; + + /* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ + if (!mca_cfg.ser) { + if (m.status & MCI_STATUS_UC) + continue; + goto log_it; + } + + /* Log "not enabled" (speculative) errors */ + if (!(m.status & MCI_STATUS_EN)) + goto log_it; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) + goto log_it; + + /* + * Skip anything else. Presumption is that our read of this + * bank is racing with a machine check. Leave the log alone + * for do_machine_check() to deal with it. + */ + continue; + +log_it: + error_seen = true; + + if (flags & MCP_DONTLOG) + goto clear_it; + + mce_read_aux(&m, i); + m.severity = mce_severity(&m, NULL, mca_cfg.tolerant, NULL, false); + /* + * Don't get the IP here because it's unlikely to + * have anything to do with the actual error location. + */ + + if (mca_cfg.dont_log_ce && !mce_usable_address(&m)) + goto clear_it; + + if (flags & MCP_QUEUE_LOG) + mce_gen_pool_add(&m); + else + mce_log(&m); + +clear_it: + /* + * Clear state for this bank. + */ + mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + } + + /* + * Don't clear MCG_STATUS here because it's only defined for + * exceptions. + */ + + sync_core(); + + return error_seen; +} +EXPORT_SYMBOL_GPL(machine_check_poll); + +/* + * Do a quick check if any of the events requires a panic. + * This decides if we keep the events around or clear them. + */ +static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, + struct pt_regs *regs) +{ + char *tmp = *msg; + int i; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + if (!(m->status & MCI_STATUS_VAL)) + continue; + + __set_bit(i, validp); + if (quirk_no_way_out) + quirk_no_way_out(i, m, regs); + + m->bank = i; + if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + mce_read_aux(m, i); + *msg = tmp; + return 1; + } + } + return 0; +} + +/* + * Variable to establish order between CPUs while scanning. + * Each CPU spins initially until executing is equal its number. + */ +static atomic_t mce_executing; + +/* + * Defines order of CPUs on entry. First CPU becomes Monarch. + */ +static atomic_t mce_callin; + +/* + * Check if a timeout waiting for other CPUs happened. + */ +static int mce_timed_out(u64 *t, const char *msg) +{ + /* + * The others already did panic for some reason. + * Bail out like in a timeout. + * rmb() to tell the compiler that system_state + * might have been modified by someone else. + */ + rmb(); + if (atomic_read(&mce_panicked)) + wait_for_panic(); + if (!mca_cfg.monarch_timeout) + goto out; + if ((s64)*t < SPINUNIT) { + if (mca_cfg.tolerant <= 1) + mce_panic(msg, NULL, NULL); + cpu_missing = 1; + return 1; + } + *t -= SPINUNIT; +out: + touch_nmi_watchdog(); + return 0; +} + +/* + * The Monarch's reign. The Monarch is the CPU who entered + * the machine check handler first. It waits for the others to + * raise the exception too and then grades them. When any + * error is fatal panic. Only then let the others continue. + * + * The other CPUs entering the MCE handler will be controlled by the + * Monarch. They are called Subjects. + * + * This way we prevent any potential data corruption in a unrecoverable case + * and also makes sure always all CPU's errors are examined. + * + * Also this detects the case of a machine check event coming from outer + * space (not detected by any CPUs) In this case some external agent wants + * us to shut down, so panic too. + * + * The other CPUs might still decide to panic if the handler happens + * in a unrecoverable place, but in this case the system is in a semi-stable + * state and won't corrupt anything by itself. It's ok to let the others + * continue for a bit first. + * + * All the spin loops have timeouts; when a timeout happens a CPU + * typically elects itself to be Monarch. + */ +static void mce_reign(void) +{ + int cpu; + struct mce *m = NULL; + int global_worst = 0; + char *msg = NULL; + + /* + * This CPU is the Monarch and the other CPUs have run + * through their handlers. + * Grade the severity of the errors of all the CPUs. + */ + for_each_possible_cpu(cpu) { + struct mce *mtmp = &per_cpu(mces_seen, cpu); + + if (mtmp->severity > global_worst) { + global_worst = mtmp->severity; + m = &per_cpu(mces_seen, cpu); + } + } + + /* + * Cannot recover? Panic here then. + * This dumps all the mces in the log buffer and stops the + * other CPUs. + */ + if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { + /* call mce_severity() to get "msg" for panic */ + mce_severity(m, NULL, mca_cfg.tolerant, &msg, true); + mce_panic("Fatal machine check", m, msg); + } + + /* + * For UC somewhere we let the CPU who detects it handle it. + * Also must let continue the others, otherwise the handling + * CPU could deadlock on a lock. + */ + + /* + * No machine check event found. Must be some external + * source or one CPU is hung. Panic. + */ + if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) + mce_panic("Fatal machine check from unknown source", NULL, NULL); + + /* + * Now clear all the mces_seen so that they don't reappear on + * the next mce. + */ + for_each_possible_cpu(cpu) + memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); +} + +static atomic_t global_nwo; + +/* + * Start of Monarch synchronization. This waits until all CPUs have + * entered the exception handler and then determines if any of them + * saw a fatal event that requires panic. Then it executes them + * in the entry order. + * TBD double check parallel CPU hotunplug + */ +static int mce_start(int *no_way_out) +{ + int order; + int cpus = num_online_cpus(); + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + + if (!timeout) + return -1; + + atomic_add(*no_way_out, &global_nwo); + /* + * Rely on the implied barrier below, such that global_nwo + * is updated before mce_callin. + */ + order = atomic_inc_return(&mce_callin); + + /* + * Wait for everyone. + */ + while (atomic_read(&mce_callin) != cpus) { + if (mce_timed_out(&timeout, + "Timeout: Not all CPUs entered broadcast exception handler")) { + atomic_set(&global_nwo, 0); + return -1; + } + ndelay(SPINUNIT); + } + + /* + * mce_callin should be read before global_nwo + */ + smp_rmb(); + + if (order == 1) { + /* + * Monarch: Starts executing now, the others wait. + */ + atomic_set(&mce_executing, 1); + } else { + /* + * Subject: Now start the scanning loop one by one in + * the original callin order. + * This way when there are any shared banks it will be + * only seen by one CPU before cleared, avoiding duplicates. + */ + while (atomic_read(&mce_executing) < order) { + if (mce_timed_out(&timeout, + "Timeout: Subject CPUs unable to finish machine check processing")) { + atomic_set(&global_nwo, 0); + return -1; + } + ndelay(SPINUNIT); + } + } + + /* + * Cache the global no_way_out state. + */ + *no_way_out = atomic_read(&global_nwo); + + return order; +} + +/* + * Synchronize between CPUs after main scanning loop. + * This invokes the bulk of the Monarch processing. + */ +static noinstr int mce_end(int order) +{ + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + int ret = -1; + + /* Allow instrumentation around external facilities. */ + instrumentation_begin(); + + if (!timeout) + goto reset; + if (order < 0) + goto reset; + + /* + * Allow others to run. + */ + atomic_inc(&mce_executing); + + if (order == 1) { + /* CHECKME: Can this race with a parallel hotplug? */ + int cpus = num_online_cpus(); + + /* + * Monarch: Wait for everyone to go through their scanning + * loops. + */ + while (atomic_read(&mce_executing) <= cpus) { + if (mce_timed_out(&timeout, + "Timeout: Monarch CPU unable to finish machine check processing")) + goto reset; + ndelay(SPINUNIT); + } + + mce_reign(); + barrier(); + ret = 0; + } else { + /* + * Subject: Wait for Monarch to finish. + */ + while (atomic_read(&mce_executing) != 0) { + if (mce_timed_out(&timeout, + "Timeout: Monarch CPU did not finish machine check processing")) + goto reset; + ndelay(SPINUNIT); + } + + /* + * Don't reset anything. That's done by the Monarch. + */ + ret = 0; + goto out; + } + + /* + * Reset all global state. + */ +reset: + atomic_set(&global_nwo, 0); + atomic_set(&mce_callin, 0); + barrier(); + + /* + * Let others run again. + */ + atomic_set(&mce_executing, 0); + +out: + instrumentation_end(); + + return ret; +} + +static void mce_clear_state(unsigned long *toclear) +{ + int i; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + if (test_bit(i, toclear)) + mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + } +} + +/* + * Cases where we avoid rendezvous handler timeout: + * 1) If this CPU is offline. + * + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to + * skip those CPUs which remain looping in the 1st kernel - see + * crash_nmi_callback(). + * + * Note: there still is a small window between kexec-ing and the new, + * kdump kernel establishing a new #MC handler where a broadcasted MCE + * might not get handled properly. + */ +static noinstr bool mce_check_crashing_cpu(void) +{ + unsigned int cpu = smp_processor_id(); + + if (arch_cpu_is_offline(cpu) || + (crashing_cpu != -1 && crashing_cpu != cpu)) { + u64 mcgstatus; + + mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS); + + if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { + if (mcgstatus & MCG_STATUS_LMCES) + return false; + } + + if (mcgstatus & MCG_STATUS_RIPV) { + __wrmsr(MSR_IA32_MCG_STATUS, 0, 0); + return true; + } + } + return false; +} + +static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, + unsigned long *toclear, unsigned long *valid_banks, + int no_way_out, int *worst) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + struct mca_config *cfg = &mca_cfg; + int severity, i; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + __clear_bit(i, toclear); + if (!test_bit(i, valid_banks)) + continue; + + if (!mce_banks[i].ctl) + continue; + + m->misc = 0; + m->addr = 0; + m->bank = i; + + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + if (!(m->status & MCI_STATUS_VAL)) + continue; + + /* + * Corrected or non-signaled errors are handled by + * machine_check_poll(). Leave them alone, unless this panics. + */ + if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + !no_way_out) + continue; + + /* Set taint even when machine check was not enabled. */ + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + + severity = mce_severity(m, regs, cfg->tolerant, NULL, true); + + /* + * When machine check was for corrected/deferred handler don't + * touch, unless we're panicking. + */ + if ((severity == MCE_KEEP_SEVERITY || + severity == MCE_UCNA_SEVERITY) && !no_way_out) + continue; + + __set_bit(i, toclear); + + /* Machine check event was not enabled. Clear, but ignore. */ + if (severity == MCE_NO_SEVERITY) + continue; + + mce_read_aux(m, i); + + /* assuming valid severity level != 0 */ + m->severity = severity; + + mce_log(m); + + if (severity > *worst) { + *final = *m; + *worst = severity; + } + } + + /* mce_clear_state will clear *final, save locally for use later */ + *m = *final; +} + +static void kill_me_now(struct callback_head *ch) +{ + struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me); + + p->mce_count = 0; + force_sig(SIGBUS); +} + +static void kill_me_maybe(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + int flags = MF_ACTION_REQUIRED; + + p->mce_count = 0; + pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); + + if (!p->mce_ripv) + flags |= MF_MUST_KILL; + + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && + !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + sync_core(); + return; + } + + if (p->mce_vaddr != (void __user *)-1l) { + force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); + } else { + pr_err("Memory error not recovered"); + kill_me_now(cb); + } +} + +static void queue_task_work(struct mce *m, char *msg, int kill_current_task) +{ + int count = ++current->mce_count; + + /* First call, save all the details */ + if (count == 1) { + current->mce_addr = m->addr; + current->mce_kflags = m->kflags; + current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); + current->mce_whole_page = whole_page(m); + + if (kill_current_task) + current->mce_kill_me.func = kill_me_now; + else + current->mce_kill_me.func = kill_me_maybe; + } + + /* Ten is likely overkill. Don't expect more than two faults before task_work() */ + if (count > 10) + mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + + /* Second or later call, make sure page address matches the one from first call */ + if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) + mce_panic("Consecutive machine checks to different user pages", m, msg); + + /* Do not call task_work_add() more than once */ + if (count > 1) + return; + + task_work_add(current, ¤t->mce_kill_me, TWA_RESUME); +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! + * + * On Intel systems this is entered on all CPUs in parallel through + * MCE broadcast. However some CPUs might be broken beyond repair, + * so be always careful when synchronizing with others. + * + * Tracing and kprobes are disabled: if we interrupted a kernel context + * with IF=1, we need to minimize stack usage. There are also recursion + * issues: if the machine check was due to a failure of the memory + * backing the user stack, tracing that reads the user stack will cause + * potentially infinite recursion. + */ +noinstr void do_machine_check(struct pt_regs *regs) +{ + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); + DECLARE_BITMAP(toclear, MAX_NR_BANKS); + struct mca_config *cfg = &mca_cfg; + struct mce m, *final; + char *msg = NULL; + int worst = 0; + + /* + * Establish sequential order between the CPUs entering the machine + * check handler. + */ + int order = -1; + + /* + * If no_way_out gets set, there is no safe way to recover from this + * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. + */ + int no_way_out = 0; + + /* + * If kill_it gets set, there might be a way to recover from this + * error. + */ + int kill_it = 0; + + /* + * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES + * on Intel. + */ + int lmce = 1; + + this_cpu_inc(mce_exception_count); + + mce_gather_info(&m, regs); + m.tsc = rdtsc(); + + final = this_cpu_ptr(&mces_seen); + *final = m; + + memset(valid_banks, 0, sizeof(valid_banks)); + no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); + + barrier(); + + /* + * When no restart IP might need to kill or panic. + * Assume the worst for now, but if we find the + * severity is MCE_AR_SEVERITY we have other options. + */ + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + kill_it = 1; + + /* + * Check if this MCE is signaled to only this logical processor, + * on Intel, Zhaoxin only. + */ + if (m.cpuvendor == X86_VENDOR_INTEL || + m.cpuvendor == X86_VENDOR_ZHAOXIN) + lmce = m.mcgstatus & MCG_STATUS_LMCES; + + /* + * Local machine check may already know that we have to panic. + * Broadcast machine check begins rendezvous in mce_start() + * Go through all banks in exclusion of the other CPUs. This way we + * don't report duplicated events on shared banks because the first one + * to see it will clear it. + */ + if (lmce) { + if (no_way_out) + mce_panic("Fatal local machine check", &m, msg); + } else { + order = mce_start(&no_way_out); + } + + __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); + + if (!no_way_out) + mce_clear_state(toclear); + + /* + * Do most of the synchronization with other CPUs. + * When there's any problem use only local no_way_out state. + */ + if (!lmce) { + if (mce_end(order) < 0) { + if (!no_way_out) + no_way_out = worst >= MCE_PANIC_SEVERITY; + } + } else { + /* + * If there was a fatal machine check we should have + * already called mce_panic earlier in this function. + * Since we re-read the banks, we might have found + * something new. Check again to see if we found a + * fatal error. We call "mce_severity()" again to + * make sure we have the right "msg". + */ + if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { + mce_severity(&m, regs, cfg->tolerant, &msg, true); + mce_panic("Local fatal machine check!", &m, msg); + } + } + + /* + * If tolerant is at an insane level we drop requests to kill + * processes and continue even when there is no way out. + */ + if (cfg->tolerant == 3) + kill_it = 0; + else if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); + + if (worst > 0) + irq_work_queue(&mce_irq_work); + + if (worst != MCE_AR_SEVERITY && !kill_it) + goto out; + + /* + * Enable instrumentation around the external facilities like + * task_work_add() (via queue_task_work()), fixup_exception() etc. + * For now, that is. Fixing this properly would need a lot more involved + * reorganization. + */ + instrumentation_begin(); + + /* Fault was in user mode and we need to take some action */ + if ((m.cs & 3) == 3) { + /* If this triggers there is no way to recover. Die hard. */ + BUG_ON(!on_thread_stack() || !user_mode(regs)); + + queue_task_work(&m, msg, kill_it); + + } else { + /* + * Handle an MCE which has happened in kernel space but from + * which the kernel can recover: ex_has_fault_handler() has + * already verified that the rIP at which the error happened is + * a rIP from which the kernel can recover (by jumping to + * recovery code specified in _ASM_EXTABLE_FAULT()) and the + * corresponding exception handler which would do that is the + * proper one. + */ + if (m.kflags & MCE_IN_KERNEL_RECOV) { + if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) + mce_panic("Failed kernel mode recovery", &m, msg); + } + + if (m.kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&m, msg, kill_it); + } + + instrumentation_end(); + +out: + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); +} +EXPORT_SYMBOL_GPL(do_machine_check); + +#ifndef CONFIG_MEMORY_FAILURE +int memory_failure(unsigned long pfn, int flags) +{ + /* mce_severity() should not hand us an ACTION_REQUIRED error */ + BUG_ON(flags & MF_ACTION_REQUIRED); + pr_err("Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", + pfn); + + return 0; +} +#endif + +/* + * Periodic polling timer for "silent" machine check errors. If the + * poller finds an MCE, poll 2x faster. When the poller finds no more + * errors, poll 2x slower (up to check_interval seconds). + */ +static unsigned long check_interval = INITIAL_CHECK_INTERVAL; + +static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ +static DEFINE_PER_CPU(struct timer_list, mce_timer); + +static unsigned long mce_adjust_timer_default(unsigned long interval) +{ + return interval; +} + +static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; + +static void __start_timer(struct timer_list *t, unsigned long interval) +{ + unsigned long when = jiffies + interval; + unsigned long flags; + + local_irq_save(flags); + + if (!timer_pending(t) || time_before(when, t->expires)) + mod_timer(t, round_jiffies(when)); + + local_irq_restore(flags); +} + +static void mce_timer_fn(struct timer_list *t) +{ + struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); + unsigned long iv; + + WARN_ON(cpu_t != t); + + iv = __this_cpu_read(mce_next_interval); + + if (mce_available(this_cpu_ptr(&cpu_info))) { + machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); + + if (mce_intel_cmci_poll()) { + iv = mce_adjust_timer(iv); + goto done; + } + } + + /* + * Alert userspace if needed. If we logged an MCE, reduce the polling + * interval, otherwise increase the polling interval. + */ + if (mce_notify_irq()) + iv = max(iv / 2, (unsigned long) HZ/100); + else + iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); + +done: + __this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); +} + +/* + * Ensure that the timer is firing in @interval from now. + */ +void mce_timer_kick(unsigned long interval) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + unsigned long iv = __this_cpu_read(mce_next_interval); + + __start_timer(t, interval); + + if (interval < iv) + __this_cpu_write(mce_next_interval, interval); +} + +/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +static void mce_timer_delete_all(void) +{ + int cpu; + + for_each_online_cpu(cpu) + del_timer_sync(&per_cpu(mce_timer, cpu)); +} + +/* + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. + */ +int mce_notify_irq(void) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + if (test_and_clear_bit(0, &mce_need_notify)) { + mce_work_trigger(); + + if (__ratelimit(&ratelimit)) + pr_info(HW_ERR "Machine check events logged\n"); + + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(mce_notify_irq); + +static void __mcheck_cpu_mce_banks_init(void) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u8 n_banks = this_cpu_read(mce_num_banks); + int i; + + for (i = 0; i < n_banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + /* + * Init them all, __mcheck_cpu_apply_quirks() is going to apply + * the required vendor quirks before + * __mcheck_cpu_init_clear_banks() does the final bank setup. + */ + b->ctl = -1ULL; + b->init = 1; + } +} + +/* + * Initialize Machine Checks for a CPU. + */ +static void __mcheck_cpu_cap_init(void) +{ + u64 cap; + u8 b; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + + b = cap & MCG_BANKCNT_MASK; + + if (b > MAX_NR_BANKS) { + pr_warn("CPU%d: Using only %u machine check banks out of %u\n", + smp_processor_id(), MAX_NR_BANKS, b); + b = MAX_NR_BANKS; + } + + this_cpu_write(mce_num_banks, b); + + __mcheck_cpu_mce_banks_init(); + + /* Use accurate RIP reporting if available. */ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mca_cfg.ser = 1; +} + +static void __mcheck_cpu_init_generic(void) +{ + enum mcp_flags m_fl = 0; + mce_banks_t all_banks; + u64 cap; + + if (!mca_cfg.bootlog) + m_fl = MCP_DONTLOG; + + /* + * Log the machine checks left over from the previous reset. Log them + * only, do not start processing them. That will happen in mcheck_late_init() + * when all consumers have been registered on the notifier chain. + */ + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks); + + cr4_set_bits(X86_CR4_MCE); + + rdmsrl(MSR_IA32_MCG_CAP, cap); + if (cap & MCG_CTL_P) + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); +} + +static void __mcheck_cpu_init_clear_banks(void) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + int i; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + struct mce_bank *b = &mce_banks[i]; + + if (!b->init) + continue; + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + } +} + +/* + * Do a final check to see if there are any unused/RAZ banks. + * + * This must be done after the banks have been initialized and any quirks have + * been applied. + * + * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. + * Otherwise, a user who disables a bank will not be able to re-enable it + * without a system reboot. + */ +static void __mcheck_cpu_check_banks(void) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u64 msrval; + int i; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + struct mce_bank *b = &mce_banks[i]; + + if (!b->init) + continue; + + rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); + b->init = !!msrval; + } +} + +/* + * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and + * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM + * Vol 3B Table 15-20). But this confuses both the code that determines + * whether the machine check occurred in kernel or user mode, and also + * the severity assessment code. Pretend that EIPV was set, and take the + * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. + */ +static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 0) + return; + if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) + return; + if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| + MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| + MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| + MCACOD)) != + (MCI_STATUS_UC|MCI_STATUS_EN| + MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| + MCI_STATUS_AR|MCACOD_INSTR)) + return; + + m->mcgstatus |= MCG_STATUS_EIPV; + m->ip = regs->ip; + m->cs = regs->cs; +} + +/* Add per CPU specific workarounds here */ +static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + struct mca_config *cfg = &mca_cfg; + + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + pr_info("unknown CPU type - not enabling MCE support\n"); + return -EOPNOTSUPP; + } + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86_vendor == X86_VENDOR_AMD) { + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + if (c->x86 < 0x11 && cfg->bootlog < 0) { + /* + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: + */ + cfg->bootlog = 0; + } + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) + mce_banks[0].ctl = 0; + + /* + * overflow_recov is supported for F15h Models 00h-0fh + * even though we don't have a CPUID bit for it. + */ + if (c->x86 == 0x15 && c->x86_model <= 0xf) + mce_flags.overflow_recov = 1; + + } + + if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + */ + + if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) + mce_banks[0].init = 0; + + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && + cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) + cfg->bootlog = 0; + + if (c->x86 == 6 && c->x86_model == 45) + quirk_no_way_out = quirk_sandybridge_ifu; + } + + if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { + /* + * All newer Zhaoxin CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + } + } + + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = 0; + if (cfg->bootlog != 0) + cfg->panic_timeout = 30; + + return 0; +} + +static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +{ + if (c->x86 != 5) + return 0; + + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + intel_p5_mcheck_init(c); + return 1; + break; + case X86_VENDOR_CENTAUR: + winchip_mcheck_init(c); + return 1; + break; + default: + return 0; + } + + return 0; +} + +/* + * Init basic CPU features needed for early decoding of MCEs. + */ +static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) +{ + if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { + mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); + mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); + mce_flags.amd_threshold = 1; + } +} + +static void mce_centaur_feature_init(struct cpuinfo_x86 *c) +{ + struct mca_config *cfg = &mca_cfg; + + /* + * All newer Centaur CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) || + c->x86 > 6) { + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + } +} + +static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* + * These CPUs have MCA bank 8 which reports only one error type called + * SVAD (System View Address Decoder). The reporting of that error is + * controlled by IA32_MC8.CTL.0. + * + * If enabled, prefetching on these CPUs will cause SVAD MCE when + * virtual machines start and result in a system panic. Always disable + * bank 8 SVAD error by default. + */ + if ((c->x86 == 7 && c->x86_model == 0x1b) || + (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (this_cpu_read(mce_num_banks) > 8) + mce_banks[8].ctl = 0; + } + + intel_init_cmci(); + intel_init_lmce(); + mce_adjust_timer = cmci_intel_adjust_timer; +} + +static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) +{ + intel_clear_lmce(); +} + +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + mce_adjust_timer = cmci_intel_adjust_timer; + break; + + case X86_VENDOR_AMD: { + mce_amd_feature_init(c); + break; + } + + case X86_VENDOR_HYGON: + mce_hygon_feature_init(c); + break; + + case X86_VENDOR_CENTAUR: + mce_centaur_feature_init(c); + break; + + case X86_VENDOR_ZHAOXIN: + mce_zhaoxin_feature_init(c); + break; + + default: + break; + } +} + +static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_clear(c); + break; + + case X86_VENDOR_ZHAOXIN: + mce_zhaoxin_feature_clear(c); + break; + + default: + break; + } +} + +static void mce_start_timer(struct timer_list *t) +{ + unsigned long iv = check_interval * HZ; + + if (mca_cfg.ignore_ce || !iv) + return; + + this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); +} + +static void __mcheck_cpu_setup_timer(void) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + + timer_setup(t, mce_timer_fn, TIMER_PINNED); +} + +static void __mcheck_cpu_init_timer(void) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + + timer_setup(t, mce_timer_fn, TIMER_PINNED); + mce_start_timer(t); +} + +bool filter_mce(struct mce *m) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return amd_filter_mce(m); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return intel_filter_mce(m); + + return false; +} + +/* Handle unconfigured int18 (should never happen) */ +static noinstr void unexpected_machine_check(struct pt_regs *regs) +{ + instrumentation_begin(); + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", + smp_processor_id()); + instrumentation_end(); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; + +static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) +{ + irqentry_state_t irq_state; + + WARN_ON_ONCE(user_mode(regs)); + + /* + * Only required when from kernel mode. See + * mce_check_crashing_cpu() for details. + */ + if (machine_check_vector == do_machine_check && + mce_check_crashing_cpu()) + return; + + irq_state = irqentry_nmi_enter(regs); + /* + * The call targets are marked noinstr, but objtool can't figure + * that out because it's an indirect call. Annotate it. + */ + instrumentation_begin(); + trace_hardirqs_off_finish(); + machine_check_vector(regs); + if (regs->flags & X86_EFLAGS_IF) + trace_hardirqs_on_prepare(); + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); +} + +static __always_inline void exc_machine_check_user(struct pt_regs *regs) +{ + irqentry_enter_from_user_mode(regs); + instrumentation_begin(); + machine_check_vector(regs); + instrumentation_end(); + irqentry_exit_to_user_mode(regs); +} + +#ifdef CONFIG_X86_64 +/* MCE hit kernel mode */ +DEFINE_IDTENTRY_MCE(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} + +/* The user mode variant. */ +DEFINE_IDTENTRY_MCE_USER(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + exc_machine_check_user(regs); + local_db_restore(dr7); +} +#else +/* 32bit unified entry point */ +DEFINE_IDTENTRY_RAW(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + if (user_mode(regs)) + exc_machine_check_user(regs); + else + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} +#endif + +/* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off: + */ +void mcheck_cpu_init(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return; + + if (__mcheck_cpu_ancient_init(c)) + return; + + if (!mce_available(c)) + return; + + __mcheck_cpu_cap_init(); + + if (__mcheck_cpu_apply_quirks(c) < 0) { + mca_cfg.disabled = 1; + return; + } + + if (mce_gen_pool_init()) { + mca_cfg.disabled = 1; + pr_emerg("Couldn't allocate MCE records pool!\n"); + return; + } + + machine_check_vector = do_machine_check; + + __mcheck_cpu_init_early(c); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_check_banks(); + __mcheck_cpu_setup_timer(); +} + +/* + * Called for each booted CPU to clear some machine checks opt-ins + */ +void mcheck_cpu_clear(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return; + + if (!mce_available(c)) + return; + + /* + * Possibly to clear general settings generic to x86 + * __mcheck_cpu_clear_generic(c); + */ + __mcheck_cpu_clear_vendor(c); + +} + +static void __mce_disable_bank(void *arg) +{ + int bank = *((int *)arg); + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + cmci_disable_bank(bank); +} + +void mce_disable_bank(int bank) +{ + if (bank >= this_cpu_read(mce_num_banks)) { + pr_warn(FW_BUG + "Ignoring request to disable invalid MCA bank %d.\n", + bank); + return; + } + set_bit(bank, mce_banks_ce_disabled); + on_each_cpu(__mce_disable_bank, &bank, 1); +} + +/* + * mce=off Disables machine check + * mce=no_cmci Disables CMCI + * mce=no_lmce Disables LMCE + * mce=dont_log_ce Clears corrected events silently, no log created for CEs. + * mce=print_all Print all machine check logs to console + * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. + * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) + * monarchtimeout is how long to wait for other CPUs on machine + * check, or 0 to not wait + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h + and older. + * mce=nobootlog Don't log MCEs from before booting. + * mce=bios_cmci_threshold Don't program the CMCI threshold + * mce=recovery force enable copy_mc_fragile() + */ +static int __init mcheck_enable(char *str) +{ + struct mca_config *cfg = &mca_cfg; + + if (*str == 0) { + enable_p5_mce(); + return 1; + } + if (*str == '=') + str++; + if (!strcmp(str, "off")) + cfg->disabled = 1; + else if (!strcmp(str, "no_cmci")) + cfg->cmci_disabled = true; + else if (!strcmp(str, "no_lmce")) + cfg->lmce_disabled = 1; + else if (!strcmp(str, "dont_log_ce")) + cfg->dont_log_ce = true; + else if (!strcmp(str, "print_all")) + cfg->print_all = true; + else if (!strcmp(str, "ignore_ce")) + cfg->ignore_ce = true; + else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) + cfg->bootlog = (str[0] == 'b'); + else if (!strcmp(str, "bios_cmci_threshold")) + cfg->bios_cmci_threshold = 1; + else if (!strcmp(str, "recovery")) + cfg->recovery = 1; + else if (isdigit(str[0])) { + if (get_option(&str, &cfg->tolerant) == 2) + get_option(&str, &(cfg->monarch_timeout)); + } else { + pr_info("mce argument %s ignored. Please use /sys\n", str); + return 0; + } + return 1; +} +__setup("mce", mcheck_enable); + +int __init mcheck_init(void) +{ + mcheck_intel_therm_init(); + mce_register_decode_chain(&early_nb); + mce_register_decode_chain(&mce_uc_nb); + mce_register_decode_chain(&mce_default_nb); + mcheck_vendor_init_severity(); + + INIT_WORK(&mce_work, mce_gen_pool_process); + init_irq_work(&mce_irq_work, mce_irq_work_cb); + + return 0; +} + +/* + * mce_syscore: PM support + */ + +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static void mce_disable_error_reporting(void) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + int i; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(mca_msr_reg(i, MCA_CTL), 0); + } + return; +} + +static void vendor_disable_error_reporting(void) +{ + /* + * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these + * MSRs are socket-wide. Disabling them for just a single offlined CPU + * is bad, since it will inhibit reporting for all shared resources on + * the socket like the last level cache (LLC), the integrated memory + * controller (iMC), etc. + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON || + boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) + return; + + mce_disable_error_reporting(); +} + +static int mce_syscore_suspend(void) +{ + vendor_disable_error_reporting(); + return 0; +} + +static void mce_syscore_shutdown(void) +{ + vendor_disable_error_reporting(); +} + +/* + * On resume clear all MCE state. Don't want to see leftovers from the BIOS. + * Only one CPU is active at this time, the others get re-added later using + * CPU hotplug: + */ +static void mce_syscore_resume(void) +{ + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); + __mcheck_cpu_init_clear_banks(); +} + +static struct syscore_ops mce_syscore_ops = { + .suspend = mce_syscore_suspend, + .shutdown = mce_syscore_shutdown, + .resume = mce_syscore_resume, +}; + +/* + * mce_device: Sysfs support + */ + +static void mce_cpu_restart(void *data) +{ + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_timer(); +} + +/* Reinit MCEs after user configuration changes */ +static void mce_restart(void) +{ + mce_timer_delete_all(); + on_each_cpu(mce_cpu_restart, NULL, 1); + mce_schedule_work(); +} + +/* Toggle features for corrected errors */ +static void mce_disable_cmci(void *data) +{ + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + cmci_clear(); +} + +static void mce_enable_ce(void *all) +{ + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + cmci_reenable(); + cmci_recheck(); + if (all) + __mcheck_cpu_init_timer(); +} + +static struct bus_type mce_subsys = { + .name = "machinecheck", + .dev_name = "machinecheck", +}; + +DEFINE_PER_CPU(struct device *, mce_device); + +static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr) +{ + return container_of(attr, struct mce_bank_dev, attr); +} + +static ssize_t show_bank(struct device *s, struct device_attribute *attr, + char *buf) +{ + u8 bank = attr_to_bank(attr)->bank; + struct mce_bank *b; + + if (bank >= per_cpu(mce_num_banks, s->id)) + return -EINVAL; + + b = &per_cpu(mce_banks_array, s->id)[bank]; + + if (!b->init) + return -ENODEV; + + return sprintf(buf, "%llx\n", b->ctl); +} + +static ssize_t set_bank(struct device *s, struct device_attribute *attr, + const char *buf, size_t size) +{ + u8 bank = attr_to_bank(attr)->bank; + struct mce_bank *b; + u64 new; + + if (kstrtou64(buf, 0, &new) < 0) + return -EINVAL; + + if (bank >= per_cpu(mce_num_banks, s->id)) + return -EINVAL; + + b = &per_cpu(mce_banks_array, s->id)[bank]; + + if (!b->init) + return -ENODEV; + + b->ctl = new; + mce_restart(); + + return size; +} + +static ssize_t set_ignore_ce(struct device *s, + struct device_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (kstrtou64(buf, 0, &new) < 0) + return -EINVAL; + + mutex_lock(&mce_sysfs_mutex); + if (mca_cfg.ignore_ce ^ !!new) { + if (new) { + /* disable ce features */ + mce_timer_delete_all(); + on_each_cpu(mce_disable_cmci, NULL, 1); + mca_cfg.ignore_ce = true; + } else { + /* enable ce features */ + mca_cfg.ignore_ce = false; + on_each_cpu(mce_enable_ce, (void *)1, 1); + } + } + mutex_unlock(&mce_sysfs_mutex); + + return size; +} + +static ssize_t set_cmci_disabled(struct device *s, + struct device_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (kstrtou64(buf, 0, &new) < 0) + return -EINVAL; + + mutex_lock(&mce_sysfs_mutex); + if (mca_cfg.cmci_disabled ^ !!new) { + if (new) { + /* disable cmci */ + on_each_cpu(mce_disable_cmci, NULL, 1); + mca_cfg.cmci_disabled = true; + } else { + /* enable cmci */ + mca_cfg.cmci_disabled = false; + on_each_cpu(mce_enable_ce, NULL, 1); + } + } + mutex_unlock(&mce_sysfs_mutex); + + return size; +} + +static ssize_t store_int_with_restart(struct device *s, + struct device_attribute *attr, + const char *buf, size_t size) +{ + unsigned long old_check_interval = check_interval; + ssize_t ret = device_store_ulong(s, attr, buf, size); + + if (check_interval == old_check_interval) + return ret; + + mutex_lock(&mce_sysfs_mutex); + mce_restart(); + mutex_unlock(&mce_sysfs_mutex); + + return ret; +} + +static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); +static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); +static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all); + +static struct dev_ext_attribute dev_attr_check_interval = { + __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), + &check_interval +}; + +static struct dev_ext_attribute dev_attr_ignore_ce = { + __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce), + &mca_cfg.ignore_ce +}; + +static struct dev_ext_attribute dev_attr_cmci_disabled = { + __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled), + &mca_cfg.cmci_disabled +}; + +static struct device_attribute *mce_device_attrs[] = { + &dev_attr_tolerant.attr, + &dev_attr_check_interval.attr, +#ifdef CONFIG_X86_MCELOG_LEGACY + &dev_attr_trigger, +#endif + &dev_attr_monarch_timeout.attr, + &dev_attr_dont_log_ce.attr, + &dev_attr_print_all.attr, + &dev_attr_ignore_ce.attr, + &dev_attr_cmci_disabled.attr, + NULL +}; + +static cpumask_var_t mce_device_initialized; + +static void mce_device_release(struct device *dev) +{ + kfree(dev); +} + +/* Per CPU device init. All of the CPUs still share the same bank device: */ +static int mce_device_create(unsigned int cpu) +{ + struct device *dev; + int err; + int i, j; + + if (!mce_available(&boot_cpu_data)) + return -EIO; + + dev = per_cpu(mce_device, cpu); + if (dev) + return 0; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + dev->id = cpu; + dev->bus = &mce_subsys; + dev->release = &mce_device_release; + + err = device_register(dev); + if (err) { + put_device(dev); + return err; + } + + for (i = 0; mce_device_attrs[i]; i++) { + err = device_create_file(dev, mce_device_attrs[i]); + if (err) + goto error; + } + for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) { + err = device_create_file(dev, &mce_bank_devs[j].attr); + if (err) + goto error2; + } + cpumask_set_cpu(cpu, mce_device_initialized); + per_cpu(mce_device, cpu) = dev; + + return 0; +error2: + while (--j >= 0) + device_remove_file(dev, &mce_bank_devs[j].attr); +error: + while (--i >= 0) + device_remove_file(dev, mce_device_attrs[i]); + + device_unregister(dev); + + return err; +} + +static void mce_device_remove(unsigned int cpu) +{ + struct device *dev = per_cpu(mce_device, cpu); + int i; + + if (!cpumask_test_cpu(cpu, mce_device_initialized)) + return; + + for (i = 0; mce_device_attrs[i]; i++) + device_remove_file(dev, mce_device_attrs[i]); + + for (i = 0; i < per_cpu(mce_num_banks, cpu); i++) + device_remove_file(dev, &mce_bank_devs[i].attr); + + device_unregister(dev); + cpumask_clear_cpu(cpu, mce_device_initialized); + per_cpu(mce_device, cpu) = NULL; +} + +/* Make sure there are no machine checks on offlined CPUs. */ +static void mce_disable_cpu(void) +{ + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + + if (!cpuhp_tasks_frozen) + cmci_clear(); + + vendor_disable_error_reporting(); +} + +static void mce_reenable_cpu(void) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + int i; + + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + + if (!cpuhp_tasks_frozen) + cmci_reenable(); + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + } +} + +static int mce_cpu_dead(unsigned int cpu) +{ + mce_intel_hcpu_update(cpu); + + /* intentionally ignoring frozen here */ + if (!cpuhp_tasks_frozen) + cmci_rediscover(); + return 0; +} + +static int mce_cpu_online(unsigned int cpu) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + int ret; + + mce_device_create(cpu); + + ret = mce_threshold_create_device(cpu); + if (ret) { + mce_device_remove(cpu); + return ret; + } + mce_reenable_cpu(); + mce_start_timer(t); + return 0; +} + +static int mce_cpu_pre_down(unsigned int cpu) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + + mce_disable_cpu(); + del_timer_sync(t); + mce_threshold_remove_device(cpu); + mce_device_remove(cpu); + return 0; +} + +static __init void mce_init_banks(void) +{ + int i; + + for (i = 0; i < MAX_NR_BANKS; i++) { + struct mce_bank_dev *b = &mce_bank_devs[i]; + struct device_attribute *a = &b->attr; + + b->bank = i; + + sysfs_attr_init(&a->attr); + a->attr.name = b->attrname; + snprintf(b->attrname, ATTR_LEN, "bank%d", i); + + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; + } +} + +/* + * When running on XEN, this initcall is ordered against the XEN mcelog + * initcall: + * + * device_initcall(xen_late_init_mcelog); + * device_initcall_sync(mcheck_init_device); + */ +static __init int mcheck_init_device(void) +{ + int err; + + /* + * Check if we have a spare virtual bit. This will only become + * a problem if/when we move beyond 5-level page tables. + */ + MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63); + + if (!mce_available(&boot_cpu_data)) { + err = -EIO; + goto err_out; + } + + if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { + err = -ENOMEM; + goto err_out; + } + + mce_init_banks(); + + err = subsys_system_register(&mce_subsys, NULL); + if (err) + goto err_out_mem; + + err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL, + mce_cpu_dead); + if (err) + goto err_out_mem; + + /* + * Invokes mce_cpu_online() on all CPUs which are online when + * the state is installed. + */ + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online", + mce_cpu_online, mce_cpu_pre_down); + if (err < 0) + goto err_out_online; + + register_syscore_ops(&mce_syscore_ops); + + return 0; + +err_out_online: + cpuhp_remove_state(CPUHP_X86_MCE_DEAD); + +err_out_mem: + free_cpumask_var(mce_device_initialized); + +err_out: + pr_err("Unable to init MCE device (rc: %d)\n", err); + + return err; +} +device_initcall_sync(mcheck_init_device); + +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ + mca_cfg.disabled = 1; + return 1; +} +__setup("nomce", mcheck_disable); + +#ifdef CONFIG_DEBUG_FS +struct dentry *mce_get_debugfs_dir(void) +{ + static struct dentry *dmce; + + if (!dmce) + dmce = debugfs_create_dir("mce", NULL); + + return dmce; +} + +static void mce_reset(void) +{ + cpu_missing = 0; + atomic_set(&mce_fake_panicked, 0); + atomic_set(&mce_executing, 0); + atomic_set(&mce_callin, 0); + atomic_set(&global_nwo, 0); +} + +static int fake_panic_get(void *data, u64 *val) +{ + *val = fake_panic; + return 0; +} + +static int fake_panic_set(void *data, u64 val) +{ + mce_reset(); + fake_panic = val; + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, + "%llu\n"); + +static void __init mcheck_debugfs_init(void) +{ + struct dentry *dmce; + + dmce = mce_get_debugfs_dir(); + debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL, + &fake_panic_fops); +} +#else +static void __init mcheck_debugfs_init(void) { } +#endif + +static int __init mcheck_late_init(void) +{ + if (mca_cfg.recovery) + enable_copy_mc_fragile(); + + mcheck_debugfs_init(); + + /* + * Flush out everything that has been logged during early boot, now that + * everything has been initialized (workqueues, decoders, ...). + */ + mce_schedule_work(); + + return 0; +} +late_initcall(mcheck_late_init); diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c new file mode 100644 index 000000000..100fbeebd --- /dev/null +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * /dev/mcelog driver + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ + +#include +#include +#include +#include + +#include "internal.h" + +static BLOCKING_NOTIFIER_HEAD(mce_injector_chain); + +static DEFINE_MUTEX(mce_chrdev_read_mutex); + +static char mce_helper[128]; +static char *mce_helper_argv[2] = { mce_helper, NULL }; + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log_buffer *mcelog; + +static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); + +static int dev_mce_log(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned int entry; + + if (mce->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; + + mutex_lock(&mce_chrdev_read_mutex); + + entry = mcelog->next; + + /* + * When the buffer fills up discard new entries. Assume that the + * earlier errors are the more interesting ones: + */ + if (entry >= mcelog->len) { + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog->flags); + goto unlock; + } + + mcelog->next = entry + 1; + + memcpy(mcelog->entry + entry, mce, sizeof(struct mce)); + mcelog->entry[entry].finished = 1; + mcelog->entry[entry].kflags = 0; + + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); + +unlock: + mutex_unlock(&mce_chrdev_read_mutex); + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + mce->kflags |= MCE_HANDLED_MCELOG; + + return NOTIFY_OK; +} + +static struct notifier_block dev_mcelog_nb = { + .notifier_call = dev_mce_log, + .priority = MCE_PRIO_MCELOG, +}; + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + + +void mce_work_trigger(void) +{ + if (mce_helper[0]) + schedule_work(&mce_trigger_work); +} + +static ssize_t +show_trigger(struct device *s, struct device_attribute *attr, char *buf) +{ + strcpy(buf, mce_helper); + strcat(buf, "\n"); + return strlen(mce_helper) + 1; +} + +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, + const char *buf, size_t siz) +{ + char *p; + + strncpy(mce_helper, buf, sizeof(mce_helper)); + mce_helper[sizeof(mce_helper)-1] = 0; + p = strchr(mce_helper, '\n'); + + if (p) + *p = 0; + + return strlen(mce_helper) + !!p; +} + +DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); + +/* + * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_chrdev_state_lock); +static int mce_chrdev_open_count; /* #times opened */ +static int mce_chrdev_open_exclu; /* already open exclusive? */ + +static int mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + if (mce_chrdev_open_exclu || + (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + mce_chrdev_open_exclu = 1; + mce_chrdev_open_count++; + + spin_unlock(&mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + mce_chrdev_open_count--; + mce_chrdev_open_exclu = 0; + + spin_unlock(&mce_chrdev_state_lock); + + return 0; +} + +static int mce_apei_read_done; + +/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ +static int __mce_read_apei(char __user **ubuf, size_t usize) +{ + int rc; + u64 record_id; + struct mce m; + + if (usize < sizeof(struct mce)) + return -EINVAL; + + rc = apei_read_mce(&m, &record_id); + /* Error or no more MCE record */ + if (rc <= 0) { + mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; + return rc; + } + rc = -EFAULT; + if (copy_to_user(*ubuf, &m, sizeof(struct mce))) + return rc; + /* + * In fact, we should have cleared the record after that has + * been flushed to the disk or sent to network in + * /sbin/mcelog, but we have no interface to support that now, + * so just clear it to avoid duplication. + */ + rc = apei_clear_mce(record_id); + if (rc) { + mce_apei_read_done = 1; + return rc; + } + *ubuf += sizeof(struct mce); + + return 0; +} + +static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned next; + int i, err; + + mutex_lock(&mce_chrdev_read_mutex); + + if (!mce_apei_read_done) { + err = __mce_read_apei(&buf, usize); + if (err || buf != ubuf) + goto out; + } + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < mcelog->len * sizeof(struct mce)) + goto out; + + next = mcelog->next; + err = 0; + + for (i = 0; i < next; i++) { + struct mce *m = &mcelog->entry[i]; + + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); + } + + memset(mcelog->entry, 0, next * sizeof(struct mce)); + mcelog->next = 0; + + if (err) + err = -EFAULT; + +out: + mutex_unlock(&mce_chrdev_read_mutex); + + return err ? err : buf - ubuf; +} + +static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_chrdev_wait, wait); + if (READ_ONCE(mcelog->next)) + return EPOLLIN | EPOLLRDNORM; + if (!mce_apei_read_done && apei_check_mce()) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(mcelog->len, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = mcelog->flags; + } while (cmpxchg(&mcelog->flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +void mce_register_injector_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_register(&mce_injector_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_register_injector_chain); + +void mce_unregister_injector_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&mce_injector_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_injector_chain); + +static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + struct mce m; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * There are some cases where real MSR reads could slip + * through. + */ + if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) + return -EIO; + + if ((unsigned long)usize > sizeof(struct mce)) + usize = sizeof(struct mce); + if (copy_from_user(&m, ubuf, usize)) + return -EFAULT; + + if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) + return -EINVAL; + + /* + * Need to give user space some time to set everything up, + * so do it a jiffie or two later everywhere. + */ + schedule_timeout(2); + + blocking_notifier_call_chain(&mce_injector_chain, 0, &m); + + return usize; +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_chrdev_open, + .release = mce_chrdev_release, + .read = mce_chrdev_read, + .write = mce_chrdev_write, + .poll = mce_chrdev_poll, + .unlocked_ioctl = mce_chrdev_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +static __init int dev_mcelog_init_device(void) +{ + int mce_log_len; + int err; + + mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus()); + mcelog = kzalloc(struct_size(mcelog, entry, mce_log_len), GFP_KERNEL); + if (!mcelog) + return -ENOMEM; + + memcpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature)); + mcelog->len = mce_log_len; + mcelog->recordlen = sizeof(struct mce); + + /* register character device /dev/mcelog */ + err = misc_register(&mce_chrdev_device); + if (err) { + if (err == -EBUSY) + /* Xen dom0 might have registered the device already. */ + pr_info("Unable to init device /dev/mcelog, already registered"); + else + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + + kfree(mcelog); + return err; + } + + mce_register_decode_chain(&dev_mcelog_nb); + return 0; +} +device_initcall_sync(dev_mcelog_init_device); diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c new file mode 100644 index 000000000..fbe8b61c3 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MCE event pool management in MCE context + * + * Copyright (C) 2015 Intel Corp. + * Author: Chen, Gong + */ +#include +#include +#include +#include +#include "internal.h" + +/* + * printk() is not safe in MCE context. This is a lock-less memory allocator + * used to save error information organized in a lock-less list. + * + * This memory pool is only to be used to save MCE records in MCE context. + * MCE events are rare, so a fixed size memory pool should be enough. Use + * 2 pages to save MCE events for now (~80 MCE records at most). + */ +#define MCE_POOLSZ (2 * PAGE_SIZE) + +static struct gen_pool *mce_evt_pool; +static LLIST_HEAD(mce_event_llist); +static char gen_pool_buf[MCE_POOLSZ]; + +/* + * Compare the record "t" with each of the records on list "l" to see if + * an equivalent one is present in the list. + */ +static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) +{ + struct mce_evt_llist *node; + struct mce *m1, *m2; + + m1 = &t->mce; + + llist_for_each_entry(node, &l->llnode, llnode) { + m2 = &node->mce; + + if (!mce_cmp(m1, m2)) + return true; + } + return false; +} + +/* + * The system has panicked - we'd like to peruse the list of MCE records + * that have been queued, but not seen by anyone yet. The list is in + * reverse time order, so we need to reverse it. While doing that we can + * also drop duplicate records (these were logged because some banks are + * shared between cores or by all threads on a socket). + */ +struct llist_node *mce_gen_pool_prepare_records(void) +{ + struct llist_node *head; + LLIST_HEAD(new_head); + struct mce_evt_llist *node, *t; + + head = llist_del_all(&mce_event_llist); + if (!head) + return NULL; + + /* squeeze out duplicates while reversing order */ + llist_for_each_entry_safe(node, t, head, llnode) { + if (!is_duplicate_mce_record(node, t)) + llist_add(&node->llnode, &new_head); + } + + return new_head.first; +} + +void mce_gen_pool_process(struct work_struct *__unused) +{ + struct llist_node *head; + struct mce_evt_llist *node, *tmp; + struct mce *mce; + + head = llist_del_all(&mce_event_llist); + if (!head) + return; + + head = llist_reverse_order(head); + llist_for_each_entry_safe(node, tmp, head, llnode) { + mce = &node->mce; + blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); + } +} + +bool mce_gen_pool_empty(void) +{ + return llist_empty(&mce_event_llist); +} + +int mce_gen_pool_add(struct mce *mce) +{ + struct mce_evt_llist *node; + + if (filter_mce(mce)) + return -EINVAL; + + if (!mce_evt_pool) + return -EINVAL; + + node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); + if (!node) { + pr_warn_ratelimited("MCE records pool full!\n"); + return -ENOMEM; + } + + memcpy(&node->mce, mce, sizeof(*mce)); + llist_add(&node->llnode, &mce_event_llist); + + return 0; +} + +static int mce_gen_pool_create(void) +{ + struct gen_pool *tmpp; + int ret = -ENOMEM; + + tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); + if (!tmpp) + goto out; + + ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); + if (ret) { + gen_pool_destroy(tmpp); + goto out; + } + + mce_evt_pool = tmpp; + +out: + return ret; +} + +int mce_gen_pool_init(void) +{ + /* Just init mce_gen_pool once. */ + if (mce_evt_pool) + return 0; + + return mce_gen_pool_create(); +} diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c new file mode 100644 index 000000000..e7808309d --- /dev/null +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -0,0 +1,705 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Machine check injection support. + * Copyright 2008 Intel Corporation. + * + * Authors: + * Andi Kleen + * Ying Huang + * + * The AMD part (from mce_amd_inj.c): a simple MCE injection facility + * for testing different aspects of the RAS code. This driver should be + * built as module so that it can be loaded on production kernels for + * testing purposes. + * + * Copyright (c) 2010-17: Borislav Petkov + * Advanced Micro Devices Inc. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * Collect all the MCi_XXX settings + */ +static struct mce i_mce; +static struct dentry *dfs_inj; + +#define MAX_FLAG_OPT_SIZE 4 +#define NBCFG 0x44 + +enum injection_type { + SW_INJ = 0, /* SW injection, simply decode the error */ + HW_INJ, /* Trigger a #MC */ + DFR_INT_INJ, /* Trigger Deferred error interrupt */ + THR_INT_INJ, /* Trigger threshold interrupt */ + N_INJ_TYPES, +}; + +static const char * const flags_options[] = { + [SW_INJ] = "sw", + [HW_INJ] = "hw", + [DFR_INT_INJ] = "df", + [THR_INT_INJ] = "th", + NULL +}; + +/* Set default injection to SW_INJ */ +static enum injection_type inj_type = SW_INJ; + +#define MCE_INJECT_SET(reg) \ +static int inj_##reg##_set(void *data, u64 val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + m->reg = val; \ + return 0; \ +} + +MCE_INJECT_SET(status); +MCE_INJECT_SET(misc); +MCE_INJECT_SET(addr); +MCE_INJECT_SET(synd); + +#define MCE_INJECT_GET(reg) \ +static int inj_##reg##_get(void *data, u64 *val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + *val = m->reg; \ + return 0; \ +} + +MCE_INJECT_GET(status); +MCE_INJECT_GET(misc); +MCE_INJECT_GET(addr); +MCE_INJECT_GET(synd); + +DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); + +static void setup_inj_struct(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + + m->cpuvendor = boot_cpu_data.x86_vendor; + m->time = ktime_get_real_seconds(); + m->cpuid = cpuid_eax(1); + m->microcode = boot_cpu_data.microcode; +} + +/* Update fake mce registers on current CPU. */ +static void inject_mce(struct mce *m) +{ + struct mce *i = &per_cpu(injectm, m->extcpu); + + /* Make sure no one reads partially written injectm */ + i->finished = 0; + mb(); + m->finished = 0; + /* First set the fields after finished */ + i->extcpu = m->extcpu; + mb(); + /* Now write record in order, finished last (except above) */ + memcpy(i, m, sizeof(struct mce)); + /* Finally activate it */ + mb(); + i->finished = 1; +} + +static void raise_poll(struct mce *m) +{ + unsigned long flags; + mce_banks_t b; + + memset(&b, 0xff, sizeof(mce_banks_t)); + local_irq_save(flags); + machine_check_poll(0, &b); + local_irq_restore(flags); + m->finished = 0; +} + +static void raise_exception(struct mce *m, struct pt_regs *pregs) +{ + struct pt_regs regs; + unsigned long flags; + + if (!pregs) { + memset(®s, 0, sizeof(struct pt_regs)); + regs.ip = m->ip; + regs.cs = m->cs; + pregs = ®s; + } + /* do_machine_check() expects interrupts disabled -- at least */ + local_irq_save(flags); + do_machine_check(pregs); + local_irq_restore(flags); + m->finished = 0; +} + +static cpumask_var_t mce_inject_cpumask; +static DEFINE_MUTEX(mce_inject_mutex); + +static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + struct mce *m = this_cpu_ptr(&injectm); + if (!cpumask_test_cpu(cpu, mce_inject_cpumask)) + return NMI_DONE; + cpumask_clear_cpu(cpu, mce_inject_cpumask); + if (m->inject_flags & MCJ_EXCEPTION) + raise_exception(m, regs); + else if (m->status) + raise_poll(m); + return NMI_HANDLED; +} + +static void mce_irq_ipi(void *info) +{ + int cpu = smp_processor_id(); + struct mce *m = this_cpu_ptr(&injectm); + + if (cpumask_test_cpu(cpu, mce_inject_cpumask) && + m->inject_flags & MCJ_EXCEPTION) { + cpumask_clear_cpu(cpu, mce_inject_cpumask); + raise_exception(m, NULL); + } +} + +/* Inject mce on current CPU */ +static int raise_local(void) +{ + struct mce *m = this_cpu_ptr(&injectm); + int context = MCJ_CTX(m->inject_flags); + int ret = 0; + int cpu = m->extcpu; + + if (m->inject_flags & MCJ_EXCEPTION) { + pr_info("Triggering MCE exception on CPU %d\n", cpu); + switch (context) { + case MCJ_CTX_IRQ: + /* + * Could do more to fake interrupts like + * calling irq_enter, but the necessary + * machinery isn't exported currently. + */ + fallthrough; + case MCJ_CTX_PROCESS: + raise_exception(m, NULL); + break; + default: + pr_info("Invalid MCE context\n"); + ret = -EINVAL; + } + pr_info("MCE exception done on CPU %d\n", cpu); + } else if (m->status) { + pr_info("Starting machine check poll CPU %d\n", cpu); + raise_poll(m); + mce_notify_irq(); + pr_info("Machine check poll done on CPU %d\n", cpu); + } else + m->finished = 0; + + return ret; +} + +static void __maybe_unused raise_mce(struct mce *m) +{ + int context = MCJ_CTX(m->inject_flags); + + inject_mce(m); + + if (context == MCJ_CTX_RANDOM) + return; + + if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { + unsigned long start; + int cpu; + + get_online_cpus(); + cpumask_copy(mce_inject_cpumask, cpu_online_mask); + cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); + for_each_online_cpu(cpu) { + struct mce *mcpu = &per_cpu(injectm, cpu); + if (!mcpu->finished || + MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) + cpumask_clear_cpu(cpu, mce_inject_cpumask); + } + if (!cpumask_empty(mce_inject_cpumask)) { + if (m->inject_flags & MCJ_IRQ_BROADCAST) { + /* + * don't wait because mce_irq_ipi is necessary + * to be sync with following raise_local + */ + preempt_disable(); + smp_call_function_many(mce_inject_cpumask, + mce_irq_ipi, NULL, 0); + preempt_enable(); + } else if (m->inject_flags & MCJ_NMI_BROADCAST) + apic->send_IPI_mask(mce_inject_cpumask, + NMI_VECTOR); + } + start = jiffies; + while (!cpumask_empty(mce_inject_cpumask)) { + if (!time_before(jiffies, start + 2*HZ)) { + pr_err("Timeout waiting for mce inject %lx\n", + *cpumask_bits(mce_inject_cpumask)); + break; + } + cpu_relax(); + } + raise_local(); + put_cpu(); + put_online_cpus(); + } else { + preempt_disable(); + raise_local(); + preempt_enable(); + } +} + +static int mce_inject_raise(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + mutex_lock(&mce_inject_mutex); + raise_mce(m); + mutex_unlock(&mce_inject_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block inject_nb = { + .notifier_call = mce_inject_raise, +}; + +/* + * Caller needs to be make sure this cpu doesn't disappear + * from under us, i.e.: get_cpu/put_cpu. + */ +static int toggle_hw_mce_inject(unsigned int cpu, bool enable) +{ + u32 l, h; + int err; + + err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); + if (err) { + pr_err("%s: error reading HWCR\n", __func__); + return err; + } + + enable ? (l |= BIT(18)) : (l &= ~BIT(18)); + + err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); + if (err) + pr_err("%s: error writing HWCR\n", __func__); + + return err; +} + +static int __set_inj(const char *buf) +{ + int i; + + for (i = 0; i < N_INJ_TYPES; i++) { + if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + inj_type = i; + return 0; + } + } + return -EINVAL; +} + +static ssize_t flags_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE]; + int n; + + n = sprintf(buf, "%s\n", flags_options[inj_type]); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static ssize_t flags_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE], *__buf; + int err; + + if (!cnt || cnt > MAX_FLAG_OPT_SIZE) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt - 1] = 0; + + /* strip whitespace */ + __buf = strstrip(buf); + + err = __set_inj(__buf); + if (err) { + pr_err("%s: Invalid flags value: %s\n", __func__, __buf); + return err; + } + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations flags_fops = { + .read = flags_read, + .write = flags_write, + .llseek = generic_file_llseek, +}; + +/* + * On which CPU to inject? + */ +MCE_INJECT_GET(extcpu); + +static int inj_extcpu_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= nr_cpu_ids || !cpu_online(val)) { + pr_err("%s: Invalid CPU: %llu\n", __func__, val); + return -EINVAL; + } + m->extcpu = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); + +static void trigger_mce(void *info) +{ + asm volatile("int $18"); +} + +static void trigger_dfr_int(void *info) +{ + asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); +} + +static void trigger_thr_int(void *info) +{ + asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); +} + +static u32 get_nbc_for_node(int node_id) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + u32 cores_per_node; + + cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); + + return cores_per_node * node_id; +} + +static void toggle_nb_mca_mst_cpu(u16 nid) +{ + struct amd_northbridge *nb; + struct pci_dev *F3; + u32 val; + int err; + + nb = node_to_amd_nb(nid); + if (!nb) + return; + + F3 = nb->misc; + if (!F3) + return; + + err = pci_read_config_dword(F3, NBCFG, &val); + if (err) { + pr_err("%s: Error reading F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); + return; + } + + if (val & BIT(27)) + return; + + pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", + __func__); + + val |= BIT(27); + err = pci_write_config_dword(F3, NBCFG, val); + if (err) + pr_err("%s: Error writing F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); +} + +static void prepare_msrs(void *info) +{ + struct mce m = *(struct mce *)info; + u8 b = m.bank; + + wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + + if (boot_cpu_has(X86_FEATURE_SMCA)) { + if (m.inject_flags == DFR_INT_INJ) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); + } else { + wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); + } + + wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + } else { + wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); + } +} + +static void do_inject(void) +{ + u64 mcg_status = 0; + unsigned int cpu = i_mce.extcpu; + u8 b = i_mce.bank; + + i_mce.tsc = rdtsc_ordered(); + + if (i_mce.misc) + i_mce.status |= MCI_STATUS_MISCV; + + if (i_mce.synd) + i_mce.status |= MCI_STATUS_SYNDV; + + if (inj_type == SW_INJ) { + mce_log(&i_mce); + return; + } + + /* prep MCE global settings for the injection */ + mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; + + if (!(i_mce.status & MCI_STATUS_PCC)) + mcg_status |= MCG_STATUS_RIPV; + + /* + * Ensure necessary status bits for deferred errors: + * - MCx_STATUS[Deferred]: make sure it is a deferred error + * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC + */ + if (inj_type == DFR_INT_INJ) { + i_mce.status |= MCI_STATUS_DEFERRED; + i_mce.status &= ~MCI_STATUS_UC; + } + + /* + * For multi node CPUs, logging and reporting of bank 4 errors happens + * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for + * Fam10h and later BKDGs. + */ + if (boot_cpu_has(X86_FEATURE_AMD_DCM) && + b == 4 && + boot_cpu_data.x86 < 0x17) { + toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); + cpu = get_nbc_for_node(amd_get_nb_id(cpu)); + } + + get_online_cpus(); + if (!cpu_online(cpu)) + goto err; + + toggle_hw_mce_inject(cpu, true); + + i_mce.mcgstatus = mcg_status; + i_mce.inject_flags = inj_type; + smp_call_function_single(cpu, prepare_msrs, &i_mce, 0); + + toggle_hw_mce_inject(cpu, false); + + switch (inj_type) { + case DFR_INT_INJ: + smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); + break; + case THR_INT_INJ: + smp_call_function_single(cpu, trigger_thr_int, NULL, 0); + break; + default: + smp_call_function_single(cpu, trigger_mce, NULL, 0); + } + +err: + put_online_cpus(); + +} + +/* + * This denotes into which bank we're injecting and triggers + * the injection, at the same time. + */ +static int inj_bank_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + u8 n_banks; + u64 cap; + + /* Get bank count on target CPU so we can handle non-uniform values. */ + rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); + n_banks = cap & MCG_BANKCNT_MASK; + + if (val >= n_banks) { + pr_err("MCA bank %llu non-existent on CPU%d\n", val, m->extcpu); + return -EINVAL; + } + + m->bank = val; + do_inject(); + + /* Reset injection struct */ + setup_inj_struct(&i_mce); + + return 0; +} + +MCE_INJECT_GET(bank); + +DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); + +static const char readme_msg[] = +"Description of the files and their usages:\n" +"\n" +"Note1: i refers to the bank number below.\n" +"Note2: See respective BKDGs for the exact bit definitions of the files below\n" +"as they mirror the hardware registers.\n" +"\n" +"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" +"\t attributes of the error which caused the MCE.\n" +"\n" +"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" +"\t used for error thresholding purposes and its validity is indicated by\n" +"\t MCi_STATUS[MiscV].\n" +"\n" +"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n" +"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" +"\n" +"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" +"\t associated with the error.\n" +"\n" +"cpu:\t The CPU to inject the error on.\n" +"\n" +"bank:\t Specify the bank you want to inject the error into: the number of\n" +"\t banks in a processor varies and is family/model-specific, therefore, the\n" +"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" +"\t injection.\n" +"\n" +"flags:\t Injection type to be performed. Writing to this file will trigger a\n" +"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" +"\t for AMD processors.\n" +"\n" +"\t Allowed error injection types:\n" +"\t - \"sw\": Software error injection. Decode error to a human-readable \n" +"\t format only. Safe to use.\n" +"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" +"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" +"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" +"\t before injecting.\n" +"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" +"\t error APIC interrupt handler to handle the error if the feature is \n" +"\t is present in hardware. \n" +"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" +"\t APIC interrupt handler to handle the error. \n" +"\n"; + +static ssize_t +inj_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static const struct file_operations readme_fops = { + .read = inj_readme_read, +}; + +static struct dfs_node { + char *name; + const struct file_operations *fops; + umode_t perm; +} dfs_fls[] = { + { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, +}; + +static void __init debugfs_init(void) +{ + unsigned int i; + + dfs_inj = debugfs_create_dir("mce-inject", NULL); + + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) + debugfs_create_file(dfs_fls[i].name, dfs_fls[i].perm, dfs_inj, + &i_mce, dfs_fls[i].fops); +} + +static int __init inject_init(void) +{ + if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) + return -ENOMEM; + + debugfs_init(); + + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); + mce_register_injector_chain(&inject_nb); + + setup_inj_struct(&i_mce); + + pr_info("Machine check injector initialized\n"); + + return 0; +} + +static void __exit inject_exit(void) +{ + + mce_unregister_injector_chain(&inject_nb); + unregister_nmi_handler(NMI_LOCAL, "mce_notify"); + + debugfs_remove_recursive(dfs_inj); + dfs_inj = NULL; + + memset(&dfs_fls, 0, sizeof(dfs_fls)); + + free_cpumask_var(mce_inject_cpumask); +} + +module_init(inject_init); +module_exit(inject_exit); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c new file mode 100644 index 000000000..886d4648c --- /dev/null +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -0,0 +1,543 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel specific MCE features. + * Copyright 2004 Zwane Mwaikambo + * Copyright (C) 2008, 2009 Intel Corporation + * Author: Andi Kleen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * Support for Intel Correct Machine Check Interrupts. This allows + * the CPU to raise an interrupt when a corrected machine check happened. + * Normally we pick those up using a regular polling timer. + * Also supports reliable discovery of shared banks. + */ + +/* + * CMCI can be delivered to multiple cpus that share a machine check bank + * so we need to designate a single cpu to process errors logged in each bank + * in the interrupt handler (otherwise we would have many races and potential + * double reporting of the same error). + * Note that this can change when a cpu is offlined or brought online since + * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear() + * disables CMCI on all banks owned by the cpu and clears this bitfield. At + * this point, cmci_rediscover() kicks in and a different cpu may end up + * taking ownership of some of the shared MCA banks that were previously + * owned by the offlined cpu. + */ +static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); + +/* + * CMCI storm detection backoff counter + * + * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've + * encountered an error. If not, we decrement it by one. We signal the end of + * the CMCI storm when it reaches 0. + */ +static DEFINE_PER_CPU(int, cmci_backoff_cnt); + +/* + * cmci_discover_lock protects against parallel discovery attempts + * which could race against each other. + */ +static DEFINE_RAW_SPINLOCK(cmci_discover_lock); + +#define CMCI_THRESHOLD 1 +#define CMCI_POLL_INTERVAL (30 * HZ) +#define CMCI_STORM_INTERVAL (HZ) +#define CMCI_STORM_THRESHOLD 15 + +static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); +static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); +static DEFINE_PER_CPU(unsigned int, cmci_storm_state); + +enum { + CMCI_STORM_NONE, + CMCI_STORM_ACTIVE, + CMCI_STORM_SUBSIDED, +}; + +static atomic_t cmci_storm_on_cpus; + +static int cmci_supported(int *banks) +{ + u64 cap; + + if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) + return 0; + + /* + * Vendor check is not strictly needed, but the initial + * initialization is vendor keyed and this + * makes sure none of the backdoors are entered otherwise. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) + return 0; + + if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) + return 0; + rdmsrl(MSR_IA32_MCG_CAP, cap); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + return !!(cap & MCG_CMCI_P); +} + +static bool lmce_supported(void) +{ + u64 tmp; + + if (mca_cfg.lmce_disabled) + return false; + + rdmsrl(MSR_IA32_MCG_CAP, tmp); + + /* + * LMCE depends on recovery support in the processor. Hence both + * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP. + */ + if ((tmp & (MCG_SER_P | MCG_LMCE_P)) != + (MCG_SER_P | MCG_LMCE_P)) + return false; + + /* + * BIOS should indicate support for LMCE by setting bit 20 in + * IA32_FEAT_CTL without which touching MCG_EXT_CTL will generate a #GP + * fault. The MSR must also be locked for LMCE_ENABLED to take effect. + * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally + * locks the MSR in the event that it wasn't already locked by BIOS. + */ + rdmsrl(MSR_IA32_FEAT_CTL, tmp); + if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED))) + return false; + + return tmp & FEAT_CTL_LMCE_ENABLED; +} + +bool mce_intel_cmci_poll(void) +{ + if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) + return false; + + /* + * Reset the counter if we've logged an error in the last poll + * during the storm. + */ + if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned))) + this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); + else + this_cpu_dec(cmci_backoff_cnt); + + return true; +} + +void mce_intel_hcpu_update(unsigned long cpu) +{ + if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) + atomic_dec(&cmci_storm_on_cpus); + + per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; +} + +static void cmci_toggle_interrupt_mode(bool on) +{ + unsigned long flags, *owned; + int bank; + u64 val; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + owned = this_cpu_ptr(mce_banks_owned); + for_each_set_bit(bank, owned, MAX_NR_BANKS) { + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + + if (on) + val |= MCI_CTL2_CMCI_EN; + else + val &= ~MCI_CTL2_CMCI_EN; + + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + } + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +unsigned long cmci_intel_adjust_timer(unsigned long interval) +{ + if ((this_cpu_read(cmci_backoff_cnt) > 0) && + (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) { + mce_notify_irq(); + return CMCI_STORM_INTERVAL; + } + + switch (__this_cpu_read(cmci_storm_state)) { + case CMCI_STORM_ACTIVE: + + /* + * We switch back to interrupt mode once the poll timer has + * silenced itself. That means no events recorded and the timer + * interval is back to our poll interval. + */ + __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); + if (!atomic_sub_return(1, &cmci_storm_on_cpus)) + pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + + fallthrough; + + case CMCI_STORM_SUBSIDED: + /* + * We wait for all CPUs to go back to SUBSIDED state. When that + * happens we switch back to interrupt mode. + */ + if (!atomic_read(&cmci_storm_on_cpus)) { + __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); + cmci_toggle_interrupt_mode(true); + cmci_recheck(); + } + return CMCI_POLL_INTERVAL; + default: + + /* We have shiny weather. Let the poll do whatever it thinks. */ + return interval; + } +} + +static bool cmci_storm_detect(void) +{ + unsigned int cnt = __this_cpu_read(cmci_storm_cnt); + unsigned long ts = __this_cpu_read(cmci_time_stamp); + unsigned long now = jiffies; + int r; + + if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) + return true; + + if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { + cnt++; + } else { + cnt = 1; + __this_cpu_write(cmci_time_stamp, now); + } + __this_cpu_write(cmci_storm_cnt, cnt); + + if (cnt <= CMCI_STORM_THRESHOLD) + return false; + + cmci_toggle_interrupt_mode(false); + __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); + r = atomic_add_return(1, &cmci_storm_on_cpus); + mce_timer_kick(CMCI_STORM_INTERVAL); + this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); + + if (r == 1) + pr_notice("CMCI storm detected: switching to poll mode\n"); + return true; +} + +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + if (cmci_storm_detect()) + return; + + machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); +} + +/* + * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks + * on this CPU. Use the algorithm recommended in the SDM to discover shared + * banks. + */ +static void cmci_discover(int banks) +{ + unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned); + unsigned long flags; + int i; + int bios_wrong_thresh = 0; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + for (i = 0; i < banks; i++) { + u64 val; + int bios_zero_thresh = 0; + + if (test_bit(i, owned)) + continue; + + /* Skip banks in firmware first mode */ + if (test_bit(i, mce_banks_ce_disabled)) + continue; + + rdmsrl(MSR_IA32_MCx_CTL2(i), val); + + /* Already owned by someone else? */ + if (val & MCI_CTL2_CMCI_EN) { + clear_bit(i, owned); + __clear_bit(i, this_cpu_ptr(mce_poll_banks)); + continue; + } + + if (!mca_cfg.bios_cmci_threshold) { + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= CMCI_THRESHOLD; + } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { + /* + * If bios_cmci_threshold boot option was specified + * but the threshold is zero, we'll try to initialize + * it to 1. + */ + bios_zero_thresh = 1; + val |= CMCI_THRESHOLD; + } + + val |= MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(i), val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); + + /* Did the enable bit stick? -- the bank supports CMCI */ + if (val & MCI_CTL2_CMCI_EN) { + set_bit(i, owned); + __clear_bit(i, this_cpu_ptr(mce_poll_banks)); + /* + * We are able to set thresholds for some banks that + * had a threshold of 0. This means the BIOS has not + * set the thresholds properly or does not work with + * this boot option. Note down now and report later. + */ + if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && + (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) + bios_wrong_thresh = 1; + } else { + WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks))); + } + } + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { + pr_info_once( + "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); + pr_info_once( + "bios_cmci_threshold: Make sure your BIOS supports this boot option\n"); + } +} + +/* + * Just in case we missed an event during initialization check + * all the CMCI owned banks. + */ +void cmci_recheck(void) +{ + unsigned long flags; + int banks; + + if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) + return; + + local_irq_save(flags); + machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)); + local_irq_restore(flags); +} + +/* Caller must hold the lock on cmci_discover_lock */ +static void __cmci_disable_bank(int bank) +{ + u64 val; + + if (!test_bit(bank, this_cpu_ptr(mce_banks_owned))) + return; + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + __clear_bit(bank, this_cpu_ptr(mce_banks_owned)); +} + +/* + * Disable CMCI on this CPU for all banks it owns when it goes down. + * This allows other CPUs to claim the banks on rediscovery. + */ +void cmci_clear(void) +{ + unsigned long flags; + int i; + int banks; + + if (!cmci_supported(&banks)) + return; + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + for (i = 0; i < banks; i++) + __cmci_disable_bank(i); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +static void cmci_rediscover_work_func(void *arg) +{ + int banks; + + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks); +} + +/* After a CPU went down cycle through all the others and rediscover */ +void cmci_rediscover(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + on_each_cpu(cmci_rediscover_work_func, NULL, 1); +} + +/* + * Reenable CMCI on this CPU in case a CPU down failed. + */ +void cmci_reenable(void) +{ + int banks; + if (cmci_supported(&banks)) + cmci_discover(banks); +} + +void cmci_disable_bank(int bank) +{ + int banks; + unsigned long flags; + + if (!cmci_supported(&banks)) + return; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + __cmci_disable_bank(bank); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +void intel_init_cmci(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + mce_threshold_vector = intel_threshold_interrupt; + cmci_discover(banks); + /* + * For CPU #0 this runs with still disabled APIC, but that's + * ok because only the vector is set up. We still do another + * check for the banks later for CPU #0 just to make sure + * to not miss any events. + */ + apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); + cmci_recheck(); +} + +void intel_init_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + + if (!(val & MCG_EXT_CTL_LMCE_EN)) + wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); +} + +void intel_clear_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + val &= ~MCG_EXT_CTL_LMCE_EN; + wrmsrl(MSR_IA32_MCG_EXT_CTL, val); +} + +static void intel_ppin_init(struct cpuinfo_x86 *c) +{ + unsigned long long val; + + /* + * Even if testing the presence of the MSR would be enough, we don't + * want to risk the situation where other models reuse this MSR for + * other purposes. + */ + switch (c->x86_model) { + case INTEL_FAM6_IVYBRIDGE_X: + case INTEL_FAM6_HASWELL_X: + case INTEL_FAM6_BROADWELL_D: + case INTEL_FAM6_BROADWELL_X: + case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_ICELAKE_D: + case INTEL_FAM6_SAPPHIRERAPIDS_X: + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: + + if (rdmsrl_safe(MSR_PPIN_CTL, &val)) + return; + + if ((val & 3UL) == 1UL) { + /* PPIN locked in disabled mode */ + return; + } + + /* If PPIN is disabled, try to enable */ + if (!(val & 2UL)) { + wrmsrl_safe(MSR_PPIN_CTL, val | 2UL); + rdmsrl_safe(MSR_PPIN_CTL, &val); + } + + /* Is the enable bit set? */ + if (val & 2UL) + set_cpu_cap(c, X86_FEATURE_INTEL_PPIN); + } +} + +void mce_intel_feature_init(struct cpuinfo_x86 *c) +{ + intel_init_thermal(c); + intel_init_cmci(); + intel_init_lmce(); + intel_ppin_init(c); +} + +void mce_intel_feature_clear(struct cpuinfo_x86 *c) +{ + intel_clear_lmce(); +} + +bool intel_filter_mce(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */ + if ((c->x86 == 6) && + ((c->x86_model == INTEL_FAM6_HASWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_L) || + (c->x86_model == INTEL_FAM6_BROADWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_G) || + (c->x86_model == INTEL_FAM6_SKYLAKE_X)) && + (m->bank == 0) && + ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) + return true; + + return false; +} diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h new file mode 100644 index 000000000..3a485c0d5 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_MCE_INTERNAL_H__ +#define __X86_MCE_INTERNAL_H__ + +#undef pr_fmt +#define pr_fmt(fmt) "mce: " fmt + +#include +#include + +/* Pointer to the installed machine check handler for this CPU setup. */ +extern void (*machine_check_vector)(struct pt_regs *); + +enum severity_level { + MCE_NO_SEVERITY, + MCE_DEFERRED_SEVERITY, + MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY, + MCE_KEEP_SEVERITY, + MCE_SOME_SEVERITY, + MCE_AO_SEVERITY, + MCE_UC_SEVERITY, + MCE_AR_SEVERITY, + MCE_PANIC_SEVERITY, +}; + +extern struct blocking_notifier_head x86_mce_decoder_chain; + +#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ + +struct mce_evt_llist { + struct llist_node llnode; + struct mce mce; +}; + +void mce_gen_pool_process(struct work_struct *__unused); +bool mce_gen_pool_empty(void); +int mce_gen_pool_add(struct mce *mce); +int mce_gen_pool_init(void); +struct llist_node *mce_gen_pool_prepare_records(void); + +extern int (*mce_severity)(struct mce *a, struct pt_regs *regs, + int tolerant, char **msg, bool is_excp); +struct dentry *mce_get_debugfs_dir(void); + +extern mce_banks_t mce_banks_ce_disabled; + +#ifdef CONFIG_X86_MCE_INTEL +unsigned long cmci_intel_adjust_timer(unsigned long interval); +bool mce_intel_cmci_poll(void); +void mce_intel_hcpu_update(unsigned long cpu); +void cmci_disable_bank(int bank); +void intel_init_cmci(void); +void intel_init_lmce(void); +void intel_clear_lmce(void); +bool intel_filter_mce(struct mce *m); +#else +# define cmci_intel_adjust_timer mce_adjust_timer_default +static inline bool mce_intel_cmci_poll(void) { return false; } +static inline void mce_intel_hcpu_update(unsigned long cpu) { } +static inline void cmci_disable_bank(int bank) { } +static inline void intel_init_cmci(void) { } +static inline void intel_init_lmce(void) { } +static inline void intel_clear_lmce(void) { } +static inline bool intel_filter_mce(struct mce *m) { return false; }; +#endif + +void mce_timer_kick(unsigned long interval); + +#ifdef CONFIG_ACPI_APEI +int apei_write_mce(struct mce *m); +ssize_t apei_read_mce(struct mce *m, u64 *record_id); +int apei_check_mce(void); +int apei_clear_mce(u64 record_id); +#else +static inline int apei_write_mce(struct mce *m) +{ + return -EINVAL; +} +static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + return 0; +} +static inline int apei_check_mce(void) +{ + return 0; +} +static inline int apei_clear_mce(u64 record_id) +{ + return -EINVAL; +} +#endif + +/* + * We consider records to be equivalent if bank+status+addr+misc all match. + * This is only used when the system is going down because of a fatal error + * to avoid cluttering the console log with essentially repeated information. + * In normal processing all errors seen are logged. + */ +static inline bool mce_cmp(struct mce *m1, struct mce *m2) +{ + return m1->bank != m2->bank || + m1->status != m2->status || + m1->addr != m2->addr || + m1->misc != m2->misc; +} + +extern struct device_attribute dev_attr_trigger; + +#ifdef CONFIG_X86_MCELOG_LEGACY +void mce_work_trigger(void); +void mce_register_injector_chain(struct notifier_block *nb); +void mce_unregister_injector_chain(struct notifier_block *nb); +#else +static inline void mce_work_trigger(void) { } +static inline void mce_register_injector_chain(struct notifier_block *nb) { } +static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } +#endif + +struct mca_config { + bool dont_log_ce; + bool cmci_disabled; + bool ignore_ce; + bool print_all; + + __u64 lmce_disabled : 1, + disabled : 1, + ser : 1, + recovery : 1, + bios_cmci_threshold : 1, + __reserved : 59; + + s8 bootlog; + int tolerant; + int monarch_timeout; + int panic_timeout; + u32 rip_msr; +}; + +extern struct mca_config mca_cfg; +DECLARE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); + +struct mce_vendor_flags { + /* + * Indicates that overflow conditions are not fatal, when set. + */ + __u64 overflow_recov : 1, + + /* + * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and + * Recovery. It indicates support for data poisoning in HW and deferred + * error interrupts. + */ + succor : 1, + + /* + * (AMD) SMCA: This bit indicates support for Scalable MCA which expands + * the register space for each MCA bank and also increases number of + * banks. Also, to accommodate the new banks and registers, the MCA + * register space is moved to a new MSR range. + */ + smca : 1, + + /* AMD-style error thresholding banks present. */ + amd_threshold : 1, + + __reserved_0 : 60; +}; + +extern struct mce_vendor_flags mce_flags; + +enum mca_msr { + MCA_CTL, + MCA_STATUS, + MCA_ADDR, + MCA_MISC, +}; + +u32 mca_msr_reg(int bank, enum mca_msr reg); + +/* Decide whether to add MCE record to MCE event pool or filter it out. */ +extern bool filter_mce(struct mce *m); + +#ifdef CONFIG_X86_MCE_AMD +extern bool amd_filter_mce(struct mce *m); +#else +static inline bool amd_filter_mce(struct mce *m) { return false; }; +#endif + +__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr); + +__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr); + +#endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c new file mode 100644 index 000000000..19e90cae8 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * P5 specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + +/* By default disabled */ +int mce_p5_enabled __read_mostly; + +/* Machine check handler for Pentium class Intel CPUs: */ +static noinstr void pentium_machine_check(struct pt_regs *regs) +{ + u32 loaddr, hi, lotype; + + instrumentation_begin(); + rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); + rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); + + pr_emerg("CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", + smp_processor_id(), loaddr, lotype); + + if (lotype & (1<<5)) { + pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n", + smp_processor_id()); + } + + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + instrumentation_end(); +} + +/* Set up machine check reporting for processors with Intel style MCE: */ +void intel_p5_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + + /* Default P5 to off as its often misconnected: */ + if (!mce_p5_enabled) + return; + + /* Check for MCE support: */ + if (!cpu_has(c, X86_FEATURE_MCE)) + return; + + machine_check_vector = pentium_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ + wmb(); + + /* Read registers before enabling: */ + rdmsr(MSR_IA32_P5_MC_ADDR, l, h); + rdmsr(MSR_IA32_P5_MC_TYPE, l, h); + pr_info("Intel old style machine check architecture supported.\n"); + + /* Enable MCE: */ + cr4_set_bits(X86_CR4_MCE); + pr_info("Intel old style machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); +} diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c new file mode 100644 index 000000000..83df99131 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -0,0 +1,492 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MCE grading rules. + * Copyright 2008, 2009 Intel Corporation. + * + * Author: Andi Kleen + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * Grade an mce by severity. In general the most severe ones are processed + * first. Since there are quite a lot of combinations test the bits in a + * table-driven way. The rules are simply processed in order, first + * match wins. + * + * Note this is only used for machine check exceptions, the corrected + * errors use much simpler rules. The exceptions still check for the corrected + * errors, but only to leave them alone for the CMCI handler (except for + * panic situations) + */ + +enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 }; +enum ser { SER_REQUIRED = 1, NO_SER = 2 }; +enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; + +static struct severity { + u64 mask; + u64 result; + unsigned char sev; + unsigned char mcgmask; + unsigned char mcgres; + unsigned char ser; + unsigned char context; + unsigned char excp; + unsigned char covered; + unsigned char cpu_model; + unsigned char cpu_minstepping; + unsigned char bank_lo, bank_hi; + char *msg; +} severities[] = { +#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } +#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h +#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define KERNEL_RECOV .context = IN_KERNEL_RECOV +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER +#define EXCP .excp = EXCP_CONTEXT +#define NOEXCP .excp = NO_EXCP +#define BITCLR(x) .mask = x, .result = 0 +#define BITSET(x) .mask = x, .result = x +#define MCGMASK(x, y) .mcgmask = x, .mcgres = y +#define MASK(x, y) .mask = x, .result = y +#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) +#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) + + MCESEV( + NO, "Invalid", + BITCLR(MCI_STATUS_VAL) + ), + MCESEV( + NO, "Not enabled", + EXCP, BITCLR(MCI_STATUS_EN) + ), + MCESEV( + PANIC, "Processor context corrupt", + BITSET(MCI_STATUS_PCC) + ), + /* When MCIP is not set something is very confused */ + MCESEV( + PANIC, "MCIP not set in MCA handler", + EXCP, MCGMASK(MCG_STATUS_MCIP, 0) + ), + /* Neither return not error IP -- no chance to recover -> PANIC */ + MCESEV( + PANIC, "Neither restart nor error IP", + EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) + ), + MCESEV( + PANIC, "In kernel and no restart IP", + EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + PANIC, "In kernel and no restart IP", + EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + KEEP, "Corrected error", + NOSER, BITCLR(MCI_STATUS_UC) + ), + /* + * known AO MCACODs reported via MCE or CMC: + * + * SRAO could be signaled either via a machine check exception or + * CMCI with the corresponding bit S 1 or 0. So we don't need to + * check bit S for SRAO. + */ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + ), + /* + * Quirk for Skylake/Cascade Lake. Patrol scrubber may be configured + * to report uncorrected errors using CMCI with a special signature. + * UC=0, MSCOD=0x0010, MCACOD=binary(000X 0000 1100 XXXX) reported + * in one of the memory controller banks. + * Set severity to "AO" for same action as normal patrol scrub error. + */ + MCESEV( + AO, "Uncorrected Patrol Scrub Error", + SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0), + MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18) + ), + + /* ignore OVER for UCNA */ + MCESEV( + UCNA, "Uncorrected no action required", + SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) + ), + MCESEV( + PANIC, "Illegal combination (UCNA with AR=1)", + SER, + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) + ), + MCESEV( + KEEP, "Non signalled machine check", + SER, BITCLR(MCI_STATUS_S) + ), + + MCESEV( + PANIC, "Action required with lost events", + SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) + ), + + /* known AR MCACODs: */ +#ifdef CONFIG_MEMORY_FAILURE + MCESEV( + KEEP, "Action required but unaffected thread is continuable", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR), + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) + ), + MCESEV( + AR, "Action required: data load in error recoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL_RECOV + ), + MCESEV( + AR, "Action required: data load error in a user process", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + USER + ), + MCESEV( + AR, "Action required: instruction fetch error in a user process", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + USER + ), + MCESEV( + PANIC, "Data load in unrecoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL + ), + MCESEV( + PANIC, "Instruction fetch error in kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + KERNEL + ), +#endif + MCESEV( + PANIC, "Action required: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) + ), + + MCESEV( + SOME, "Action optional: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) + ), + MCESEV( + SOME, "Action optional with lost events", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S) + ), + + MCESEV( + PANIC, "Overflowed uncorrected", + BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) + ), + MCESEV( + UC, "Uncorrected", + BITSET(MCI_STATUS_UC) + ), + MCESEV( + SOME, "No match", + BITSET(0) + ) /* always matches. keep at end */ +}; + +#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ + (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) + +static bool is_copy_from_user(struct pt_regs *regs) +{ + u8 insn_buf[MAX_INSN_SIZE]; + struct insn insn; + unsigned long addr; + + if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) + return false; + + kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE); + insn_get_opcode(&insn); + if (!insn.opcode.got) + return false; + + switch (insn.opcode.value) { + /* MOV mem,reg */ + case 0x8A: case 0x8B: + /* MOVZ mem,reg */ + case 0xB60F: case 0xB70F: + insn_get_modrm(&insn); + insn_get_sib(&insn); + if (!insn.modrm.got || !insn.sib.got) + return false; + addr = (unsigned long)insn_get_addr_ref(&insn, regs); + break; + /* REP MOVS */ + case 0xA4: case 0xA5: + addr = regs->si; + break; + default: + return false; + } + + if (fault_in_kernel_space(addr)) + return false; + + current->mce_vaddr = (void __user *)addr; + + return true; +} + +/* + * If mcgstatus indicated that ip/cs on the stack were + * no good, then "m->cs" will be zero and we will have + * to assume the worst case (IN_KERNEL) as we actually + * have no idea what we were executing when the machine + * check hit. + * If we do have a good "m->cs" (or a faked one in the + * case we were executing in VM86 mode) we can use it to + * distinguish an exception taken in user from from one + * taken in the kernel. + */ +static int error_context(struct mce *m, struct pt_regs *regs) +{ + enum handler_type t; + + if ((m->cs & 3) == 3) + return IN_USER; + if (!mc_recoverable(m->mcgstatus)) + return IN_KERNEL; + + t = ex_get_fault_handler_type(m->ip); + if (t == EX_HANDLER_FAULT) { + m->kflags |= MCE_IN_KERNEL_RECOV; + return IN_KERNEL_RECOV; + } + if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) { + m->kflags |= MCE_IN_KERNEL_RECOV; + m->kflags |= MCE_IN_KERNEL_COPYIN; + return IN_KERNEL_RECOV; + } + + return IN_KERNEL; +} + +static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) +{ + u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + u32 low, high; + + /* + * We need to look at the following bits: + * - "succor" bit (data poisoning support), and + * - TCC bit (Task Context Corrupt) + * in MCi_STATUS to determine error severity. + */ + if (!mce_flags.succor) + return MCE_PANIC_SEVERITY; + + if (rdmsr_safe(addr, &low, &high)) + return MCE_PANIC_SEVERITY; + + /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */ + if ((low & MCI_CONFIG_MCAX) && + (m->status & MCI_STATUS_TCC) && + (err_ctx == IN_KERNEL)) + return MCE_PANIC_SEVERITY; + + /* ...otherwise invoke hwpoison handler. */ + return MCE_AR_SEVERITY; +} + +/* + * See AMD Error Scope Hierarchy table in a newer BKDG. For example + * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" + */ +static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant, + char **msg, bool is_excp) +{ + enum context ctx = error_context(m, regs); + + /* Processor Context Corrupt, no need to fumble too much, die! */ + if (m->status & MCI_STATUS_PCC) + return MCE_PANIC_SEVERITY; + + if (m->status & MCI_STATUS_UC) { + + if (ctx == IN_KERNEL) + return MCE_PANIC_SEVERITY; + + /* + * On older systems where overflow_recov flag is not present, we + * should simply panic if an error overflow occurs. If + * overflow_recov flag is present and set, then software can try + * to at least kill process to prolong system operation. + */ + if (mce_flags.overflow_recov) { + if (mce_flags.smca) + return mce_severity_amd_smca(m, ctx); + + /* kill current process */ + return MCE_AR_SEVERITY; + } else { + /* at least one error was not logged */ + if (m->status & MCI_STATUS_OVER) + return MCE_PANIC_SEVERITY; + } + + /* + * For any other case, return MCE_UC_SEVERITY so that we log the + * error and exit #MC handler. + */ + return MCE_UC_SEVERITY; + } + + /* + * deferred error: poll handler catches these and adds to mce_ring so + * memory-failure can take recovery actions. + */ + if (m->status & MCI_STATUS_DEFERRED) + return MCE_DEFERRED_SEVERITY; + + /* + * corrected error: poll handler catches these and passes responsibility + * of decoding the error to EDAC + */ + return MCE_KEEP_SEVERITY; +} + +static int mce_severity_intel(struct mce *m, struct pt_regs *regs, + int tolerant, char **msg, bool is_excp) +{ + enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); + enum context ctx = error_context(m, regs); + struct severity *s; + + for (s = severities;; s++) { + if ((m->status & s->mask) != s->result) + continue; + if ((m->mcgstatus & s->mcgmask) != s->mcgres) + continue; + if (s->ser == SER_REQUIRED && !mca_cfg.ser) + continue; + if (s->ser == NO_SER && mca_cfg.ser) + continue; + if (s->context && ctx != s->context) + continue; + if (s->excp && excp != s->excp) + continue; + if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model) + continue; + if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping) + continue; + if (s->bank_lo && (m->bank < s->bank_lo || m->bank > s->bank_hi)) + continue; + if (msg) + *msg = s->msg; + s->covered = 1; + if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { + if (tolerant < 1) + return MCE_PANIC_SEVERITY; + } + return s->sev; + } +} + +/* Default to mce_severity_intel */ +int (*mce_severity)(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, bool is_excp) = + mce_severity_intel; + +void __init mcheck_vendor_init_severity(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + mce_severity = mce_severity_amd; +} + +#ifdef CONFIG_DEBUG_FS +static void *s_start(struct seq_file *f, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void *s_next(struct seq_file *f, void *data, loff_t *pos) +{ + if (++(*pos) >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void s_stop(struct seq_file *f, void *data) +{ +} + +static int s_show(struct seq_file *f, void *data) +{ + struct severity *ser = data; + seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); + return 0; +} + +static const struct seq_operations severities_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int severities_coverage_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &severities_seq_ops); +} + +static ssize_t severities_coverage_write(struct file *file, + const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int i; + for (i = 0; i < ARRAY_SIZE(severities); i++) + severities[i].covered = 0; + return count; +} + +static const struct file_operations severities_coverage_fops = { + .open = severities_coverage_open, + .release = seq_release, + .read = seq_read, + .write = severities_coverage_write, + .llseek = seq_lseek, +}; + +static int __init severities_debugfs_init(void) +{ + struct dentry *dmce; + + dmce = mce_get_debugfs_dir(); + + debugfs_create_file("severities-coverage", 0444, dmce, NULL, + &severities_coverage_fops); + return 0; +} +late_initcall(severities_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c new file mode 100644 index 000000000..a7cd2d203 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/therm_throt.c @@ -0,0 +1,739 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Thermal throttle event support code (such as syslog messaging and rate + * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + * + * This allows consistent reporting of CPU thermal throttle events. + * + * Maintains a counter in /sys that keeps track of the number of thermal + * events, such that the user knows how bad the thermal problem might be + * (since the logging to syslog is rate limited). + * + * Author: Dmitriy Zavin (dmitriyz@google.com) + * + * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. + * Inspired by Ross Biro's and Al Borchers' counter code. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* How long to wait between reporting thermal events */ +#define CHECK_INTERVAL (300 * HZ) + +#define THERMAL_THROTTLING_EVENT 0 +#define POWER_LIMIT_EVENT 1 + +/** + * struct _thermal_state - Represent the current thermal event state + * @next_check: Stores the next timestamp, when it is allowed + * to log the next warning message. + * @last_interrupt_time: Stores the timestamp for the last threshold + * high event. + * @therm_work: Delayed workqueue structure + * @count: Stores the current running count for thermal + * or power threshold interrupts. + * @last_count: Stores the previous running count for thermal + * or power threshold interrupts. + * @max_time_ms: This shows the maximum amount of time CPU was + * in throttled state for a single thermal + * threshold high to low state. + * @total_time_ms: This is a cumulative time during which CPU was + * in the throttled state. + * @rate_control_active: Set when a throttling message is logged. + * This is used for the purpose of rate-control. + * @new_event: Stores the last high/low status of the + * THERM_STATUS_PROCHOT or + * THERM_STATUS_POWER_LIMIT. + * @level: Stores whether this _thermal_state instance is + * for a CORE level or for PACKAGE level. + * @sample_index: Index for storing the next sample in the buffer + * temp_samples[]. + * @sample_count: Total number of samples collected in the buffer + * temp_samples[]. + * @average: The last moving average of temperature samples + * @baseline_temp: Temperature at which thermal threshold high + * interrupt was generated. + * @temp_samples: Storage for temperature samples to calculate + * moving average. + * + * This structure is used to represent data related to thermal state for a CPU. + * There is a separate storage for core and package level for each CPU. + */ +struct _thermal_state { + u64 next_check; + u64 last_interrupt_time; + struct delayed_work therm_work; + unsigned long count; + unsigned long last_count; + unsigned long max_time_ms; + unsigned long total_time_ms; + bool rate_control_active; + bool new_event; + u8 level; + u8 sample_index; + u8 sample_count; + u8 average; + u8 baseline_temp; + u8 temp_samples[3]; +}; + +struct thermal_state { + struct _thermal_state core_throttle; + struct _thermal_state core_power_limit; + struct _thermal_state package_throttle; + struct _thermal_state package_power_limit; + struct _thermal_state core_thresh0; + struct _thermal_state core_thresh1; + struct _thermal_state pkg_thresh0; + struct _thermal_state pkg_thresh1; +}; + +/* Callback to handle core threshold interrupts */ +int (*platform_thermal_notify)(__u64 msr_val); +EXPORT_SYMBOL(platform_thermal_notify); + +/* Callback to handle core package threshold_interrupts */ +int (*platform_thermal_package_notify)(__u64 msr_val); +EXPORT_SYMBOL_GPL(platform_thermal_package_notify); + +/* Callback support of rate control, return true, if + * callback has rate control */ +bool (*platform_thermal_package_rate_control)(void); +EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control); + + +static DEFINE_PER_CPU(struct thermal_state, thermal_state); + +static atomic_t therm_throt_en = ATOMIC_INIT(0); + +static u32 lvtthmr_init __read_mostly; + +#ifdef CONFIG_SYSFS +#define define_therm_throt_device_one_ro(_name) \ + static DEVICE_ATTR(_name, 0444, \ + therm_throt_device_show_##_name, \ + NULL) \ + +#define define_therm_throt_device_show_func(event, name) \ + \ +static ssize_t therm_throt_device_show_##event##_##name( \ + struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + ssize_t ret; \ + \ + preempt_disable(); /* CPU hotplug */ \ + if (cpu_online(cpu)) { \ + ret = sprintf(buf, "%lu\n", \ + per_cpu(thermal_state, cpu).event.name); \ + } else \ + ret = 0; \ + preempt_enable(); \ + \ + return ret; \ +} + +define_therm_throt_device_show_func(core_throttle, count); +define_therm_throt_device_one_ro(core_throttle_count); + +define_therm_throt_device_show_func(core_power_limit, count); +define_therm_throt_device_one_ro(core_power_limit_count); + +define_therm_throt_device_show_func(package_throttle, count); +define_therm_throt_device_one_ro(package_throttle_count); + +define_therm_throt_device_show_func(package_power_limit, count); +define_therm_throt_device_one_ro(package_power_limit_count); + +define_therm_throt_device_show_func(core_throttle, max_time_ms); +define_therm_throt_device_one_ro(core_throttle_max_time_ms); + +define_therm_throt_device_show_func(package_throttle, max_time_ms); +define_therm_throt_device_one_ro(package_throttle_max_time_ms); + +define_therm_throt_device_show_func(core_throttle, total_time_ms); +define_therm_throt_device_one_ro(core_throttle_total_time_ms); + +define_therm_throt_device_show_func(package_throttle, total_time_ms); +define_therm_throt_device_one_ro(package_throttle_total_time_ms); + +static struct attribute *thermal_throttle_attrs[] = { + &dev_attr_core_throttle_count.attr, + &dev_attr_core_throttle_max_time_ms.attr, + &dev_attr_core_throttle_total_time_ms.attr, + NULL +}; + +static const struct attribute_group thermal_attr_group = { + .attrs = thermal_throttle_attrs, + .name = "thermal_throttle" +}; +#endif /* CONFIG_SYSFS */ + +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 + +#define THERM_THROT_POLL_INTERVAL HZ +#define THERM_STATUS_PROCHOT_LOG BIT(1) + +#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15)) +#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)) + +static void clear_therm_status_log(int level) +{ + int msr; + u64 mask, msr_val; + + if (level == CORE_LEVEL) { + msr = MSR_IA32_THERM_STATUS; + mask = THERM_STATUS_CLEAR_CORE_MASK; + } else { + msr = MSR_IA32_PACKAGE_THERM_STATUS; + mask = THERM_STATUS_CLEAR_PKG_MASK; + } + + rdmsrl(msr, msr_val); + msr_val &= mask; + wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG); +} + +static void get_therm_status(int level, bool *proc_hot, u8 *temp) +{ + int msr; + u64 msr_val; + + if (level == CORE_LEVEL) + msr = MSR_IA32_THERM_STATUS; + else + msr = MSR_IA32_PACKAGE_THERM_STATUS; + + rdmsrl(msr, msr_val); + if (msr_val & THERM_STATUS_PROCHOT_LOG) + *proc_hot = true; + else + *proc_hot = false; + + *temp = (msr_val >> 16) & 0x7F; +} + +static void __maybe_unused throttle_active_work(struct work_struct *work) +{ + struct _thermal_state *state = container_of(to_delayed_work(work), + struct _thermal_state, therm_work); + unsigned int i, avg, this_cpu = smp_processor_id(); + u64 now = get_jiffies_64(); + bool hot; + u8 temp; + + get_therm_status(state->level, &hot, &temp); + /* temperature value is offset from the max so lesser means hotter */ + if (!hot && temp > state->baseline_temp) { + if (state->rate_control_active) + pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n", + this_cpu, + state->level == CORE_LEVEL ? "Core" : "Package", + state->count); + + state->rate_control_active = false; + return; + } + + if (time_before64(now, state->next_check) && + state->rate_control_active) + goto re_arm; + + state->next_check = now + CHECK_INTERVAL; + + if (state->count != state->last_count) { + /* There was one new thermal interrupt */ + state->last_count = state->count; + state->average = 0; + state->sample_count = 0; + state->sample_index = 0; + } + + state->temp_samples[state->sample_index] = temp; + state->sample_count++; + state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples); + if (state->sample_count < ARRAY_SIZE(state->temp_samples)) + goto re_arm; + + avg = 0; + for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i) + avg += state->temp_samples[i]; + + avg /= ARRAY_SIZE(state->temp_samples); + + if (state->average > avg) { + pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n", + this_cpu, + state->level == CORE_LEVEL ? "Core" : "Package", + state->count); + state->rate_control_active = true; + } + + state->average = avg; + +re_arm: + clear_therm_status_log(state->level); + schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); +} + +/*** + * therm_throt_process - Process thermal throttling event from interrupt + * @curr: Whether the condition is current or not (boolean), since the + * thermal interrupt normally gets called both when the thermal + * event begins and once the event has ended. + * + * This function is called by the thermal interrupt after the + * IRQ has been acknowledged. + * + * It will take care of rate limiting and printing messages to the syslog. + */ +static void therm_throt_process(bool new_event, int event, int level) +{ + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + bool old_event; + u64 now; + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); + + now = get_jiffies_64(); + if (level == CORE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->core_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->core_power_limit; + else + return; + } else if (level == PACKAGE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->package_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->package_power_limit; + else + return; + } else + return; + + old_event = state->new_event; + state->new_event = new_event; + + if (new_event) + state->count++; + + if (event != THERMAL_THROTTLING_EVENT) + return; + + if (new_event && !state->last_interrupt_time) { + bool hot; + u8 temp; + + get_therm_status(state->level, &hot, &temp); + /* + * Ignore short temperature spike as the system is not close + * to PROCHOT. 10C offset is large enough to ignore. It is + * already dropped from the high threshold temperature. + */ + if (temp > 10) + return; + + state->baseline_temp = temp; + state->last_interrupt_time = now; + schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); + } else if (old_event && state->last_interrupt_time) { + unsigned long throttle_time; + + throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time); + if (throttle_time > state->max_time_ms) + state->max_time_ms = throttle_time; + state->total_time_ms += throttle_time; + state->last_interrupt_time = 0; + } +} + +static int thresh_event_valid(int level, int event) +{ + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); + u64 now = get_jiffies_64(); + + if (level == PACKAGE_LEVEL) + state = (event == 0) ? &pstate->pkg_thresh0 : + &pstate->pkg_thresh1; + else + state = (event == 0) ? &pstate->core_thresh0 : + &pstate->core_thresh1; + + if (time_before64(now, state->next_check)) + return 0; + + state->next_check = now + CHECK_INTERVAL; + + return 1; +} + +static bool int_pln_enable; +static int __init int_pln_enable_setup(char *s) +{ + int_pln_enable = true; + + return 1; +} +__setup("int_pln_enable", int_pln_enable_setup); + +#ifdef CONFIG_SYSFS +/* Add/Remove thermal_throttle interface for CPU device: */ +static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) +{ + int err; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + err = sysfs_create_group(&dev->kobj, &thermal_attr_group); + if (err) + return err; + + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_core_power_limit_count.attr, + thermal_attr_group.name); + if (err) + goto del_group; + } + + if (cpu_has(c, X86_FEATURE_PTS)) { + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_count.attr, + thermal_attr_group.name); + if (err) + goto del_group; + + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_max_time_ms.attr, + thermal_attr_group.name); + if (err) + goto del_group; + + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_total_time_ms.attr, + thermal_attr_group.name); + if (err) + goto del_group; + + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_power_limit_count.attr, + thermal_attr_group.name); + if (err) + goto del_group; + } + } + + return 0; + +del_group: + sysfs_remove_group(&dev->kobj, &thermal_attr_group); + + return err; +} + +static void thermal_throttle_remove_dev(struct device *dev) +{ + sysfs_remove_group(&dev->kobj, &thermal_attr_group); +} + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static int thermal_throttle_online(unsigned int cpu) +{ + struct thermal_state *state = &per_cpu(thermal_state, cpu); + struct device *dev = get_cpu_device(cpu); + u32 l; + + state->package_throttle.level = PACKAGE_LEVEL; + state->core_throttle.level = CORE_LEVEL; + + INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work); + INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work); + + /* Unmask the thermal vector after the above workqueues are initialized. */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + + return thermal_throttle_add_dev(dev, cpu); +} + +static int thermal_throttle_offline(unsigned int cpu) +{ + struct thermal_state *state = &per_cpu(thermal_state, cpu); + struct device *dev = get_cpu_device(cpu); + u32 l; + + /* Mask the thermal vector before draining evtl. pending work */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED); + + cancel_delayed_work_sync(&state->package_throttle.therm_work); + cancel_delayed_work_sync(&state->core_throttle.therm_work); + + state->package_throttle.rate_control_active = false; + state->core_throttle.rate_control_active = false; + + thermal_throttle_remove_dev(dev); + return 0; +} + +static __init int thermal_throttle_init_device(void) +{ + int ret; + + if (!atomic_read(&therm_throt_en)) + return 0; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online", + thermal_throttle_online, + thermal_throttle_offline); + return ret < 0 ? ret : 0; +} +device_initcall(thermal_throttle_init_device); + +#endif /* CONFIG_SYSFS */ + +static void notify_package_thresholds(__u64 msr_val) +{ + bool notify_thres_0 = false; + bool notify_thres_1 = false; + + if (!platform_thermal_package_notify) + return; + + /* lower threshold check */ + if (msr_val & THERM_LOG_THRESHOLD0) + notify_thres_0 = true; + /* higher threshold check */ + if (msr_val & THERM_LOG_THRESHOLD1) + notify_thres_1 = true; + + if (!notify_thres_0 && !notify_thres_1) + return; + + if (platform_thermal_package_rate_control && + platform_thermal_package_rate_control()) { + /* Rate control is implemented in callback */ + platform_thermal_package_notify(msr_val); + return; + } + + /* lower threshold reached */ + if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0)) + platform_thermal_package_notify(msr_val); + /* higher threshold reached */ + if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1)) + platform_thermal_package_notify(msr_val); +} + +static void notify_thresholds(__u64 msr_val) +{ + /* check whether the interrupt handler is defined; + * otherwise simply return + */ + if (!platform_thermal_notify) + return; + + /* lower threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD0) && + thresh_event_valid(CORE_LEVEL, 0)) + platform_thermal_notify(msr_val); + /* higher threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD1) && + thresh_event_valid(CORE_LEVEL, 1)) + platform_thermal_notify(msr_val); +} + +/* Thermal transition interrupt handler */ +static void intel_thermal_interrupt(void) +{ + __u64 msr_val; + + if (static_cpu_has(X86_FEATURE_HWP)) + wrmsrl_safe(MSR_HWP_STATUS, 0); + + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + + /* Check for violation of core thermal thresholds*/ + notify_thresholds(msr_val); + + therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + CORE_LEVEL); + + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) + therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + CORE_LEVEL); + + if (this_cpu_has(X86_FEATURE_PTS)) { + rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); + /* check violations of package thermal thresholds */ + notify_package_thresholds(msr_val); + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + PACKAGE_LEVEL); + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) + therm_throt_process(msr_val & + PACKAGE_THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + PACKAGE_LEVEL); + } +} + +static void unexpected_thermal_interrupt(void) +{ + pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); +} + +static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; + +DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) +{ + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + ack_APIC_irq(); +} + +/* Thermal monitoring depends on APIC, ACPI and clock modulation */ +static int intel_thermal_supported(struct cpuinfo_x86 *c) +{ + if (!boot_cpu_has(X86_FEATURE_APIC)) + return 0; + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return 0; + return 1; +} + +void __init mcheck_intel_therm_init(void) +{ + /* + * This function is only called on boot CPU. Save the init thermal + * LVT value on BSP and use that value to restore APs' thermal LVT + * entry BIOS programmed later + */ + if (intel_thermal_supported(&boot_cpu_data)) + lvtthmr_init = apic_read(APIC_LVTTHMR); +} + +void intel_init_thermal(struct cpuinfo_x86 *c) +{ + unsigned int cpu = smp_processor_id(); + int tm2 = 0; + u32 l, h; + + if (!intel_thermal_supported(c)) + return; + + /* + * First check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already: + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + + h = lvtthmr_init; + /* + * The initial value of thermal LVT entries on all APs always reads + * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI + * sequence to them and LVT registers are reset to 0s except for + * the mask bits which are set to 1s when APs receive INIT IPI. + * If BIOS takes over the thermal interrupt and sets its interrupt + * delivery mode to SMI (not fixed), it restores the value that the + * BIOS has programmed on AP based on BSP's info we saved since BIOS + * is always setting the same value for all threads/cores. + */ + if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) + apic_write(APIC_LVTTHMR, lvtthmr_init); + + + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { + if (system_state == SYSTEM_BOOTING) + pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + /* early Pentium M models use different method for enabling TM2 */ + if (cpu_has(c, X86_FEATURE_TM2)) { + if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { + rdmsr(MSR_THERM2_CTL, l, h); + if (l & MSR_THERM2_CTL_TM_SELECT) + tm2 = 1; + } else if (l & MSR_IA32_MISC_ENABLE_TM2) + tm2 = 1; + } + + /* We'll mask the thermal vector in the lapic till we're ready: */ + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; + apic_write(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) + wrmsr(MSR_IA32_THERM_INTERRUPT, + (l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + (l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE)) + & ~PACKAGE_THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE + | PACKAGE_THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); + } + + smp_thermal_vector = intel_thermal_interrupt; + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + + pr_info_once("CPU0: Thermal monitoring enabled (%s)\n", + tm2 ? "TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); +} diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c new file mode 100644 index 000000000..6a059a035 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common corrected MCE threshold handler code: + */ +#include +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + +static void default_threshold_interrupt(void) +{ + pr_err("Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); +} + +void (*mce_threshold_vector)(void) = default_threshold_interrupt; + +DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) +{ + trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); + trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); + ack_APIC_irq(); +} diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c new file mode 100644 index 000000000..9c9f0abd2 --- /dev/null +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * IDT Winchip specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + +/* Machine check handler for WinChip C6: */ +static noinstr void winchip_machine_check(struct pt_regs *regs) +{ + instrumentation_begin(); + pr_emerg("CPU0: Machine Check Exception.\n"); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + instrumentation_end(); +} + +/* Set up machine check reporting on the Winchip C6 series */ +void winchip_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + machine_check_vector = winchip_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ + wmb(); + + rdmsr(MSR_IDT_FCR1, lo, hi); + lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ + lo &= ~(1<<4); /* Enable MCE */ + wrmsr(MSR_IDT_FCR1, lo, hi); + + cr4_set_bits(X86_CR4_MCE); + + pr_info("Winchip machine check reporting enabled on CPU#0.\n"); +} diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile new file mode 100644 index 000000000..34098d48c --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +microcode-y := core.o +obj-$(CONFIG_MICROCODE) += microcode.o +microcode-$(CONFIG_MICROCODE_INTEL) += intel.o +microcode-$(CONFIG_MICROCODE_AMD) += amd.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c new file mode 100644 index 000000000..936085d81 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -0,0 +1,974 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD CPU Microcode Update Driver for Linux + * + * This driver allows to upgrade microcode on F10h AMD + * CPUs and later. + * + * Copyright (C) 2008-2011 Advanced Micro Devices Inc. + * 2013-2018 Borislav Petkov + * + * Author: Peter Oruba + * + * Based on work by: + * Tigran Aivazian + * + * early loader: + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Jacob Shin + * Fixes: Borislav Petkov + */ +#define pr_fmt(fmt) "microcode: " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static struct equiv_cpu_table { + unsigned int num_entries; + struct equiv_cpu_entry *entry; +} equiv_table; + +/* + * This points to the current valid container of microcode patches which we will + * save from the initrd/builtin before jettisoning its contents. @mc is the + * microcode patch we found to match. + */ +struct cont_desc { + struct microcode_amd *mc; + u32 cpuid_1_eax; + u32 psize; + u8 *data; + size_t size; +}; + +static u32 ucode_new_rev; + +/* One blob per node. */ +static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE]; + +/* + * Microcode patch container file is prepended to the initrd in cpio + * format. See Documentation/x86/microcode.rst + */ +static const char +ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; + +static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) +{ + unsigned int i; + + if (!et || !et->num_entries) + return 0; + + for (i = 0; i < et->num_entries; i++) { + struct equiv_cpu_entry *e = &et->entry[i]; + + if (sig == e->installed_cpu) + return e->equiv_cpu; + + e++; + } + return 0; +} + +/* + * Check whether there is a valid microcode container file at the beginning + * of @buf of size @buf_size. Set @early to use this function in the early path. + */ +static bool verify_container(const u8 *buf, size_t buf_size, bool early) +{ + u32 cont_magic; + + if (buf_size <= CONTAINER_HDR_SZ) { + if (!early) + pr_debug("Truncated microcode container header.\n"); + + return false; + } + + cont_magic = *(const u32 *)buf; + if (cont_magic != UCODE_MAGIC) { + if (!early) + pr_debug("Invalid magic value (0x%08x).\n", cont_magic); + + return false; + } + + return true; +} + +/* + * Check whether there is a valid, non-truncated CPU equivalence table at the + * beginning of @buf of size @buf_size. Set @early to use this function in the + * early path. + */ +static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early) +{ + const u32 *hdr = (const u32 *)buf; + u32 cont_type, equiv_tbl_len; + + if (!verify_container(buf, buf_size, early)) + return false; + + cont_type = hdr[1]; + if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) { + if (!early) + pr_debug("Wrong microcode container equivalence table type: %u.\n", + cont_type); + + return false; + } + + buf_size -= CONTAINER_HDR_SZ; + + equiv_tbl_len = hdr[2]; + if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) || + buf_size < equiv_tbl_len) { + if (!early) + pr_debug("Truncated equivalence table.\n"); + + return false; + } + + return true; +} + +/* + * Check whether there is a valid, non-truncated microcode patch section at the + * beginning of @buf of size @buf_size. Set @early to use this function in the + * early path. + * + * On success, @sh_psize returns the patch size according to the section header, + * to the caller. + */ +static bool +__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize, bool early) +{ + u32 p_type, p_size; + const u32 *hdr; + + if (buf_size < SECTION_HDR_SIZE) { + if (!early) + pr_debug("Truncated patch section.\n"); + + return false; + } + + hdr = (const u32 *)buf; + p_type = hdr[0]; + p_size = hdr[1]; + + if (p_type != UCODE_UCODE_TYPE) { + if (!early) + pr_debug("Invalid type field (0x%x) in container file section header.\n", + p_type); + + return false; + } + + if (p_size < sizeof(struct microcode_header_amd)) { + if (!early) + pr_debug("Patch of size %u too short.\n", p_size); + + return false; + } + + *sh_psize = p_size; + + return true; +} + +/* + * Check whether the passed remaining file @buf_size is large enough to contain + * a patch of the indicated @sh_psize (and also whether this size does not + * exceed the per-family maximum). @sh_psize is the size read from the section + * header. + */ +static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size) +{ + u32 max_size; + + if (family >= 0x15) + return min_t(u32, sh_psize, buf_size); + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 + + switch (family) { + case 0x10 ... 0x12: + max_size = F1XH_MPB_MAX_SIZE; + break; + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + default: + WARN(1, "%s: WTF family: 0x%x\n", __func__, family); + return 0; + break; + } + + if (sh_psize > min_t(u32, buf_size, max_size)) + return 0; + + return sh_psize; +} + +/* + * Verify the patch in @buf. + * + * Returns: + * negative: on error + * positive: patch is not for this family, skip it + * 0: success + */ +static int +verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool early) +{ + struct microcode_header_amd *mc_hdr; + unsigned int ret; + u32 sh_psize; + u16 proc_id; + u8 patch_fam; + + if (!__verify_patch_section(buf, buf_size, &sh_psize, early)) + return -1; + + /* + * The section header length is not included in this indicated size + * but is present in the leftover file length so we need to subtract + * it before passing this value to the function below. + */ + buf_size -= SECTION_HDR_SIZE; + + /* + * Check if the remaining buffer is big enough to contain a patch of + * size sh_psize, as the section claims. + */ + if (buf_size < sh_psize) { + if (!early) + pr_debug("Patch of size %u truncated.\n", sh_psize); + + return -1; + } + + ret = __verify_patch_size(family, sh_psize, buf_size); + if (!ret) { + if (!early) + pr_debug("Per-family patch size mismatch.\n"); + return -1; + } + + *patch_size = sh_psize; + + mc_hdr = (struct microcode_header_amd *)(buf + SECTION_HDR_SIZE); + if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { + if (!early) + pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", mc_hdr->patch_id); + return -1; + } + + proc_id = mc_hdr->processor_rev_id; + patch_fam = 0xf + (proc_id >> 12); + if (patch_fam != family) + return 1; + + return 0; +} + +/* + * This scans the ucode blob for the proper container as we can have multiple + * containers glued together. Returns the equivalence ID from the equivalence + * table or 0 if none found. + * Returns the amount of bytes consumed while scanning. @desc contains all the + * data we're going to use in later stages of the application. + */ +static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) +{ + struct equiv_cpu_table table; + size_t orig_size = size; + u32 *hdr = (u32 *)ucode; + u16 eq_id; + u8 *buf; + + if (!verify_equivalence_table(ucode, size, true)) + return 0; + + buf = ucode; + + table.entry = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ); + table.num_entries = hdr[2] / sizeof(struct equiv_cpu_entry); + + /* + * Find the equivalence ID of our CPU in this table. Even if this table + * doesn't contain a patch for the CPU, scan through the whole container + * so that it can be skipped in case there are other containers appended. + */ + eq_id = find_equiv_id(&table, desc->cpuid_1_eax); + + buf += hdr[2] + CONTAINER_HDR_SZ; + size -= hdr[2] + CONTAINER_HDR_SZ; + + /* + * Scan through the rest of the container to find where it ends. We do + * some basic sanity-checking too. + */ + while (size > 0) { + struct microcode_amd *mc; + u32 patch_size; + int ret; + + ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size, true); + if (ret < 0) { + /* + * Patch verification failed, skip to the next + * container, if there's one: + */ + goto out; + } else if (ret > 0) { + goto skip; + } + + mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); + if (eq_id == mc->hdr.processor_rev_id) { + desc->psize = patch_size; + desc->mc = mc; + } + +skip: + /* Skip patch section header too: */ + buf += patch_size + SECTION_HDR_SIZE; + size -= patch_size + SECTION_HDR_SIZE; + } + + /* + * If we have found a patch (desc->mc), it means we're looking at the + * container which has a patch for this CPU so return 0 to mean, @ucode + * already points to the proper container. Otherwise, we return the size + * we scanned so that we can advance to the next container in the + * buffer. + */ + if (desc->mc) { + desc->data = ucode; + desc->size = orig_size - size; + + return 0; + } + +out: + return orig_size - size; +} + +/* + * Scan the ucode blob for the proper container as we can have multiple + * containers glued together. + */ +static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc) +{ + while (size) { + size_t s = parse_container(ucode, size, desc); + if (!s) + return; + + /* catch wraparound */ + if (size >= s) { + ucode += s; + size -= s; + } else { + return; + } + } +} + +static int __apply_microcode_amd(struct microcode_amd *mc) +{ + u32 rev, dummy; + + native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code); + + /* verify patch application was successful */ + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev != mc->hdr.patch_id) + return -1; + + return 0; +} + +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd_amd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + * + * Returns true if container found (sets @desc), false otherwise. + */ +static bool +apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) +{ + struct cont_desc desc = { 0 }; + u8 (*patch)[PATCH_MAX_SIZE]; + struct microcode_amd *mc; + u32 rev, dummy, *new_rev; + bool ret = false; + +#ifdef CONFIG_X86_32 + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); +#else + new_rev = &ucode_new_rev; + patch = &amd_ucode_patch[0]; +#endif + + desc.cpuid_1_eax = cpuid_1_eax; + + scan_containers(ucode, size, &desc); + + mc = desc.mc; + if (!mc) + return ret; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. + */ + if (rev > mc->hdr.patch_id) + return ret; + + if (!__apply_microcode_amd(mc)) { + *new_rev = mc->hdr.patch_id; + ret = true; + + if (save_patch) + memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE)); + } + + return ret; +} + +static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) +{ +#ifdef CONFIG_X86_64 + char fw_name[36] = "amd-ucode/microcode_amd.bin"; + + if (family >= 0x15) + snprintf(fw_name, sizeof(fw_name), + "amd-ucode/microcode_amd_fam%.2xh.bin", family); + + return get_builtin_firmware(cp, fw_name); +#else + return false; +#endif +} + +static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret) +{ + struct ucode_cpu_info *uci; + struct cpio_data cp; + const char *path; + bool use_pa; + + if (IS_ENABLED(CONFIG_X86_32)) { + uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info); + path = (const char *)__pa_nodebug(ucode_path); + use_pa = true; + } else { + uci = ucode_cpu_info; + path = ucode_path; + use_pa = false; + } + + if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) + cp = find_microcode_in_initrd(path, use_pa); + + /* Needed in load_microcode_amd() */ + uci->cpu_sig.sig = cpuid_1_eax; + + *ret = cp; +} + +void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) +{ + struct cpio_data cp = { }; + + __load_ucode_amd(cpuid_1_eax, &cp); + if (!(cp.data && cp.size)) + return; + + apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true); +} + +void load_ucode_amd_ap(unsigned int cpuid_1_eax) +{ + struct microcode_amd *mc; + struct cpio_data cp; + u32 *new_rev, rev, dummy; + + if (IS_ENABLED(CONFIG_X86_32)) { + mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + } else { + mc = (struct microcode_amd *)amd_ucode_patch; + new_rev = &ucode_new_rev; + } + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + /* + * Check whether a new patch has been saved already. Also, allow application of + * the same revision in order to pick up SMT-thread-specific configuration even + * if the sibling SMT thread already has an up-to-date revision. + */ + if (*new_rev && rev <= mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + *new_rev = mc->hdr.patch_id; + return; + } + } + + __load_ucode_amd(cpuid_1_eax, &cp); + if (!(cp.data && cp.size)) + return; + + apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false); +} + +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); + +int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) +{ + struct cont_desc desc = { 0 }; + enum ucode_state ret; + struct cpio_data cp; + + cp = find_microcode_in_initrd(ucode_path, false); + if (!(cp.data && cp.size)) + return -EINVAL; + + desc.cpuid_1_eax = cpuid_1_eax; + + scan_containers(cp.data, cp.size, &desc); + if (!desc.mc) + return -EINVAL; + + ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); + if (ret > UCODE_UPDATED) + return -EINVAL; + + return 0; +} + +void reload_ucode_amd(unsigned int cpu) +{ + u32 rev, dummy __always_unused; + struct microcode_amd *mc; + + mc = (struct microcode_amd *)amd_ucode_patch[cpu_to_node(cpu)]; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + if (rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + ucode_new_rev = mc->hdr.patch_id; + pr_info("reload patch_level=0x%08x\n", ucode_new_rev); + } + } +} +static u16 __find_equiv_id(unsigned int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + return find_equiv_id(&equiv_table, uci->cpu_sig.sig); +} + +/* + * a small, trivial cache of per-family ucode patches + */ +static struct ucode_patch *cache_find_patch(u16 equiv_cpu) +{ + struct ucode_patch *p; + + list_for_each_entry(p, µcode_cache, plist) + if (p->equiv_cpu == equiv_cpu) + return p; + return NULL; +} + +static void update_cache(struct ucode_patch *new_patch) +{ + struct ucode_patch *p; + + list_for_each_entry(p, µcode_cache, plist) { + if (p->equiv_cpu == new_patch->equiv_cpu) { + if (p->patch_id >= new_patch->patch_id) { + /* we already have the latest patch */ + kfree(new_patch->data); + kfree(new_patch); + return; + } + + list_replace(&p->plist, &new_patch->plist); + kfree(p->data); + kfree(p); + return; + } + } + /* no patch found, add it */ + list_add_tail(&new_patch->plist, µcode_cache); +} + +static void free_cache(void) +{ + struct ucode_patch *p, *tmp; + + list_for_each_entry_safe(p, tmp, µcode_cache, plist) { + __list_del(p->plist.prev, p->plist.next); + kfree(p->data); + kfree(p); + } +} + +static struct ucode_patch *find_patch(unsigned int cpu) +{ + u16 equiv_id; + + equiv_id = __find_equiv_id(cpu); + if (!equiv_id) + return NULL; + + return cache_find_patch(equiv_id); +} + +static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct ucode_patch *p; + + csig->sig = cpuid_eax(0x00000001); + csig->rev = c->microcode; + + /* + * a patch could have been loaded early, set uci->mc so that + * mc_bp_resume() can call apply_microcode() + */ + p = find_patch(cpu); + if (p && (p->patch_id == csig->rev)) + uci->mc = p->data; + + pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); + + return 0; +} + +static enum ucode_state apply_microcode_amd(int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct microcode_amd *mc_amd; + struct ucode_cpu_info *uci; + struct ucode_patch *p; + enum ucode_state ret; + u32 rev, dummy __always_unused; + + BUG_ON(raw_smp_processor_id() != cpu); + + uci = ucode_cpu_info + cpu; + + p = find_patch(cpu); + if (!p) + return UCODE_NFOUND; + + mc_amd = p->data; + uci->mc = p->data; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + /* need to apply patch? */ + if (rev > mc_amd->hdr.patch_id) { + ret = UCODE_OK; + goto out; + } + + if (__apply_microcode_amd(mc_amd)) { + pr_err("CPU%d: update failed for patch_level=0x%08x\n", + cpu, mc_amd->hdr.patch_id); + return UCODE_ERROR; + } + + rev = mc_amd->hdr.patch_id; + ret = UCODE_UPDATED; + + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); + +out: + uci->cpu_sig.rev = rev; + c->microcode = rev; + + /* Update boot_cpu_data's revision too, if we're on the BSP: */ + if (c->cpu_index == boot_cpu_data.cpu_index) + boot_cpu_data.microcode = rev; + + return ret; +} + +static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size) +{ + u32 equiv_tbl_len; + const u32 *hdr; + + if (!verify_equivalence_table(buf, buf_size, false)) + return 0; + + hdr = (const u32 *)buf; + equiv_tbl_len = hdr[2]; + + equiv_table.entry = vmalloc(equiv_tbl_len); + if (!equiv_table.entry) { + pr_err("failed to allocate equivalent CPU table\n"); + return 0; + } + + memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len); + equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry); + + /* add header length */ + return equiv_tbl_len + CONTAINER_HDR_SZ; +} + +static void free_equiv_cpu_table(void) +{ + vfree(equiv_table.entry); + memset(&equiv_table, 0, sizeof(equiv_table)); +} + +static void cleanup(void) +{ + free_equiv_cpu_table(); + free_cache(); +} + +/* + * Return a non-negative value even if some of the checks failed so that + * we can skip over the next patch. If we return a negative value, we + * signal a grave error like a memory allocation has failed and the + * driver cannot continue functioning normally. In such cases, we tear + * down everything we've used up so far and exit. + */ +static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, + unsigned int *patch_size) +{ + struct microcode_header_amd *mc_hdr; + struct ucode_patch *patch; + u16 proc_id; + int ret; + + ret = verify_patch(family, fw, leftover, patch_size, false); + if (ret) + return ret; + + patch = kzalloc(sizeof(*patch), GFP_KERNEL); + if (!patch) { + pr_err("Patch allocation failure.\n"); + return -EINVAL; + } + + patch->data = kmemdup(fw + SECTION_HDR_SIZE, *patch_size, GFP_KERNEL); + if (!patch->data) { + pr_err("Patch data allocation failure.\n"); + kfree(patch); + return -EINVAL; + } + patch->size = *patch_size; + + mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); + proc_id = mc_hdr->processor_rev_id; + + INIT_LIST_HEAD(&patch->plist); + patch->patch_id = mc_hdr->patch_id; + patch->equiv_cpu = proc_id; + + pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", + __func__, patch->patch_id, proc_id); + + /* ... and add to cache. */ + update_cache(patch); + + return 0; +} + +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, + size_t size) +{ + u8 *fw = (u8 *)data; + size_t offset; + + offset = install_equiv_cpu_table(data, size); + if (!offset) + return UCODE_ERROR; + + fw += offset; + size -= offset; + + if (*(u32 *)fw != UCODE_UCODE_TYPE) { + pr_err("invalid type field in container file section header\n"); + free_equiv_cpu_table(); + return UCODE_ERROR; + } + + while (size > 0) { + unsigned int crnt_size = 0; + int ret; + + ret = verify_and_add_patch(family, fw, size, &crnt_size); + if (ret < 0) + return UCODE_ERROR; + + fw += crnt_size + SECTION_HDR_SIZE; + size -= (crnt_size + SECTION_HDR_SIZE); + } + + return UCODE_OK; +} + +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +{ + struct cpuinfo_x86 *c; + unsigned int nid, cpu; + struct ucode_patch *p; + enum ucode_state ret; + + /* free old equiv table */ + free_equiv_cpu_table(); + + ret = __load_microcode_amd(family, data, size); + if (ret != UCODE_OK) { + cleanup(); + return ret; + } + + for_each_node(nid) { + cpu = cpumask_first(cpumask_of_node(nid)); + c = &cpu_data(cpu); + + p = find_patch(cpu); + if (!p) + continue; + + if (c->microcode >= p->patch_id) + continue; + + ret = UCODE_NEW; + + memset(&amd_ucode_patch[nid], 0, PATCH_MAX_SIZE); + memcpy(&amd_ucode_patch[nid], p->data, min_t(u32, p->size, PATCH_MAX_SIZE)); + } + + return ret; +} + +/* + * AMD microcode firmware naming convention, up to family 15h they are in + * the legacy file: + * + * amd-ucode/microcode_amd.bin + * + * This legacy file is always smaller than 2K in size. + * + * Beginning with family 15h, they are in family-specific firmware files: + * + * amd-ucode/microcode_amd_fam15h.bin + * amd-ucode/microcode_amd_fam16h.bin + * ... + * + * These might be larger than 2K. + */ +static enum ucode_state request_microcode_amd(int cpu, struct device *device, + bool refresh_fw) +{ + char fw_name[36] = "amd-ucode/microcode_amd.bin"; + struct cpuinfo_x86 *c = &cpu_data(cpu); + enum ucode_state ret = UCODE_NFOUND; + const struct firmware *fw; + + /* reload ucode container only on the boot cpu */ + if (!refresh_fw) + return UCODE_OK; + + if (c->x86 >= 0x15) + snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); + + if (request_firmware_direct(&fw, (const char *)fw_name, device)) { + pr_debug("failed to load file %s\n", fw_name); + goto out; + } + + ret = UCODE_ERROR; + if (!verify_container(fw->data, fw->size, false)) + goto fw_release; + + ret = load_microcode_amd(c->x86, fw->data, fw->size); + + fw_release: + release_firmware(fw); + + out: + return ret; +} + +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + return UCODE_ERROR; +} + +static void microcode_fini_cpu_amd(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + uci->mc = NULL; +} + +static struct microcode_ops microcode_amd_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_amd, + .collect_cpu_info = collect_cpu_info_amd, + .apply_microcode = apply_microcode_amd, + .microcode_fini_cpu = microcode_fini_cpu_amd, +}; + +struct microcode_ops * __init init_amd_microcode(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + pr_warn("AMD CPU family 0x%x not supported\n", c->x86); + return NULL; + } + + if (ucode_new_rev) + pr_info_once("microcode updated early to new patch_level=0x%08x\n", + ucode_new_rev); + + return µcode_amd_ops; +} + +void __exit exit_amd_microcode(void) +{ + cleanup(); +} diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c new file mode 100644 index 000000000..24254d141 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -0,0 +1,825 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * 2013-2016 Borislav Petkov + * + * X86 CPU microcode early update for Linux: + * + * Copyright (C) 2012 Fenghua Yu + * H Peter Anvin" + * (C) 2015 Borislav Petkov + * + * This driver allows to upgrade microcode on x86 processors. + */ + +#define pr_fmt(fmt) "microcode: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_VERSION "2.2" + +static struct microcode_ops *microcode_ops; +static bool dis_ucode_ldr = true; + +bool initrd_gone; + +LIST_HEAD(microcode_cache); + +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - cpus_read_lock/unlock() to synchronize with + * the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. + */ +static DEFINE_MUTEX(microcode_mutex); + +struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; + +struct cpu_info_ctx { + struct cpu_signature *cpu_sig; + int err; +}; + +/* + * Those patch levels cannot be updated to newer ones and thus should be final. + */ +static u32 final_levels[] = { + 0x01000098, + 0x0100009f, + 0x010000af, + 0, /* T-101 terminator */ +}; + +/* + * Check the current patch level on this CPU. + * + * Returns: + * - true: if update should stop + * - false: otherwise + */ +static bool amd_check_current_patch_level(void) +{ + u32 lvl, dummy, i; + u32 *levels; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); + + if (IS_ENABLED(CONFIG_X86_32)) + levels = (u32 *)__pa_nodebug(&final_levels); + else + levels = final_levels; + + for (i = 0; levels[i]; i++) { + if (lvl == levels[i]) + return true; + } + return false; +} + +static bool __init check_loader_disabled_bsp(void) +{ + static const char *__dis_opt_str = "dis_ucode_ldr"; + +#ifdef CONFIG_X86_32 + const char *cmdline = (const char *)__pa_nodebug(boot_command_line); + const char *option = (const char *)__pa_nodebug(__dis_opt_str); + bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ + const char *cmdline = boot_command_line; + const char *option = __dis_opt_str; + bool *res = &dis_ucode_ldr; +#endif + + /* + * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not + * completely accurate as xen pv guests don't see that CPUID bit set but + * that's good enough as they don't land on the BSP path anyway. + */ + if (native_cpuid_ecx(1) & BIT(31)) + return *res; + + if (x86_cpuid_vendor() == X86_VENDOR_AMD) { + if (amd_check_current_patch_level()) + return *res; + } + + if (cmdline_find_option_bool(cmdline, option) <= 0) + *res = false; + + return *res; +} + +extern struct builtin_fw __start_builtin_fw[]; +extern struct builtin_fw __end_builtin_fw[]; + +bool get_builtin_firmware(struct cpio_data *cd, const char *name) +{ + struct builtin_fw *b_fw; + + for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { + if (!strcmp(name, b_fw->name)) { + cd->size = b_fw->size; + cd->data = b_fw->data; + return true; + } + } + return false; +} + +void __init load_ucode_bsp(void) +{ + unsigned int cpuid_1_eax; + bool intel = true; + + if (!have_cpuid_p()) + return; + + cpuid_1_eax = native_cpuid_eax(1); + + switch (x86_cpuid_vendor()) { + case X86_VENDOR_INTEL: + if (x86_family(cpuid_1_eax) < 6) + return; + break; + + case X86_VENDOR_AMD: + if (x86_family(cpuid_1_eax) < 0x10) + return; + intel = false; + break; + + default: + return; + } + + if (check_loader_disabled_bsp()) + return; + + if (intel) + load_ucode_intel_bsp(); + else + load_ucode_amd_bsp(cpuid_1_eax); +} + +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 + return *((bool *)__pa_nodebug(&dis_ucode_ldr)); +#else + return dis_ucode_ldr; +#endif +} + +void load_ucode_ap(void) +{ + unsigned int cpuid_1_eax; + + if (check_loader_disabled_ap()) + return; + + cpuid_1_eax = native_cpuid_eax(1); + + switch (x86_cpuid_vendor()) { + case X86_VENDOR_INTEL: + if (x86_family(cpuid_1_eax) >= 6) + load_ucode_intel_ap(); + break; + case X86_VENDOR_AMD: + if (x86_family(cpuid_1_eax) >= 0x10) + load_ucode_amd_ap(cpuid_1_eax); + break; + default: + break; + } +} + +static int __init save_microcode_in_initrd(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + int ret = -EINVAL; + + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + if (c->x86 >= 6) + ret = save_microcode_in_initrd_intel(); + break; + case X86_VENDOR_AMD: + if (c->x86 >= 0x10) + ret = save_microcode_in_initrd_amd(cpuid_eax(1)); + break; + default: + break; + } + + initrd_gone = true; + + return ret; +} + +struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) +{ +#ifdef CONFIG_BLK_DEV_INITRD + unsigned long start = 0; + size_t size; + +#ifdef CONFIG_X86_32 + struct boot_params *params; + + if (use_pa) + params = (struct boot_params *)__pa_nodebug(&boot_params); + else + params = &boot_params; + + size = params->hdr.ramdisk_size; + + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + if (size) + start = params->hdr.ramdisk_image; + +# else /* CONFIG_X86_64 */ + size = (unsigned long)boot_params.ext_ramdisk_size << 32; + size |= boot_params.hdr.ramdisk_size; + + if (size) { + start = (unsigned long)boot_params.ext_ramdisk_image << 32; + start |= boot_params.hdr.ramdisk_image; + + start += PAGE_OFFSET; + } +# endif + + /* + * Fixup the start address: after reserve_initrd() runs, initrd_start + * has the virtual address of the beginning of the initrd. It also + * possibly relocates the ramdisk. In either case, initrd_start contains + * the updated address so use that instead. + * + * initrd_gone is for the hotplug case where we've thrown out initrd + * already. + */ + if (!use_pa) { + if (initrd_gone) + return (struct cpio_data){ NULL, 0, "" }; + if (initrd_start) + start = initrd_start; + } else { + /* + * The picture with physical addresses is a bit different: we + * need to get the *physical* address to which the ramdisk was + * relocated, i.e., relocated_ramdisk (not initrd_start) and + * since we're running from physical addresses, we need to access + * relocated_ramdisk through its *physical* address too. + */ + u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk); + if (*rr) + start = *rr; + } + + return find_cpio_data(path, (void *)start, size, NULL); +#else /* !CONFIG_BLK_DEV_INITRD */ + return (struct cpio_data){ NULL, 0, "" }; +#endif +} + +void reload_early_microcode(unsigned int cpu) +{ + int vendor, family; + + vendor = x86_cpuid_vendor(); + family = x86_cpuid_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (family >= 6) + reload_ucode_intel(); + break; + case X86_VENDOR_AMD: + if (family >= 0x10) + reload_ucode_amd(cpu); + break; + default: + break; + } +} + +static void collect_cpu_info_local(void *arg) +{ + struct cpu_info_ctx *ctx = arg; + + ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), + ctx->cpu_sig); +} + +static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) +{ + struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +static int collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int ret; + + memset(uci, 0, sizeof(*uci)); + + ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); + if (!ret) + uci->valid = 1; + + return ret; +} + +static void apply_microcode_local(void *arg) +{ + enum ucode_state *err = arg; + + *err = microcode_ops->apply_microcode(smp_processor_id()); +} + +static int apply_microcode_on_target(int cpu) +{ + enum ucode_state err; + int ret; + + ret = smp_call_function_single(cpu, apply_microcode_local, &err, 1); + if (!ret) { + if (err == UCODE_ERROR) + ret = 1; + } + return ret; +} + +/* fake device for request_firmware */ +static struct platform_device *microcode_pdev; + +#ifdef CONFIG_MICROCODE_LATE_LOADING +/* + * Late loading dance. Why the heavy-handed stomp_machine effort? + * + * - HT siblings must be idle and not execute other code while the other sibling + * is loading microcode in order to avoid any negative interactions caused by + * the loading. + * + * - In addition, microcode update on the cores must be serialized until this + * requirement can be relaxed in the future. Right now, this is conservative + * and good. + */ +#define SPINUNIT 100 /* 100 nsec */ + +static int check_online_cpus(void) +{ + unsigned int cpu; + + /* + * Make sure all CPUs are online. It's fine for SMT to be disabled if + * all the primary threads are still online. + */ + for_each_present_cpu(cpu) { + if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) { + pr_err("Not all CPUs online, aborting microcode update.\n"); + return -EINVAL; + } + } + + return 0; +} + +static atomic_t late_cpus_in; +static atomic_t late_cpus_out; + +static int __wait_for_cpus(atomic_t *t, long long timeout) +{ + int all_cpus = num_online_cpus(); + + atomic_inc(t); + + while (atomic_read(t) < all_cpus) { + if (timeout < SPINUNIT) { + pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n", + all_cpus - atomic_read(t)); + return 1; + } + + ndelay(SPINUNIT); + timeout -= SPINUNIT; + + touch_nmi_watchdog(); + } + return 0; +} + +/* + * Returns: + * < 0 - on error + * 0 - success (no update done or microcode was updated) + */ +static int __reload_late(void *info) +{ + int cpu = smp_processor_id(); + enum ucode_state err; + int ret = 0; + + /* + * Wait for all CPUs to arrive. A load will not be attempted unless all + * CPUs show up. + * */ + if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC)) + return -1; + + /* + * On an SMT system, it suffices to load the microcode on one sibling of + * the core because the microcode engine is shared between the threads. + * Synchronization still needs to take place so that no concurrent + * loading attempts happen on multiple threads of an SMT core. See + * below. + */ + if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu) + apply_microcode_local(&err); + else + goto wait_for_siblings; + + if (err >= UCODE_NFOUND) { + if (err == UCODE_ERROR) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + + ret = -1; + } + +wait_for_siblings: + if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC)) + panic("Timeout during microcode update!\n"); + + /* + * At least one thread has completed update on each core. + * For others, simply call the update to make sure the + * per-cpu cpuinfo can be updated with right microcode + * revision. + */ + if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu) + apply_microcode_local(&err); + + return ret; +} + +/* + * Reload microcode late on all CPUs. Wait for a sec until they + * all gather together. + */ +static int microcode_reload_late(void) +{ + int old = boot_cpu_data.microcode, ret; + struct cpuinfo_x86 prev_info; + + atomic_set(&late_cpus_in, 0); + atomic_set(&late_cpus_out, 0); + + /* + * Take a snapshot before the microcode update in order to compare and + * check whether any bits changed after an update. + */ + store_cpu_caps(&prev_info); + + ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); + if (!ret) { + pr_info("Reload succeeded, microcode revision: 0x%x -> 0x%x\n", + old, boot_cpu_data.microcode); + microcode_check(&prev_info); + } else { + pr_info("Reload failed, current microcode revision: 0x%x\n", + boot_cpu_data.microcode); + } + + return ret; +} + +static ssize_t reload_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + enum ucode_state tmp_ret = UCODE_OK; + int bsp = boot_cpu_data.cpu_index; + unsigned long val; + ssize_t ret = 0; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (val != 1) + return size; + + cpus_read_lock(); + + ret = check_online_cpus(); + if (ret) + goto put; + + tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev, true); + if (tmp_ret != UCODE_NEW) + goto put; + + mutex_lock(µcode_mutex); + ret = microcode_reload_late(); + mutex_unlock(µcode_mutex); + +put: + cpus_read_unlock(); + + if (ret == 0) + ret = size; + + return ret; +} + +static DEVICE_ATTR_WO(reload); +#endif + +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); +} + +static ssize_t pf_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); +} + +static DEVICE_ATTR(version, 0444, version_show, NULL); +static DEVICE_ATTR(processor_flags, 0444, pf_show, NULL); + +static struct attribute *mc_default_attrs[] = { + &dev_attr_version.attr, + &dev_attr_processor_flags.attr, + NULL +}; + +static const struct attribute_group mc_attr_group = { + .attrs = mc_default_attrs, + .name = "microcode", +}; + +static void microcode_fini_cpu(int cpu) +{ + if (microcode_ops->microcode_fini_cpu) + microcode_ops->microcode_fini_cpu(cpu); +} + +static enum ucode_state microcode_resume_cpu(int cpu) +{ + if (apply_microcode_on_target(cpu)) + return UCODE_ERROR; + + pr_debug("CPU%d updated upon resume\n", cpu); + + return UCODE_OK; +} + +static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) +{ + enum ucode_state ustate; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci->valid) + return UCODE_OK; + + if (collect_cpu_info(cpu)) + return UCODE_ERROR; + + /* --dimm. Trigger a delayed update? */ + if (system_state != SYSTEM_RUNNING) + return UCODE_NFOUND; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, refresh_fw); + if (ustate == UCODE_NEW) { + pr_debug("CPU%d updated upon init\n", cpu); + apply_microcode_on_target(cpu); + } + + return ustate; +} + +static enum ucode_state microcode_update_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + /* Refresh CPU microcode revision after resume. */ + collect_cpu_info(cpu); + + if (uci->valid) + return microcode_resume_cpu(cpu); + + return microcode_init_cpu(cpu, false); +} + +static int mc_device_add(struct device *dev, struct subsys_interface *sif) +{ + int err, cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("CPU%d added\n", cpu); + + err = sysfs_create_group(&dev->kobj, &mc_attr_group); + if (err) + return err; + + if (microcode_init_cpu(cpu, true) == UCODE_ERROR) + return -EINVAL; + + return err; +} + +static void mc_device_remove(struct device *dev, struct subsys_interface *sif) +{ + int cpu = dev->id; + + if (!cpu_online(cpu)) + return; + + pr_debug("CPU%d removed\n", cpu); + microcode_fini_cpu(cpu); + sysfs_remove_group(&dev->kobj, &mc_attr_group); +} + +static struct subsys_interface mc_cpu_interface = { + .name = "microcode", + .subsys = &cpu_subsys, + .add_dev = mc_device_add, + .remove_dev = mc_device_remove, +}; + +/** + * microcode_bsp_resume - Update boot CPU microcode during resume. + */ +void microcode_bsp_resume(void) +{ + int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci->valid && uci->mc) + microcode_ops->apply_microcode(cpu); + else if (!uci->mc) + reload_early_microcode(cpu); +} + +static struct syscore_ops mc_syscore_ops = { + .resume = microcode_bsp_resume, +}; + +static int mc_cpu_starting(unsigned int cpu) +{ + microcode_update_cpu(cpu); + pr_debug("CPU%d added\n", cpu); + return 0; +} + +static int mc_cpu_online(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + if (sysfs_create_group(&dev->kobj, &mc_attr_group)) + pr_err("Failed to create group for CPU%d\n", cpu); + return 0; +} + +static int mc_cpu_down_prep(unsigned int cpu) +{ + struct device *dev; + + dev = get_cpu_device(cpu); + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&dev->kobj, &mc_attr_group); + pr_debug("CPU%d removed\n", cpu); + + return 0; +} + +static struct attribute *cpu_root_microcode_attrs[] = { +#ifdef CONFIG_MICROCODE_LATE_LOADING + &dev_attr_reload.attr, +#endif + NULL +}; + +static const struct attribute_group cpu_root_microcode_group = { + .name = "microcode", + .attrs = cpu_root_microcode_attrs, +}; + +int __init microcode_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + int error; + + if (dis_ucode_ldr) + return -EINVAL; + + if (c->x86_vendor == X86_VENDOR_INTEL) + microcode_ops = init_intel_microcode(); + else if (c->x86_vendor == X86_VENDOR_AMD) + microcode_ops = init_amd_microcode(); + else + pr_err("no support for this CPU vendor\n"); + + if (!microcode_ops) + return -ENODEV; + + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) + return PTR_ERR(microcode_pdev); + + cpus_read_lock(); + mutex_lock(µcode_mutex); + + error = subsys_interface_register(&mc_cpu_interface); + if (!error) + perf_check_microcode(); + mutex_unlock(µcode_mutex); + cpus_read_unlock(); + + if (error) + goto out_pdev; + + error = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + if (error) { + pr_err("Error creating microcode group!\n"); + goto out_driver; + } + + register_syscore_ops(&mc_syscore_ops); + cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting", + mc_cpu_starting, NULL); + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", + mc_cpu_online, mc_cpu_down_prep); + + pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION); + + return 0; + + out_driver: + cpus_read_lock(); + mutex_lock(µcode_mutex); + + subsys_interface_unregister(&mc_cpu_interface); + + mutex_unlock(µcode_mutex); + cpus_read_unlock(); + + out_pdev: + platform_device_unregister(microcode_pdev); + return error; + +} +fs_initcall(save_microcode_in_initrd); +late_initcall(microcode_init); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c new file mode 100644 index 000000000..1ba590e6e --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -0,0 +1,996 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * + * Intel CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu + * H Peter Anvin" + */ + +/* + * This needs to be before all headers so that pr_debug in printk.h doesn't turn + * printk calls into no_printk(). + * + *#define DEBUG + */ +#define pr_fmt(fmt) "microcode: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; + +/* Current microcode patch used in early patching on the APs. */ +static struct microcode_intel *intel_ucode_patch; + +/* last level cache size per core */ +static int llc_size_per_core; + +static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, + unsigned int s2, unsigned int p2) +{ + if (s1 != s2) + return false; + + /* Processor flags are either both 0 ... */ + if (!p1 && !p2) + return true; + + /* ... or they intersect. */ + return p1 & p2; +} + +/* + * Returns 1 if update has been found, 0 otherwise. + */ +static int find_matching_signature(void *mc, unsigned int csig, int cpf) +{ + struct microcode_header_intel *mc_hdr = mc; + struct extended_sigtable *ext_hdr; + struct extended_signature *ext_sig; + int i; + + if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + return 1; + + /* Look for ext. headers: */ + if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) + return 0; + + ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; + ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; + + for (i = 0; i < ext_hdr->count; i++) { + if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; + } + return 0; +} + +/* + * Returns 1 if update has been found, 0 otherwise. + */ +static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) +{ + struct microcode_header_intel *mc_hdr = mc; + + if (mc_hdr->rev <= new_rev) + return 0; + + return find_matching_signature(mc, csig, cpf); +} + +static struct ucode_patch *memdup_patch(void *data, unsigned int size) +{ + struct ucode_patch *p; + + p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL); + if (!p) + return NULL; + + p->data = kmemdup(data, size, GFP_KERNEL); + if (!p->data) { + kfree(p); + return NULL; + } + + return p; +} + +static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size) +{ + struct microcode_header_intel *mc_hdr, *mc_saved_hdr; + struct ucode_patch *iter, *tmp, *p = NULL; + bool prev_found = false; + unsigned int sig, pf; + + mc_hdr = (struct microcode_header_intel *)data; + + list_for_each_entry_safe(iter, tmp, µcode_cache, plist) { + mc_saved_hdr = (struct microcode_header_intel *)iter->data; + sig = mc_saved_hdr->sig; + pf = mc_saved_hdr->pf; + + if (find_matching_signature(data, sig, pf)) { + prev_found = true; + + if (mc_hdr->rev <= mc_saved_hdr->rev) + continue; + + p = memdup_patch(data, size); + if (!p) + pr_err("Error allocating buffer %p\n", data); + else { + list_replace(&iter->plist, &p->plist); + kfree(iter->data); + kfree(iter); + } + } + } + + /* + * There weren't any previous patches found in the list cache; save the + * newly found. + */ + if (!prev_found) { + p = memdup_patch(data, size); + if (!p) + pr_err("Error allocating buffer for %p\n", data); + else + list_add_tail(&p->plist, µcode_cache); + } + + if (!p) + return; + + if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf)) + return; + + /* + * Save for early loading. On 32-bit, that needs to be a physical + * address as the APs are running from physical addresses, before + * paging has been enabled. + */ + if (IS_ENABLED(CONFIG_X86_32)) + intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); + else + intel_ucode_patch = p->data; +} + +static int microcode_sanity_check(void *mc, int print_err) +{ + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + u32 sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); + + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("Error: bad microcode data file size.\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + if (print_err) + pr_err("Error: invalid/unknown microcode update format.\n"); + return -EINVAL; + } + + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; + + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("Error: truncated extended signature table.\n"); + return -EINVAL; + } + + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("Error: extended signature table size mismatch.\n"); + return -EFAULT; + } + + ext_sigcount = ext_header->count; + + /* + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. + */ + ext_tablep = (u32 *)ext_header; + + i = ext_table_size / sizeof(u32); + while (i--) + ext_table_sum += ext_tablep[i]; + + if (ext_table_sum) { + if (print_err) + pr_warn("Bad extended signature table checksum, aborting.\n"); + return -EINVAL; + } + } + + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); + while (i--) + orig_sum += ((u32 *)mc)[i]; + + if (orig_sum) { + if (print_err) + pr_err("Bad microcode data checksum, aborting.\n"); + return -EINVAL; + } + + if (!ext_table_size) + return 0; + + /* + * Check extended signature checksum: 0 => valid. + */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("Bad extended signature checksum, aborting.\n"); + return -EINVAL; + } + } + return 0; +} + +/* + * Get microcode matching with BSP's model. Only CPUs with the same model as + * BSP can stay in the platform. + */ +static struct microcode_intel * +scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save) +{ + struct microcode_header_intel *mc_header; + struct microcode_intel *patch = NULL; + unsigned int mc_size; + + while (size) { + if (size < sizeof(struct microcode_header_intel)) + break; + + mc_header = (struct microcode_header_intel *)data; + + mc_size = get_totalsize(mc_header); + if (!mc_size || + mc_size > size || + microcode_sanity_check(data, 0) < 0) + break; + + size -= mc_size; + + if (!find_matching_signature(data, uci->cpu_sig.sig, + uci->cpu_sig.pf)) { + data += mc_size; + continue; + } + + if (save) { + save_microcode_patch(uci, data, mc_size); + goto next; + } + + + if (!patch) { + if (!has_newer_microcode(data, + uci->cpu_sig.sig, + uci->cpu_sig.pf, + uci->cpu_sig.rev)) + goto next; + + } else { + struct microcode_header_intel *phdr = &patch->hdr; + + if (!has_newer_microcode(data, + phdr->sig, + phdr->pf, + phdr->rev)) + goto next; + } + + /* We have a newer patch, save it. */ + patch = data; + +next: + data += mc_size; + } + + if (size) + return NULL; + + return patch; +} + +static int collect_cpu_info_early(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig = { 0 }; + unsigned int eax, ebx, ecx, edx; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = x86_family(eax); + model = x86_model(eax); + + if ((model >= 5) || (family > 6)) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + + csig.rev = intel_get_microcode_revision(); + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} + +static void show_saved_mc(void) +{ +#ifdef DEBUG + int i = 0, j; + unsigned int sig, pf, rev, total_size, data_size, date; + struct ucode_cpu_info uci; + struct ucode_patch *p; + + if (list_empty(µcode_cache)) { + pr_debug("no microcode data saved.\n"); + return; + } + + collect_cpu_info_early(&uci); + + sig = uci.cpu_sig.sig; + pf = uci.cpu_sig.pf; + rev = uci.cpu_sig.rev; + pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); + + list_for_each_entry(p, µcode_cache, plist) { + struct microcode_header_intel *mc_saved_header; + struct extended_sigtable *ext_header; + struct extended_signature *ext_sig; + int ext_sigcount; + + mc_saved_header = (struct microcode_header_intel *)p->data; + + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + rev = mc_saved_header->rev; + date = mc_saved_header->date; + + total_size = get_totalsize(mc_saved_header); + data_size = get_datasize(mc_saved_header); + + pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n", + i++, sig, pf, rev, total_size, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + continue; + + ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (j = 0; j < ext_sigcount; j++) { + sig = ext_sig->sig; + pf = ext_sig->pf; + + pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", + j, sig, pf); + + ext_sig++; + } + } +#endif +} + +/* + * Save this microcode patch. It will be loaded early when a CPU is + * hot-added or resumes. + */ +static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int size) +{ + /* Synchronization during CPU hotplug. */ + static DEFINE_MUTEX(x86_cpu_microcode_mutex); + + mutex_lock(&x86_cpu_microcode_mutex); + + save_microcode_patch(uci, mc, size); + show_saved_mc(); + + mutex_unlock(&x86_cpu_microcode_mutex); +} + +static bool load_builtin_intel_microcode(struct cpio_data *cp) +{ + unsigned int eax = 1, ebx, ecx = 0, edx; + char name[30]; + + if (IS_ENABLED(CONFIG_X86_32)) + return false; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + sprintf(name, "intel-ucode/%02x-%02x-%02x", + x86_family(eax), x86_model(eax), x86_stepping(eax)); + + return get_builtin_firmware(cp, name); +} + +/* + * Print ucode update info. + */ +static void +print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) +{ + pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); +} + +#ifdef CONFIG_X86_32 + +static int delay_ucode_info; +static int current_mc_date; + +/* + * Print early updated ucode info after printk works. This is delayed info dump. + */ +void show_ucode_info_early(void) +{ + struct ucode_cpu_info uci; + + if (delay_ucode_info) { + collect_cpu_info_early(&uci); + print_ucode_info(&uci, current_mc_date); + delay_ucode_info = 0; + } +} + +/* + * At this point, we can not call printk() yet. Delay printing microcode info in + * show_ucode_info_early() until printk() works. + */ +static void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc; + int *delay_ucode_info_p; + int *current_mc_date_p; + + mc = uci->mc; + if (!mc) + return; + + delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); + current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); + + *delay_ucode_info_p = 1; + *current_mc_date_p = mc->hdr.date; +} +#else + +static inline void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc; + + mc = uci->mc; + if (!mc) + return; + + print_ucode_info(uci, mc->hdr.date); +} +#endif + +static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) +{ + struct microcode_intel *mc; + u32 rev; + + mc = uci->mc; + if (!mc) + return 0; + + /* + * Save us the MSR write below - which is a particular expensive + * operation - when the other hyperthread has updated the microcode + * already. + */ + rev = intel_get_microcode_revision(); + if (rev >= mc->hdr.rev) { + uci->cpu_sig.rev = rev; + return UCODE_OK; + } + + /* + * Writeback and invalidate caches before updating microcode to avoid + * internal issues depending on what the microcode is updating. + */ + native_wbinvd(); + + /* write microcode via MSR 0x79 */ + native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + + rev = intel_get_microcode_revision(); + if (rev != mc->hdr.rev) + return -1; + + uci->cpu_sig.rev = rev; + + if (early) + print_ucode(uci); + else + print_ucode_info(uci, mc->hdr.date); + + return 0; +} + +int __init save_microcode_in_initrd_intel(void) +{ + struct ucode_cpu_info uci; + struct cpio_data cp; + + /* + * initrd is going away, clear patch ptr. We will scan the microcode one + * last time before jettisoning and save a patch, if found. Then we will + * update that pointer too, with a stable patch address to use when + * resuming the cores. + */ + intel_ucode_patch = NULL; + + if (!load_builtin_intel_microcode(&cp)) + cp = find_microcode_in_initrd(ucode_path, false); + + if (!(cp.data && cp.size)) + return 0; + + collect_cpu_info_early(&uci); + + scan_microcode(cp.data, cp.size, &uci, true); + + show_saved_mc(); + + return 0; +} + +/* + * @res_patch, output: a pointer to the patch we found. + */ +static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) +{ + static const char *path; + struct cpio_data cp; + bool use_pa; + + if (IS_ENABLED(CONFIG_X86_32)) { + path = (const char *)__pa_nodebug(ucode_path); + use_pa = true; + } else { + path = ucode_path; + use_pa = false; + } + + /* try built-in microcode first */ + if (!load_builtin_intel_microcode(&cp)) + cp = find_microcode_in_initrd(path, use_pa); + + if (!(cp.data && cp.size)) + return NULL; + + collect_cpu_info_early(uci); + + return scan_microcode(cp.data, cp.size, uci, false); +} + +void __init load_ucode_intel_bsp(void) +{ + struct microcode_intel *patch; + struct ucode_cpu_info uci; + + patch = __load_ucode_intel(&uci); + if (!patch) + return; + + uci.mc = patch; + + apply_microcode_early(&uci, true); +} + +void load_ucode_intel_ap(void) +{ + struct microcode_intel *patch, **iup; + struct ucode_cpu_info uci; + + if (IS_ENABLED(CONFIG_X86_32)) + iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch); + else + iup = &intel_ucode_patch; + + if (!*iup) { + patch = __load_ucode_intel(&uci); + if (!patch) + return; + + *iup = patch; + } + + uci.mc = *iup; + + apply_microcode_early(&uci, true); +} + +static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) +{ + struct microcode_header_intel *phdr; + struct ucode_patch *iter, *tmp; + + list_for_each_entry_safe(iter, tmp, µcode_cache, plist) { + + phdr = (struct microcode_header_intel *)iter->data; + + if (phdr->rev <= uci->cpu_sig.rev) + continue; + + if (!find_matching_signature(phdr, + uci->cpu_sig.sig, + uci->cpu_sig.pf)) + continue; + + return iter->data; + } + return NULL; +} + +void reload_ucode_intel(void) +{ + struct microcode_intel *p; + struct ucode_cpu_info uci; + + collect_cpu_info_early(&uci); + + p = find_patch(&uci); + if (!p) + return; + + uci.mc = p; + + apply_microcode_early(&uci, false); +} + +static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) +{ + static struct cpu_signature prev; + struct cpuinfo_x86 *c = &cpu_data(cpu_num); + unsigned int val[2]; + + memset(csig, 0, sizeof(*csig)); + + csig->sig = cpuid_eax(0x00000001); + + if ((c->x86_model >= 5) || (c->x86 > 6)) { + /* get processor flags from MSR 0x17 */ + rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig->pf = 1 << ((val[1] >> 18) & 7); + } + + csig->rev = c->microcode; + + /* No extra locking on prev, races are harmless. */ + if (csig->sig != prev.sig || csig->pf != prev.pf || csig->rev != prev.rev) { + pr_info("sig=0x%x, pf=0x%x, revision=0x%x\n", + csig->sig, csig->pf, csig->rev); + prev = *csig; + } + + return 0; +} + +static enum ucode_state apply_microcode_intel(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct cpuinfo_x86 *c = &cpu_data(cpu); + bool bsp = c->cpu_index == boot_cpu_data.cpu_index; + struct microcode_intel *mc; + enum ucode_state ret; + static int prev_rev; + u32 rev; + + /* We should bind the task to the CPU */ + if (WARN_ON(raw_smp_processor_id() != cpu)) + return UCODE_ERROR; + + /* Look for a newer patch in our cache: */ + mc = find_patch(uci); + if (!mc) { + mc = uci->mc; + if (!mc) + return UCODE_NFOUND; + } + + /* + * Save us the MSR write below - which is a particular expensive + * operation - when the other hyperthread has updated the microcode + * already. + */ + rev = intel_get_microcode_revision(); + if (rev >= mc->hdr.rev) { + ret = UCODE_OK; + goto out; + } + + /* + * Writeback and invalidate caches before updating microcode to avoid + * internal issues depending on what the microcode is updating. + */ + native_wbinvd(); + + /* write microcode via MSR 0x79 */ + wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + + rev = intel_get_microcode_revision(); + + if (rev != mc->hdr.rev) { + pr_err("CPU%d update to revision 0x%x failed\n", + cpu, mc->hdr.rev); + return UCODE_ERROR; + } + + if (bsp && rev != prev_rev) { + pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", + rev, + mc->hdr.date & 0xffff, + mc->hdr.date >> 24, + (mc->hdr.date >> 16) & 0xff); + prev_rev = rev; + } + + ret = UCODE_UPDATED; + +out: + uci->cpu_sig.rev = rev; + c->microcode = rev; + + /* Update boot_cpu_data's revision too, if we're on the BSP: */ + if (bsp) + boot_cpu_data.microcode = rev; + + return ret; +} + +static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + unsigned int curr_mc_size = 0, new_mc_size = 0; + enum ucode_state ret = UCODE_OK; + int new_rev = uci->cpu_sig.rev; + u8 *new_mc = NULL, *mc = NULL; + unsigned int csig, cpf; + + while (iov_iter_count(iter)) { + struct microcode_header_intel mc_header; + unsigned int mc_size, data_size; + u8 *data; + + if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) { + pr_err("error! Truncated or inaccessible header in microcode data file\n"); + break; + } + + mc_size = get_totalsize(&mc_header); + if (mc_size < sizeof(mc_header)) { + pr_err("error! Bad data in microcode data file (totalsize too small)\n"); + break; + } + data_size = mc_size - sizeof(mc_header); + if (data_size > iov_iter_count(iter)) { + pr_err("error! Bad data in microcode data file (truncated file?)\n"); + break; + } + + /* For performance reasons, reuse mc area when possible */ + if (!mc || mc_size > curr_mc_size) { + vfree(mc); + mc = vmalloc(mc_size); + if (!mc) + break; + curr_mc_size = mc_size; + } + + memcpy(mc, &mc_header, sizeof(mc_header)); + data = mc + sizeof(mc_header); + if (!copy_from_iter_full(data, data_size, iter) || + microcode_sanity_check(mc, 1) < 0) { + break; + } + + csig = uci->cpu_sig.sig; + cpf = uci->cpu_sig.pf; + if (has_newer_microcode(mc, csig, cpf, new_rev)) { + vfree(new_mc); + new_rev = mc_header.rev; + new_mc = mc; + new_mc_size = mc_size; + mc = NULL; /* trigger new vmalloc */ + ret = UCODE_NEW; + } + } + + vfree(mc); + + if (iov_iter_count(iter)) { + vfree(new_mc); + return UCODE_ERROR; + } + + if (!new_mc) + return UCODE_NFOUND; + + vfree(uci->mc); + uci->mc = (struct microcode_intel *)new_mc; + + /* + * If early loading microcode is supported, save this mc into + * permanent memory. So it will be loaded early when a CPU is hot added + * or resumes. + */ + save_mc_for_early(uci, new_mc, new_mc_size); + + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); + + return ret; +} + +static bool is_blacklisted(unsigned int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + + /* + * Late loading on model 79 with microcode revision less than 0x0b000021 + * and LLC size per core bigger than 2.5MB may result in a system hang. + * This behavior is documented in item BDF90, #334165 (Intel Xeon + * Processor E7-8800/4800 v4 Product Family). + */ + if (c->x86 == 6 && + c->x86_model == INTEL_FAM6_BROADWELL_X && + c->x86_stepping == 0x01 && + llc_size_per_core > 2621440 && + c->microcode < 0x0b000021) { + pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); + pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); + return true; + } + + return false; +} + +static enum ucode_state request_microcode_fw(int cpu, struct device *device, + bool refresh_fw) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + const struct firmware *firmware; + struct iov_iter iter; + enum ucode_state ret; + struct kvec kvec; + char name[30]; + + if (is_blacklisted(cpu)) + return UCODE_NFOUND; + + sprintf(name, "intel-ucode/%02x-%02x-%02x", + c->x86, c->x86_model, c->x86_stepping); + + if (request_firmware_direct(&firmware, name, device)) { + pr_debug("data file %s load failed\n", name); + return UCODE_NFOUND; + } + + kvec.iov_base = (void *)firmware->data; + kvec.iov_len = firmware->size; + iov_iter_kvec(&iter, WRITE, &kvec, 1, firmware->size); + ret = generic_load_microcode(cpu, &iter); + + release_firmware(firmware); + + return ret; +} + +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + struct iov_iter iter; + struct iovec iov; + + if (is_blacklisted(cpu)) + return UCODE_NFOUND; + + iov.iov_base = (void __user *)buf; + iov.iov_len = size; + iov_iter_init(&iter, WRITE, &iov, 1, size); + + return generic_load_microcode(cpu, &iter); +} + +static struct microcode_ops microcode_intel_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, + .collect_cpu_info = collect_cpu_info, + .apply_microcode = apply_microcode_intel, +}; + +static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) +{ + u64 llc_size = c->x86_cache_size * 1024ULL; + + do_div(llc_size, c->x86_max_cores); + + return (int)llc_size; +} + +struct microcode_ops * __init init_intel_microcode(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + pr_err("Intel CPU family 0x%x not supported\n", c->x86); + return NULL; + } + + llc_size_per_core = calc_llc_size_per_core(c); + + return µcode_intel_ops; +} diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh new file mode 100644 index 000000000..1db560ed2 --- /dev/null +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -0,0 +1,74 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h +# + +set -e + +OUT=$1 + +dump_array() +{ + ARRAY=$1 + SIZE=$2 + PFX=$3 + POSTFIX=$4 + IN=$5 + + PFX_SZ=$(echo $PFX | wc -c) + TABS="$(printf '\t\t\t\t\t')" + + echo "const char * const $ARRAY[$SIZE] = {" + + # Iterate through any input lines starting with #define $PFX + sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN | + while read i + do + # Name is everything up to the first whitespace + NAME="$(echo "$i" | sed 's/ .*//')" + + # If the /* comment */ starts with a quote string, grab that. + VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')" + [ -z "$VALUE" ] && VALUE="\"$NAME\"" + [ "$VALUE" = '""' ] && continue + + # Name is uppercase, VALUE is all lowercase + VALUE="$(echo "$VALUE" | tr A-Z a-z)" + + if [ -n "$POSTFIX" ]; then + T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 )) + TABS="$(printf '\t\t\t\t\t\t')" + TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 )) + printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE" + else + TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 )) + printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE" + fi + done + echo "};" +} + +trap 'rm "$OUT"' EXIT + +( + echo "#ifndef _ASM_X86_CPUFEATURES_H" + echo "#include " + echo "#endif" + echo "" + + dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" $2 + echo "" + + dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" $2 + echo "" + + echo "#ifdef CONFIG_X86_VMX_FEATURE_NAMES" + echo "#ifndef _ASM_X86_VMXFEATURES_H" + echo "#include " + echo "#endif" + dump_array "x86_vmx_flags" "NVMXINTS*32" "VMX_FEATURE_" "" $3 + echo "#endif /* CONFIG_X86_VMX_FEATURE_NAMES */" +) > $OUT + +trap - EXIT diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c new file mode 100644 index 000000000..021cd0677 --- /dev/null +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -0,0 +1,397 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * HyperV Detection code. + * + * Copyright (C) 2010, Novell, Inc. + * Author : K. Y. Srinivasan + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ms_hyperv_info ms_hyperv; +EXPORT_SYMBOL_GPL(ms_hyperv); + +#if IS_ENABLED(CONFIG_HYPERV) +static void (*vmbus_handler)(void); +static void (*hv_stimer0_handler)(void); +static void (*hv_kexec_handler)(void); +static void (*hv_crash_handler)(struct pt_regs *regs); + +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + inc_irq_stat(irq_hv_callback_count); + if (vmbus_handler) + vmbus_handler(); + + if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) + ack_APIC_irq(); + + set_irq_regs(old_regs); +} + +int hv_setup_vmbus_irq(int irq, void (*handler)(void)) +{ + /* + * The 'irq' argument is ignored on x86/x64 because a hard-coded + * interrupt vector is used for Hyper-V interrupts. + */ + vmbus_handler = handler; + return 0; +} + +void hv_remove_vmbus_irq(void) +{ + /* We have no way to deallocate the interrupt gate */ + vmbus_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); +EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); + +/* + * Routines to do per-architecture handling of stimer0 + * interrupts when in Direct Mode + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + inc_irq_stat(hyperv_stimer0_count); + if (hv_stimer0_handler) + hv_stimer0_handler(); + add_interrupt_randomness(HYPERV_STIMER0_VECTOR); + ack_APIC_irq(); + + set_irq_regs(old_regs); +} + +int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void)) +{ + *vector = HYPERV_STIMER0_VECTOR; + *irq = -1; /* Unused on x86/x64 */ + hv_stimer0_handler = handler; + return 0; +} +EXPORT_SYMBOL_GPL(hv_setup_stimer0_irq); + +void hv_remove_stimer0_irq(int irq) +{ + /* We have no way to deallocate the interrupt gate */ + hv_stimer0_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_stimer0_irq); + +void hv_setup_kexec_handler(void (*handler)(void)) +{ + hv_kexec_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_kexec_handler); + +void hv_remove_kexec_handler(void) +{ + hv_kexec_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_kexec_handler); + +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)) +{ + hv_crash_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_crash_handler); + +void hv_remove_crash_handler(void) +{ + hv_crash_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_crash_handler); + +#ifdef CONFIG_KEXEC_CORE +static void hv_machine_shutdown(void) +{ + if (kexec_in_progress && hv_kexec_handler) + hv_kexec_handler(); + + /* + * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor + * corrupts the old VP Assist Pages and can crash the kexec kernel. + */ + if (kexec_in_progress && hyperv_init_cpuhp > 0) + cpuhp_remove_state(hyperv_init_cpuhp); + + /* The function calls stop_other_cpus(). */ + native_machine_shutdown(); + + /* Disable the hypercall page when there is only 1 active CPU. */ + if (kexec_in_progress) + hyperv_cleanup(); +} + +static void hv_machine_crash_shutdown(struct pt_regs *regs) +{ + if (hv_crash_handler) + hv_crash_handler(regs); + + /* The function calls crash_smp_send_stop(). */ + native_machine_crash_shutdown(regs); + + /* Disable the hypercall page when there is only 1 active CPU. */ + hyperv_cleanup(); +} +#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_HYPERV */ + +static uint32_t __init ms_hyperv_platform(void) +{ + u32 eax; + u32 hyp_signature[3]; + + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return 0; + + cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, + &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); + + if (eax >= HYPERV_CPUID_MIN && + eax <= HYPERV_CPUID_MAX && + !memcmp("Microsoft Hv", hyp_signature, 12)) + return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; + + return 0; +} + +static unsigned char hv_get_nmi_reason(void) +{ + return 0; +} + +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes + * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle + * unknown NMI on the first CPU which gets it. + */ +static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) +{ + static atomic_t nmi_cpu = ATOMIC_INIT(-1); + + if (!unknown_nmi_panic) + return NMI_DONE; + + if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1) + return NMI_HANDLED; + + return NMI_DONE; +} +#endif + +static unsigned long hv_get_tsc_khz(void) +{ + unsigned long freq; + + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + + return freq / 1000; +} + +#if defined(CONFIG_SMP) && IS_ENABLED(CONFIG_HYPERV) +static void __init hv_smp_prepare_boot_cpu(void) +{ + native_smp_prepare_boot_cpu(); +#if defined(CONFIG_X86_64) && defined(CONFIG_PARAVIRT_SPINLOCKS) + hv_init_spinlocks(); +#endif +} +#endif + +static void __init ms_hyperv_init_platform(void) +{ + int hv_host_info_eax; + int hv_host_info_ebx; + int hv_host_info_ecx; + int hv_host_info_edx; + +#ifdef CONFIG_PARAVIRT + pv_info.name = "Hyper-V"; +#endif + + /* + * Extract the features and hints + */ + ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); + ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); + + pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n", + ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features); + + ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); + ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS); + + pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", + ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); + + /* + * Extract host information. + */ + if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >= + HYPERV_CPUID_VERSION) { + hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION); + hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION); + hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION); + hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION); + + pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n", + hv_host_info_eax, hv_host_info_ebx >> 16, + hv_host_info_ebx & 0xFFFF, hv_host_info_ecx, + hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF); + } + + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { + x86_platform.calibrate_tsc = hv_get_tsc_khz; + x86_platform.calibrate_cpu = hv_get_tsc_khz; + } + + if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) { + ms_hyperv.nested_features = + cpuid_eax(HYPERV_CPUID_NESTED_FEATURES); + } + + /* + * Hyper-V expects to get crash register data or kmsg when + * crash enlightment is available and system crashes. Set + * crash_kexec_post_notifiers to be true to make sure that + * calling crash enlightment interface before running kdump + * kernel. + */ + if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) + crash_kexec_post_notifiers = true; + +#ifdef CONFIG_X86_LOCAL_APIC + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { + /* + * Get the APIC frequency. + */ + u64 hv_lapic_frequency; + + rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); + hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); + lapic_timer_period = hv_lapic_frequency; + pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", + lapic_timer_period); + } + + register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST, + "hv_nmi_unknown"); +#endif + +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif + +#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE) + machine_ops.shutdown = hv_machine_shutdown; + machine_ops.crash_shutdown = hv_machine_crash_shutdown; +#endif + if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { + wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + } + + /* + * Generation 2 instances don't support reading the NMI status from + * 0x61 port. + */ + if (efi_enabled(EFI_BOOT)) + x86_platform.get_nmi_reason = hv_get_nmi_reason; + + /* + * Hyper-V VMs have a PIT emulation quirk such that zeroing the + * counter register during PIT shutdown restarts the PIT. So it + * continues to interrupt @18.2 HZ. Setting i8253_clear_counter + * to false tells pit_shutdown() not to zero the counter so that + * the PIT really is shutdown. Generation 2 VMs don't have a PIT, + * and setting this value has no effect. + */ + i8253_clear_counter_on_shutdown = false; + +#if IS_ENABLED(CONFIG_HYPERV) + /* + * Setup the hook to get control post apic initialization. + */ + x86_platform.apic_post_init = hyperv_init; + hyperv_setup_mmu_ops(); + /* Setup the IDT for hypervisor callback */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); + + /* Setup the IDT for reenlightenment notifications */ + if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) { + alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, + asm_sysvec_hyperv_reenlightenment); + } + + /* Setup the IDT for stimer0 */ + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) { + alloc_intr_gate(HYPERV_STIMER0_VECTOR, + asm_sysvec_hyperv_stimer0); + } + +# ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; +# endif + + /* + * Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic, + * set x2apic destination mode to physcial mode when x2apic is available + * and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs + * have 8-bit APIC id. + */ +# ifdef CONFIG_X86_X2APIC + if (x2apic_supported()) + x2apic_phys = 1; +# endif + + /* Register Hyper-V specific clocksource */ + hv_init_clocksource(); +#endif + /* + * TSC should be marked as unstable only after Hyper-V + * clocksource has been initialized. This ensures that the + * stability of the sched_clock is not altered. + */ + if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) + mark_tsc_unstable("running on Hyper-V"); +} + +const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { + .name = "Microsoft Hyper-V", + .detect = ms_hyperv_platform, + .type = X86_HYPER_MS_HYPERV, + .init.init_platform = ms_hyperv_init_platform, +}; diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile new file mode 100644 index 000000000..cc4f9f1cb --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-y := mtrr.o if.o generic.o cleanup.o +obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o + diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c new file mode 100644 index 000000000..a65a02720 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include "mtrr.h" + +static void +amd_get_mtrr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type *type) +{ + unsigned long low, high; + + rdmsr(MSR_K6_UWCCR, low, high); + /* Upper dword is region 1, lower is region 0 */ + if (reg == 1) + low = high; + /* The base masks off on the right alignment */ + *base = (low & 0xFFFE0000) >> PAGE_SHIFT; + *type = 0; + if (low & 1) + *type = MTRR_TYPE_UNCACHABLE; + if (low & 2) + *type = MTRR_TYPE_WRCOMB; + if (!(low & 3)) { + *size = 0; + return; + } + /* + * This needs a little explaining. The size is stored as an + * inverted mask of bits of 128K granularity 15 bits long offset + * 2 bits. + * + * So to get a size we do invert the mask and add 1 to the lowest + * mask bit (4 as its 2 bits in). This gives us a size we then shift + * to turn into 128K blocks. + * + * eg 111 1111 1111 1100 is 512K + * + * invert 000 0000 0000 0011 + * +1 000 0000 0000 0100 + * *128K ... + */ + low = (~low) & 0x1FFFC; + *size = (low + 4) << (15 - PAGE_SHIFT); +} + +/** + * amd_set_mtrr - Set variable MTRR register on the local CPU. + * + * @reg The register to set. + * @base The base address of the region. + * @size The size of the region. If this is 0 the region is disabled. + * @type The type of the region. + * + * Returns nothing. + */ +static void +amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) +{ + u32 regs[2]; + + /* + * Low is MTRR0, High MTRR 1 + */ + rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); + /* + * Blank to disable + */ + if (size == 0) { + regs[reg] = 0; + } else { + /* + * Set the register to the base, the type (off by one) and an + * inverted bitmask of the size The size is the only odd + * bit. We are fed say 512K We invert this and we get 111 1111 + * 1111 1011 but if you subtract one and invert you get the + * desired 111 1111 1111 1100 mask + * + * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! + */ + regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) + | (base << PAGE_SHIFT) | (type + 1); + } + + /* + * The writeback rule is quite specific. See the manual. Its + * disable local interrupts, write back the cache, set the mtrr + */ + wbinvd(); + wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); +} + +static int +amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +{ + /* + * Apply the K6 block alignment and size rules + * In order + * o Uncached or gathering only + * o 128K or bigger block + * o Power of 2 block + * o base suitably aligned to the power + */ + if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) + || (size & ~(size - 1)) - size || (base & (size - 1))) + return -EINVAL; + return 0; +} + +static const struct mtrr_ops amd_mtrr_ops = { + .vendor = X86_VENDOR_AMD, + .set = amd_set_mtrr, + .get = amd_get_mtrr, + .get_free_region = generic_get_free_region, + .validate_add_page = amd_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; + +int __init amd_init_mtrr(void) +{ + set_mtrr_ops(&amd_mtrr_ops); + return 0; +} diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c new file mode 100644 index 000000000..f27177816 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include +#include + +#include "mtrr.h" + +static struct { + unsigned long high; + unsigned long low; +} centaur_mcr[8]; + +static u8 centaur_mcr_reserved; +static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ + +/** + * centaur_get_free_region - Get a free MTRR. + * + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. + */ +static int +centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) +{ + unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; + + max = num_var_ranges; + if (replace_reg >= 0 && replace_reg < max) + return replace_reg; + + for (i = 0; i < max; ++i) { + if (centaur_mcr_reserved & (1 << i)) + continue; + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lsize == 0) + return i; + } + + return -ENOSPC; +} + +/* + * Report boot time MCR setups + */ +void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) +{ + centaur_mcr[mcr].low = lo; + centaur_mcr[mcr].high = hi; +} + +static void +centaur_get_mcr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type * type) +{ + *base = centaur_mcr[reg].high >> PAGE_SHIFT; + *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; + *type = MTRR_TYPE_WRCOMB; /* write-combining */ + + if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) + *type = MTRR_TYPE_UNCACHABLE; + if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) + *type = MTRR_TYPE_WRBACK; + if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) + *type = MTRR_TYPE_WRBACK; +} + +static void +centaur_set_mcr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + unsigned long low, high; + + if (size == 0) { + /* Disable */ + high = low = 0; + } else { + high = base << PAGE_SHIFT; + if (centaur_mcr_type == 0) { + /* Only support write-combining... */ + low = -size << PAGE_SHIFT | 0x1f; + } else { + if (type == MTRR_TYPE_UNCACHABLE) + low = -size << PAGE_SHIFT | 0x02; /* NC */ + else + low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */ + } + } + centaur_mcr[reg].high = high; + centaur_mcr[reg].low = low; + wrmsr(MSR_IDT_MCR0 + reg, low, high); +} + +static int +centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +{ + /* + * FIXME: Winchip2 supports uncached + */ + if (type != MTRR_TYPE_WRCOMB && + (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { + pr_warn("mtrr: only write-combining%s supported\n", + centaur_mcr_type ? " and uncacheable are" : " is"); + return -EINVAL; + } + return 0; +} + +static const struct mtrr_ops centaur_mtrr_ops = { + .vendor = X86_VENDOR_CENTAUR, + .set = centaur_set_mcr, + .get = centaur_get_mcr, + .get_free_region = centaur_get_free_region, + .validate_add_page = centaur_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; + +int __init centaur_init_mtrr(void) +{ + set_mtrr_ops(¢aur_mtrr_ops); + return 0; +} diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c new file mode 100644 index 000000000..5bd011737 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -0,0 +1,987 @@ +/* + * MTRR (Memory Type Range Register) cleanup + * + * Copyright (C) 2009 Yinghai Lu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mtrr.h" + +struct var_mtrr_range_state { + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; +}; + +struct var_mtrr_state { + unsigned long range_startk; + unsigned long range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; +}; + +/* Should be related to MTRR_VAR_RANGES nums */ +#define RANGE_NUM 256 + +static struct range __initdata range[RANGE_NUM]; +static int __initdata nr_range; + +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; + +static int __initdata debug_print; +#define Dprintk(x...) do { if (debug_print) pr_debug(x); } while (0) + +#define BIOS_BUG_MSG \ + "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" + +static int __init +x86_get_mtrr_mem_range(struct range *range, int nr_range, + unsigned long extra_remove_base, + unsigned long extra_remove_size) +{ + unsigned long base, size; + mtrr_type type; + int i; + + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_WRBACK) + continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, + base, base + size); + } + if (debug_print) { + pr_debug("After WB checking\n"); + for (i = 0; i < nr_range; i++) + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); + } + + /* Take out UC ranges: */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_UNCACHABLE && + type != MTRR_TYPE_WRPROT) + continue; + size = range_state[i].size_pfn; + if (!size) + continue; + base = range_state[i].base_pfn; + if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && + (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { + /* Var MTRR contains UC entry below 1M? Skip it: */ + pr_warn(BIOS_BUG_MSG, i); + if (base + size <= (1<<(20-PAGE_SHIFT))) + continue; + size -= (1<<(20-PAGE_SHIFT)) - base; + base = 1<<(20-PAGE_SHIFT); + } + subtract_range(range, RANGE_NUM, base, base + size); + } + if (extra_remove_size) + subtract_range(range, RANGE_NUM, extra_remove_base, + extra_remove_base + extra_remove_size); + + if (debug_print) { + pr_debug("After UC checking\n"); + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); + } + } + + /* sort the ranges */ + nr_range = clean_sort_range(range, RANGE_NUM); + if (debug_print) { + pr_debug("After sorting\n"); + for (i = 0; i < nr_range; i++) + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); + } + + return nr_range; +} + +#ifdef CONFIG_MTRR_SANITIZER + +static unsigned long __init sum_ranges(struct range *range, int nr_range) +{ + unsigned long sum = 0; + int i; + + for (i = 0; i < nr_range; i++) + sum += range[i].end - range[i].start; + + return sum; +} + +static int enable_mtrr_cleanup __initdata = + CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; + +static int __init disable_mtrr_cleanup_setup(char *str) +{ + enable_mtrr_cleanup = 0; + return 0; +} +early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); + +static int __init enable_mtrr_cleanup_setup(char *str) +{ + enable_mtrr_cleanup = 1; + return 0; +} +early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); + +static int __init mtrr_cleanup_debug_setup(char *str) +{ + debug_print = 1; + return 0; +} +early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); + +static void __init +set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type, unsigned int address_bits) +{ + u32 base_lo, base_hi, mask_lo, mask_hi; + u64 base, mask; + + if (!sizek) { + fill_mtrr_var_range(reg, 0, 0, 0, 0); + return; + } + + mask = (1ULL << address_bits) - 1; + mask &= ~((((u64)sizek) << 10) - 1); + + base = ((u64)basek) << 10; + + base |= type; + mask |= 0x800; + + base_lo = base & ((1ULL<<32) - 1); + base_hi = base >> 32; + + mask_lo = mask & ((1ULL<<32) - 1); + mask_hi = mask >> 32; + + fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); +} + +static void __init +save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type) +{ + range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); + range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); + range_state[reg].type = type; +} + +static void __init set_var_mtrr_all(unsigned int address_bits) +{ + unsigned long basek, sizek; + unsigned char type; + unsigned int reg; + + for (reg = 0; reg < num_var_ranges; reg++) { + basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10); + sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); + type = range_state[reg].type; + + set_var_mtrr(reg, basek, sizek, type, address_bits); + } +} + +static unsigned long to_size_factor(unsigned long sizek, char *factorp) +{ + unsigned long base = sizek; + char factor; + + if (base & ((1<<10) - 1)) { + /* Not MB-aligned: */ + factor = 'K'; + } else if (base & ((1<<20) - 1)) { + factor = 'M'; + base >>= 10; + } else { + factor = 'G'; + base >>= 20; + } + + *factorp = factor; + + return base; +} + +static unsigned int __init +range_to_mtrr(unsigned int reg, unsigned long range_startk, + unsigned long range_sizek, unsigned char type) +{ + if (!range_sizek || (reg >= num_var_ranges)) + return reg; + + while (range_sizek) { + unsigned long max_align, align; + unsigned long sizek; + + /* Compute the maximum size with which we can make a range: */ + if (range_startk) + max_align = __ffs(range_startk); + else + max_align = BITS_PER_LONG - 1; + + align = __fls(range_sizek); + if (align > max_align) + align = max_align; + + sizek = 1UL << align; + if (debug_print) { + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + + start_base = to_size_factor(range_startk, &start_factor); + size_base = to_size_factor(sizek, &size_factor); + + Dprintk("Setting variable MTRR %d, " + "base: %ld%cB, range: %ld%cB, type %s\n", + reg, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other") + ); + } + save_var_mtrr(reg++, range_startk, sizek, type); + range_startk += sizek; + range_sizek -= sizek; + if (reg >= num_var_ranges) + break; + } + return reg; +} + +static unsigned __init +range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, + unsigned long sizek) +{ + unsigned long hole_basek, hole_sizek; + unsigned long second_sizek; + unsigned long range0_basek, range0_sizek; + unsigned long range_basek, range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + + hole_basek = 0; + hole_sizek = 0; + second_sizek = 0; + chunk_sizek = state->chunk_sizek; + gran_sizek = state->gran_sizek; + + /* Align with gran size, prevent small block used up MTRRs: */ + range_basek = ALIGN(state->range_startk, gran_sizek); + if ((range_basek > basek) && basek) + return second_sizek; + + state->range_sizek -= (range_basek - state->range_startk); + range_sizek = ALIGN(state->range_sizek, gran_sizek); + + while (range_sizek > state->range_sizek) { + range_sizek -= gran_sizek; + if (!range_sizek) + return 0; + } + state->range_sizek = range_sizek; + + /* Try to append some small hole: */ + range0_basek = state->range_startk; + range0_sizek = ALIGN(state->range_sizek, chunk_sizek); + + /* No increase: */ + if (range0_sizek == state->range_sizek) { + Dprintk("rangeX: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + state->range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + state->range_sizek, MTRR_TYPE_WRBACK); + return 0; + } + + /* Only cut back when it is not the last: */ + if (sizek) { + while (range0_basek + range0_sizek > (basek + sizek)) { + if (range0_sizek >= chunk_sizek) + range0_sizek -= chunk_sizek; + else + range0_sizek = 0; + + if (!range0_sizek) + break; + } + } + +second_try: + range_basek = range0_basek + range0_sizek; + + /* One hole in the middle: */ + if (range_basek > basek && range_basek <= (basek + sizek)) + second_sizek = range_basek - basek; + + if (range0_sizek > state->range_sizek) { + + /* One hole in middle or at the end: */ + hole_sizek = range0_sizek - state->range_sizek - second_sizek; + + /* Hole size should be less than half of range0 size: */ + if (hole_sizek >= (range0_sizek >> 1) && + range0_sizek >= chunk_sizek) { + range0_sizek -= chunk_sizek; + second_sizek = 0; + hole_sizek = 0; + + goto second_try; + } + } + + if (range0_sizek) { + Dprintk("range0: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + range0_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + range0_sizek, MTRR_TYPE_WRBACK); + } + + if (range0_sizek < state->range_sizek) { + /* Need to handle left over range: */ + range_sizek = state->range_sizek - range0_sizek; + + Dprintk("range: %016lx - %016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + + state->reg = range_to_mtrr(state->reg, range_basek, + range_sizek, MTRR_TYPE_WRBACK); + } + + if (hole_sizek) { + hole_basek = range_basek - hole_sizek - second_sizek; + Dprintk("hole: %016lx - %016lx\n", + hole_basek<<10, + (hole_basek + hole_sizek)<<10); + state->reg = range_to_mtrr(state->reg, hole_basek, + hole_sizek, MTRR_TYPE_UNCACHABLE); + } + + return second_sizek; +} + +static void __init +set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, + unsigned long size_pfn) +{ + unsigned long basek, sizek; + unsigned long second_sizek = 0; + + if (state->reg >= num_var_ranges) + return; + + basek = base_pfn << (PAGE_SHIFT - 10); + sizek = size_pfn << (PAGE_SHIFT - 10); + + /* See if I can merge with the last range: */ + if ((basek <= 1024) || + (state->range_startk + state->range_sizek == basek)) { + unsigned long endk = basek + sizek; + state->range_sizek = endk - state->range_startk; + return; + } + /* Write the range mtrrs: */ + if (state->range_sizek != 0) + second_sizek = range_to_mtrr_with_hole(state, basek, sizek); + + /* Allocate an msr: */ + state->range_startk = basek + second_sizek; + state->range_sizek = sizek - second_sizek; +} + +/* Mininum size of mtrr block that can take hole: */ +static u64 mtrr_chunk_size __initdata = (256ULL<<20); + +static int __init parse_mtrr_chunk_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_chunk_size = memparse(p, &p); + return 0; +} +early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); + +/* Granularity of mtrr of block: */ +static u64 mtrr_gran_size __initdata; + +static int __init parse_mtrr_gran_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_gran_size = memparse(p, &p); + return 0; +} +early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); + +static unsigned long nr_mtrr_spare_reg __initdata = + CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; + +static int __init parse_mtrr_spare_reg(char *arg) +{ + if (arg) + nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); + return 0; +} +early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); + +static int __init +x86_setup_var_mtrrs(struct range *range, int nr_range, + u64 chunk_size, u64 gran_size) +{ + struct var_mtrr_state var_state; + int num_reg; + int i; + + var_state.range_startk = 0; + var_state.range_sizek = 0; + var_state.reg = 0; + var_state.chunk_sizek = chunk_size >> 10; + var_state.gran_sizek = gran_size >> 10; + + memset(range_state, 0, sizeof(range_state)); + + /* Write the range: */ + for (i = 0; i < nr_range; i++) { + set_var_mtrr_range(&var_state, range[i].start, + range[i].end - range[i].start); + } + + /* Write the last range: */ + if (var_state.range_sizek != 0) + range_to_mtrr_with_hole(&var_state, 0, 0); + + num_reg = var_state.reg; + /* Clear out the extra MTRR's: */ + while (var_state.reg < num_var_ranges) { + save_var_mtrr(var_state.reg, 0, 0, 0); + var_state.reg++; + } + + return num_reg; +} + +struct mtrr_cleanup_result { + unsigned long gran_sizek; + unsigned long chunk_sizek; + unsigned long lose_cover_sizek; + unsigned int num_reg; + int bad; +}; + +/* + * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G + * chunk size: gran_size, ..., 2G + * so we need (1+16)*8 + */ +#define NUM_RESULT 136 +#define PSHIFT (PAGE_SHIFT - 10) + +static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; +static unsigned long __initdata min_loss_pfn[RANGE_NUM]; + +static void __init print_out_mtrr_range_state(void) +{ + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + mtrr_type type; + int i; + + for (i = 0; i < num_var_ranges; i++) { + + size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); + if (!size_base) + continue; + + size_base = to_size_factor(size_base, &size_factor), + start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); + start_base = to_size_factor(start_base, &start_factor), + type = range_state[i].type; + + pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + i, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRPROT) ? "WP" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) + ); + } +} + +static int __init mtrr_need_cleanup(void) +{ + int i; + mtrr_type type; + unsigned long size; + /* Extra one for all 0: */ + int num[MTRR_NUM_TYPES + 1]; + + /* Check entries number: */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + size = range_state[i].size_pfn; + if (type >= MTRR_NUM_TYPES) + continue; + if (!size) + type = MTRR_NUM_TYPES; + num[type]++; + } + + /* Check if we got UC entries: */ + if (!num[MTRR_TYPE_UNCACHABLE]) + return 0; + + /* Check if we only had WB and UC */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + return 1; +} + +static unsigned long __initdata range_sums; + +static void __init +mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long x_remove_base, + unsigned long x_remove_size, int i) +{ + /* + * range_new should really be an automatic variable, but + * putting 4096 bytes on the stack is frowned upon, to put it + * mildly. It is safe to make it a static __initdata variable, + * since mtrr_calc_range_state is only called during init and + * there's no way it will call itself recursively. + */ + static struct range range_new[RANGE_NUM] __initdata; + unsigned long range_sums_new; + int nr_range_new; + int num_reg; + + /* Convert ranges to var ranges state: */ + num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); + + /* We got new setting in range_state, check it: */ + memset(range_new, 0, sizeof(range_new)); + nr_range_new = x86_get_mtrr_mem_range(range_new, 0, + x_remove_base, x_remove_size); + range_sums_new = sum_ranges(range_new, nr_range_new); + + result[i].chunk_sizek = chunk_size >> 10; + result[i].gran_sizek = gran_size >> 10; + result[i].num_reg = num_reg; + + if (range_sums < range_sums_new) { + result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT; + result[i].bad = 1; + } else { + result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; + } + + /* Double check it: */ + if (!result[i].bad && !result[i].lose_cover_sizek) { + if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; + } + + if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg])) + min_loss_pfn[num_reg] = range_sums - range_sums_new; +} + +static void __init mtrr_print_out_one_result(int i) +{ + unsigned long gran_base, chunk_base, lose_base; + char gran_factor, chunk_factor, lose_factor; + + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor); + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor); + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor); + + pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? "*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? "-" : "", + lose_base, lose_factor); +} + +static int __init mtrr_search_optimal_index(void) +{ + int num_reg_good; + int index_good; + int i; + + if (nr_mtrr_spare_reg >= num_var_ranges) + nr_mtrr_spare_reg = num_var_ranges - 1; + + num_reg_good = -1; + for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { + if (!min_loss_pfn[i]) + num_reg_good = i; + } + + index_good = -1; + if (num_reg_good != -1) { + for (i = 0; i < NUM_RESULT; i++) { + if (!result[i].bad && + result[i].num_reg == num_reg_good && + !result[i].lose_cover_sizek) { + index_good = i; + break; + } + } + } + + return index_good; +} + +int __init mtrr_cleanup(unsigned address_bits) +{ + unsigned long x_remove_base, x_remove_size; + unsigned long base, size, def, dummy; + u64 chunk_size, gran_size; + mtrr_type type; + int index_good; + int i; + + if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) + return 0; + + rdmsr(MSR_MTRRdefType, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* Get it and store it aside: */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* Check if we need handle it and can handle it: */ + if (!mtrr_need_cleanup()) + return 0; + + /* Print original var MTRRs at first, for debugging: */ + pr_debug("original variable MTRRs\n"); + print_out_mtrr_range_state(); + + memset(range, 0, sizeof(range)); + x_remove_size = 0; + x_remove_base = 1 << (32 - PAGE_SHIFT); + if (mtrr_tom2) + x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base; + + /* + * [0, 1M) should always be covered by var mtrr with WB + * and fixed mtrrs should take effect before var mtrr for it: + */ + nr_range = add_range_with_merge(range, RANGE_NUM, 0, 0, + 1ULL<<(20 - PAGE_SHIFT)); + /* add from var mtrr at last */ + nr_range = x86_get_mtrr_mem_range(range, nr_range, + x_remove_base, x_remove_size); + + range_sums = sum_ranges(range, nr_range); + pr_info("total RAM covered: %ldM\n", + range_sums >> (20 - PAGE_SHIFT)); + + if (mtrr_chunk_size && mtrr_gran_size) { + i = 0; + mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, + x_remove_base, x_remove_size, i); + + mtrr_print_out_one_result(i); + + if (!result[i].bad) { + set_var_mtrr_all(address_bits); + pr_debug("New variable MTRRs\n"); + print_out_mtrr_range_state(); + return 1; + } + pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n"); + } + + i = 0; + memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); + memset(result, 0, sizeof(result)); + for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { + + for (chunk_size = gran_size; chunk_size < (1ULL<<32); + chunk_size <<= 1) { + + if (i >= NUM_RESULT) + continue; + + mtrr_calc_range_state(chunk_size, gran_size, + x_remove_base, x_remove_size, i); + if (debug_print) { + mtrr_print_out_one_result(i); + pr_info("\n"); + } + + i++; + } + } + + /* Try to find the optimal index: */ + index_good = mtrr_search_optimal_index(); + + if (index_good != -1) { + pr_info("Found optimal setting for mtrr clean up\n"); + i = index_good; + mtrr_print_out_one_result(i); + + /* Convert ranges to var ranges state: */ + chunk_size = result[i].chunk_sizek; + chunk_size <<= 10; + gran_size = result[i].gran_sizek; + gran_size <<= 10; + x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); + set_var_mtrr_all(address_bits); + pr_debug("New variable MTRRs\n"); + print_out_mtrr_range_state(); + return 1; + } else { + /* print out all */ + for (i = 0; i < NUM_RESULT; i++) + mtrr_print_out_one_result(i); + } + + pr_info("mtrr_cleanup: can not find optimal value\n"); + pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n"); + + return 0; +} +#else +int __init mtrr_cleanup(unsigned address_bits) +{ + return 0; +} +#endif + +static int disable_mtrr_trim; + +static int __init disable_mtrr_trim_setup(char *str) +{ + disable_mtrr_trim = 1; + return 0; +} +early_param("disable_mtrr_trim", disable_mtrr_trim_setup); + +/* + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB + * for memory >4GB. Check for that here. + * Note this won't check if the MTRRs < 4GB where the magic bit doesn't + * apply to are wrong, but so far we don't know of any such case in the wild. + */ +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) + +int __init amd_special_default_mtrr(void) +{ + u32 l, h; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return 0; + if (boot_cpu_data.x86 < 0xf) + return 0; + /* In case some hypervisor doesn't pass SYSCFG through: */ + if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + return 0; + /* + * Memory between 4GB and top of mem is forced WB by this magic bit. + * Reserved before K8RevF, but should be zero there. + */ + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == + (Tom2Enabled | Tom2ForceMemTypeWB)) + return 1; + return 0; +} + +static u64 __init +real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) +{ + u64 trim_start, trim_size; + + trim_start = start_pfn; + trim_start <<= PAGE_SHIFT; + + trim_size = limit_pfn; + trim_size <<= PAGE_SHIFT; + trim_size -= trim_start; + + return e820__range_update(trim_start, trim_size, E820_TYPE_RAM, E820_TYPE_RESERVED); +} + +/** + * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs + * @end_pfn: ending page frame number + * + * Some buggy BIOSes don't setup the MTRRs properly for systems with certain + * memory configurations. This routine checks that the highest MTRR matches + * the end of memory, to make sure the MTRRs having a write back type cover + * all of the memory the kernel is intending to use. If not, it'll trim any + * memory off the end by adjusting end_pfn, removing it from the kernel's + * allocation pools, warning the user with an obnoxious message. + */ +int __init mtrr_trim_uncached_memory(unsigned long end_pfn) +{ + unsigned long i, base, size, highest_pfn = 0, def, dummy; + mtrr_type type; + u64 total_trim_size; + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; + + /* + * Make sure we only trim uncachable memory on machines that + * support the Intel MTRR architecture: + */ + if (!is_cpu(INTEL) || disable_mtrr_trim) + return 0; + + rdmsr(MSR_MTRRdefType, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* Get it and store it aside: */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* Find highest cached pfn: */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_WRBACK) + continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; + if (highest_pfn < base + size) + highest_pfn = base + size; + } + + /* kvm/qemu doesn't have mtrr set right, don't trim them all: */ + if (!highest_pfn) { + pr_info("CPU MTRRs all blank - virtualized system.\n"); + return 0; + } + + /* Check entries number: */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type >= MTRR_NUM_TYPES) + continue; + size = range_state[i].size_pfn; + if (!size) + type = MTRR_NUM_TYPES; + num[type]++; + } + + /* No entry for WB? */ + if (!num[MTRR_TYPE_WRBACK]) + return 0; + + /* Check if we only had WB and UC: */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + memset(range, 0, sizeof(range)); + nr_range = 0; + if (mtrr_tom2) { + range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); + range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT; + if (highest_pfn < range[nr_range].end) + highest_pfn = range[nr_range].end; + nr_range++; + } + nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); + + /* Check the head: */ + total_trim_size = 0; + if (range[0].start) + total_trim_size += real_trim_memory(0, range[0].start); + + /* Check the holes: */ + for (i = 0; i < nr_range - 1; i++) { + if (range[i].end < range[i+1].start) + total_trim_size += real_trim_memory(range[i].end, + range[i+1].start); + } + + /* Check the top: */ + i = nr_range - 1; + if (range[i].end < end_pfn) + total_trim_size += real_trim_memory(range[i].end, + end_pfn); + + if (total_trim_size) { + pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", + total_trim_size >> 20); + + if (!changed_by_mtrr_cleanup) + WARN_ON(1); + + pr_info("update e820 for mtrr\n"); + e820__update_table_print(); + + return 1; + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c new file mode 100644 index 000000000..ca670919b --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -0,0 +1,284 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include +#include +#include +#include + +#include "mtrr.h" + +static void +cyrix_get_arr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type * type) +{ + unsigned char arr, ccr3, rcr, shift; + unsigned long flags; + + arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ + + local_irq_save(flags); + + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + ((unsigned char *)base)[3] = getCx86(arr); + ((unsigned char *)base)[2] = getCx86(arr + 1); + ((unsigned char *)base)[1] = getCx86(arr + 2); + rcr = getCx86(CX86_RCR_BASE + reg); + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + + local_irq_restore(flags); + + shift = ((unsigned char *) base)[1] & 0x0f; + *base >>= PAGE_SHIFT; + + /* + * Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 + * Note: shift==0xf means 4G, this is unsupported. + */ + if (shift) + *size = (reg < 7 ? 0x1UL : 0x40UL) << (shift - 1); + else + *size = 0; + + /* Bit 0 is Cache Enable on ARR7, Cache Disable on ARR0-ARR6 */ + if (reg < 7) { + switch (rcr) { + case 1: + *type = MTRR_TYPE_UNCACHABLE; + break; + case 8: + *type = MTRR_TYPE_WRBACK; + break; + case 9: + *type = MTRR_TYPE_WRCOMB; + break; + case 24: + default: + *type = MTRR_TYPE_WRTHROUGH; + break; + } + } else { + switch (rcr) { + case 0: + *type = MTRR_TYPE_UNCACHABLE; + break; + case 8: + *type = MTRR_TYPE_WRCOMB; + break; + case 9: + *type = MTRR_TYPE_WRBACK; + break; + case 25: + default: + *type = MTRR_TYPE_WRTHROUGH; + break; + } + } +} + +/* + * cyrix_get_free_region - get a free ARR. + * + * @base: the starting (base) address of the region. + * @size: the size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. +*/ +static int +cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) +{ + unsigned long lbase, lsize; + mtrr_type ltype; + int i; + + switch (replace_reg) { + case 7: + if (size < 0x40) + break; + fallthrough; + case 6: + case 5: + case 4: + return replace_reg; + case 3: + case 2: + case 1: + case 0: + return replace_reg; + } + /* If we are to set up a region >32M then look at ARR7 immediately */ + if (size > 0x2000) { + cyrix_get_arr(7, &lbase, &lsize, <ype); + if (lsize == 0) + return 7; + /* Else try ARR0-ARR6 first */ + } else { + for (i = 0; i < 7; i++) { + cyrix_get_arr(i, &lbase, &lsize, <ype); + if (lsize == 0) + return i; + } + /* + * ARR0-ARR6 isn't free + * try ARR7 but its size must be at least 256K + */ + cyrix_get_arr(i, &lbase, &lsize, <ype); + if ((lsize == 0) && (size >= 0x40)) + return i; + } + return -ENOSPC; +} + +static u32 cr4, ccr3; + +static void prepare_set(void) +{ + u32 cr0; + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if (boot_cpu_has(X86_FEATURE_PGE)) { + cr4 = __read_cr4(); + __write_cr4(cr4 & ~X86_CR4_PGE); + } + + /* + * Disable and flush caches. + * Note that wbinvd flushes the TLBs as a side-effect + */ + cr0 = read_cr0() | X86_CR0_CD; + wbinvd(); + write_cr0(cr0); + wbinvd(); + + /* Cyrix ARRs - everything else was excluded at the top */ + ccr3 = getCx86(CX86_CCR3); + + /* Cyrix ARRs - everything else was excluded at the top */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); +} + +static void post_set(void) +{ + /* Flush caches and TLBs */ + wbinvd(); + + /* Cyrix ARRs - everything else was excluded at the top */ + setCx86(CX86_CCR3, ccr3); + + /* Enable caches */ + write_cr0(read_cr0() & ~X86_CR0_CD); + + /* Restore value of CR4 */ + if (boot_cpu_has(X86_FEATURE_PGE)) + __write_cr4(cr4); +} + +static void cyrix_set_arr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + unsigned char arr, arr_type, arr_size; + + arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ + + /* count down from 32M (ARR0-ARR6) or from 2G (ARR7) */ + if (reg >= 7) + size >>= 6; + + size &= 0x7fff; /* make sure arr_size <= 14 */ + for (arr_size = 0; size; arr_size++, size >>= 1) + ; + + if (reg < 7) { + switch (type) { + case MTRR_TYPE_UNCACHABLE: + arr_type = 1; + break; + case MTRR_TYPE_WRCOMB: + arr_type = 9; + break; + case MTRR_TYPE_WRTHROUGH: + arr_type = 24; + break; + default: + arr_type = 8; + break; + } + } else { + switch (type) { + case MTRR_TYPE_UNCACHABLE: + arr_type = 0; + break; + case MTRR_TYPE_WRCOMB: + arr_type = 8; + break; + case MTRR_TYPE_WRTHROUGH: + arr_type = 25; + break; + default: + arr_type = 9; + break; + } + } + + prepare_set(); + + base <<= PAGE_SHIFT; + setCx86(arr + 0, ((unsigned char *)&base)[3]); + setCx86(arr + 1, ((unsigned char *)&base)[2]); + setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size); + setCx86(CX86_RCR_BASE + reg, arr_type); + + post_set(); +} + +typedef struct { + unsigned long base; + unsigned long size; + mtrr_type type; +} arr_state_t; + +static arr_state_t arr_state[8] = { + {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, + {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL} +}; + +static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 }; + +static void cyrix_set_all(void) +{ + int i; + + prepare_set(); + + /* the CCRs are not contiguous */ + for (i = 0; i < 4; i++) + setCx86(CX86_CCR0 + i, ccr_state[i]); + for (; i < 7; i++) + setCx86(CX86_CCR4 + i, ccr_state[i]); + + for (i = 0; i < 8; i++) { + cyrix_set_arr(i, arr_state[i].base, + arr_state[i].size, arr_state[i].type); + } + + post_set(); +} + +static const struct mtrr_ops cyrix_mtrr_ops = { + .vendor = X86_VENDOR_CYRIX, + .set_all = cyrix_set_all, + .set = cyrix_set_arr, + .get = cyrix_get_arr, + .get_free_region = cyrix_get_free_region, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; + +int __init cyrix_init_mtrr(void) +{ + set_mtrr_ops(&cyrix_mtrr_ops); + return 0; +} diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c new file mode 100644 index 000000000..a29997e6c --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -0,0 +1,925 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong + * because MTRRs can span up to 40 bits (36bits on most modern x86) + */ +#define DEBUG + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "mtrr.h" + +struct fixed_range_block { + int base_msr; /* start address of an MTRR block */ + int ranges; /* number of MTRRs in this block */ +}; + +static struct fixed_range_block fixed_range_blocks[] = { + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ + {} +}; + +static unsigned long smp_changes_mask; +static int mtrr_state_set; +u64 mtrr_tom2; + +struct mtrr_state_type mtrr_state; +EXPORT_SYMBOL_GPL(mtrr_state); + +/* + * BIOS is expected to clear MtrrFixDramModEn bit, see for example + * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD + * Opteron Processors" (26094 Rev. 3.30 February 2006), section + * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set + * to 1 during BIOS initialization of the fixed MTRRs, then cleared to + * 0 for operation." + */ +static inline void k8_check_syscfg_dram_mod_en(void) +{ + u32 lo, hi; + + if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && + (boot_cpu_data.x86 >= 0x0f))) + return; + + rdmsr(MSR_K8_SYSCFG, lo, hi); + if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { + pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" + " not cleared by BIOS, clearing this bit\n", + smp_processor_id()); + lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; + mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi); + } +} + +/* Get the size of contiguous MTRR range */ +static u64 get_mtrr_size(u64 mask) +{ + u64 size; + + mask >>= PAGE_SHIFT; + mask |= size_or_mask; + size = -mask; + size <<= PAGE_SHIFT; + return size; +} + +/* + * Check and return the effective type for MTRR-MTRR type overlap. + * Returns 1 if the effective type is UNCACHEABLE, else returns 0 + */ +static int check_type_overlap(u8 *prev, u8 *curr) +{ + if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) { + *prev = MTRR_TYPE_UNCACHABLE; + *curr = MTRR_TYPE_UNCACHABLE; + return 1; + } + + if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) || + (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) { + *prev = MTRR_TYPE_WRTHROUGH; + *curr = MTRR_TYPE_WRTHROUGH; + } + + if (*prev != *curr) { + *prev = MTRR_TYPE_UNCACHABLE; + *curr = MTRR_TYPE_UNCACHABLE; + return 1; + } + + return 0; +} + +/** + * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries + * + * Return the MTRR fixed memory type of 'start'. + * + * MTRR fixed entries are divided into the following ways: + * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges + * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges + * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges + * + * Return Values: + * MTRR_TYPE_(type) - Matched memory type + * MTRR_TYPE_INVALID - Unmatched + */ +static u8 mtrr_type_lookup_fixed(u64 start, u64 end) +{ + int idx; + + if (start >= 0x100000) + return MTRR_TYPE_INVALID; + + /* 0x0 - 0x7FFFF */ + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state.fixed_ranges[idx]; + /* 0x80000 - 0xBFFFF */ + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state.fixed_ranges[idx]; + } + + /* 0xC0000 - 0xFFFFF */ + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state.fixed_ranges[idx]; +} + +/** + * mtrr_type_lookup_variable - look up memory type in MTRR variable entries + * + * Return Value: + * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched) + * + * Output Arguments: + * repeat - Set to 1 when [start:end] spanned across MTRR range and type + * returned corresponds only to [start:*partial_end]. Caller has + * to lookup again for [*partial_end:end]. + * + * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the + * region is fully covered by a single MTRR entry or the default + * type. + */ +static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, + int *repeat, u8 *uniform) +{ + int i; + u64 base, mask; + u8 prev_match, curr_match; + + *repeat = 0; + *uniform = 1; + + prev_match = MTRR_TYPE_INVALID; + for (i = 0; i < num_var_ranges; ++i) { + unsigned short start_state, end_state, inclusive; + + if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11))) + continue; + + base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) + + (mtrr_state.var_ranges[i].base_lo & PAGE_MASK); + mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) + + (mtrr_state.var_ranges[i].mask_lo & PAGE_MASK); + + start_state = ((start & mask) == (base & mask)); + end_state = ((end & mask) == (base & mask)); + inclusive = ((start < base) && (end > base)); + + if ((start_state != end_state) || inclusive) { + /* + * We have start:end spanning across an MTRR. + * We split the region into either + * + * - start_state:1 + * (start:mtrr_end)(mtrr_end:end) + * - end_state:1 + * (start:mtrr_start)(mtrr_start:end) + * - inclusive:1 + * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end) + * + * depending on kind of overlap. + * + * Return the type of the first region and a pointer + * to the start of next region so that caller will be + * advised to lookup again after having adjusted start + * and end. + * + * Note: This way we handle overlaps with multiple + * entries and the default type properly. + */ + if (start_state) + *partial_end = base + get_mtrr_size(mask); + else + *partial_end = base; + + if (unlikely(*partial_end <= start)) { + WARN_ON(1); + *partial_end = start + PAGE_SIZE; + } + + end = *partial_end - 1; /* end is inclusive */ + *repeat = 1; + *uniform = 0; + } + + if ((start & mask) != (base & mask)) + continue; + + curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; + if (prev_match == MTRR_TYPE_INVALID) { + prev_match = curr_match; + continue; + } + + *uniform = 0; + if (check_type_overlap(&prev_match, &curr_match)) + return curr_match; + } + + if (prev_match != MTRR_TYPE_INVALID) + return prev_match; + + return mtrr_state.def_type; +} + +/** + * mtrr_type_lookup - look up memory type in MTRR + * + * Return Values: + * MTRR_TYPE_(type) - The effective MTRR type for the region + * MTRR_TYPE_INVALID - MTRR is disabled + * + * Output Argument: + * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the + * region is fully covered by a single MTRR entry or the default + * type. + */ +u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) +{ + u8 type, prev_type, is_uniform = 1, dummy; + int repeat; + u64 partial_end; + + /* Make end inclusive instead of exclusive */ + end--; + + if (!mtrr_state_set) + return MTRR_TYPE_INVALID; + + if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) + return MTRR_TYPE_INVALID; + + /* + * Look up the fixed ranges first, which take priority over + * the variable ranges. + */ + if ((start < 0x100000) && + (mtrr_state.have_fixed) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { + is_uniform = 0; + type = mtrr_type_lookup_fixed(start, end); + goto out; + } + + /* + * Look up the variable ranges. Look of multiple ranges matching + * this address and pick type as per MTRR precedence. + */ + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &is_uniform); + + /* + * Common path is with repeat = 0. + * However, we can have cases where [start:end] spans across some + * MTRR ranges and/or the default type. Do repeated lookups for + * that case here. + */ + while (repeat) { + prev_type = type; + start = partial_end; + is_uniform = 0; + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &dummy); + + if (check_type_overlap(&prev_type, &type)) + goto out; + } + + if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2)) + type = MTRR_TYPE_WRBACK; + +out: + *uniform = is_uniform; + return type; +} + +/* Get the MSR pair relating to a var range */ +static void +get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) +{ + rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); + rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); +} + +/* Fill the MSR pair relating to a var range */ +void fill_mtrr_var_range(unsigned int index, + u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) +{ + struct mtrr_var_range *vr; + + vr = mtrr_state.var_ranges; + + vr[index].base_lo = base_lo; + vr[index].base_hi = base_hi; + vr[index].mask_lo = mask_lo; + vr[index].mask_hi = mask_hi; +} + +static void get_fixed_ranges(mtrr_type *frs) +{ + unsigned int *p = (unsigned int *)frs; + int i; + + k8_check_syscfg_dram_mod_en(); + + rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); + + for (i = 0; i < 2; i++) + rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); + for (i = 0; i < 8; i++) + rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); +} + +void mtrr_save_fixed_ranges(void *info) +{ + if (boot_cpu_has(X86_FEATURE_MTRR)) + get_fixed_ranges(mtrr_state.fixed_ranges); +} + +static unsigned __initdata last_fixed_start; +static unsigned __initdata last_fixed_end; +static mtrr_type __initdata last_fixed_type; + +static void __init print_fixed_last(void) +{ + if (!last_fixed_end) + return; + + pr_debug(" %05X-%05X %s\n", last_fixed_start, + last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); + + last_fixed_end = 0; +} + +static void __init update_fixed_last(unsigned base, unsigned end, + mtrr_type type) +{ + last_fixed_start = base; + last_fixed_end = end; + last_fixed_type = type; +} + +static void __init +print_fixed(unsigned base, unsigned step, const mtrr_type *types) +{ + unsigned i; + + for (i = 0; i < 8; ++i, ++types, base += step) { + if (last_fixed_end == 0) { + update_fixed_last(base, base + step, *types); + continue; + } + if (last_fixed_end == base && last_fixed_type == *types) { + last_fixed_end = base + step; + continue; + } + /* new segments: gap or different type */ + print_fixed_last(); + update_fixed_last(base, base + step, *types); + } +} + +static void prepare_set(void); +static void post_set(void); + +static void __init print_mtrr_state(void) +{ + unsigned int i; + int high_width; + + pr_debug("MTRR default type: %s\n", + mtrr_attrib_to_str(mtrr_state.def_type)); + if (mtrr_state.have_fixed) { + pr_debug("MTRR fixed ranges %sabled:\n", + ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? + "en" : "dis"); + print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); + for (i = 0; i < 2; ++i) + print_fixed(0x80000 + i * 0x20000, 0x04000, + mtrr_state.fixed_ranges + (i + 1) * 8); + for (i = 0; i < 8; ++i) + print_fixed(0xC0000 + i * 0x08000, 0x01000, + mtrr_state.fixed_ranges + (i + 3) * 8); + + /* tail */ + print_fixed_last(); + } + pr_debug("MTRR variable ranges %sabled:\n", + mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); + high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; + + for (i = 0; i < num_var_ranges; ++i) { + if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) + pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); + else + pr_debug(" %u disabled\n", i); + } + if (mtrr_tom2) + pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); +} + +/* PAT setup for BP. We need to go through sync steps here */ +void __init mtrr_bp_pat_init(void) +{ + unsigned long flags; + + local_irq_save(flags); + prepare_set(); + + pat_init(); + + post_set(); + local_irq_restore(flags); +} + +/* Grab all of the MTRR state for this CPU into *state */ +bool __init get_mtrr_state(void) +{ + struct mtrr_var_range *vrs; + unsigned lo, dummy; + unsigned int i; + + vrs = mtrr_state.var_ranges; + + rdmsr(MSR_MTRRcap, lo, dummy); + mtrr_state.have_fixed = (lo >> 8) & 1; + + for (i = 0; i < num_var_ranges; i++) + get_mtrr_var_range(i, &vrs[i]); + if (mtrr_state.have_fixed) + get_fixed_ranges(mtrr_state.fixed_ranges); + + rdmsr(MSR_MTRRdefType, lo, dummy); + mtrr_state.def_type = (lo & 0xff); + mtrr_state.enabled = (lo & 0xc00) >> 10; + + if (amd_special_default_mtrr()) { + unsigned low, high; + + /* TOP_MEM2 */ + rdmsr(MSR_K8_TOP_MEM2, low, high); + mtrr_tom2 = high; + mtrr_tom2 <<= 32; + mtrr_tom2 |= low; + mtrr_tom2 &= 0xffffff800000ULL; + } + + print_mtrr_state(); + + mtrr_state_set = 1; + + return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); +} + +/* Some BIOS's are messed up and don't set all MTRRs the same! */ +void __init mtrr_state_warn(void) +{ + unsigned long mask = smp_changes_mask; + + if (!mask) + return; + if (mask & MTRR_CHANGE_MASK_FIXED) + pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + if (mask & MTRR_CHANGE_MASK_VARIABLE) + pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n"); + if (mask & MTRR_CHANGE_MASK_DEFTYPE) + pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + + pr_info("mtrr: probably your BIOS does not setup all CPUs.\n"); + pr_info("mtrr: corrected configuration.\n"); +} + +/* + * Doesn't attempt to pass an error out to MTRR users + * because it's quite complicated in some cases and probably not + * worth it because the best error handling is to ignore it. + */ +void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) +{ + if (wrmsr_safe(msr, a, b) < 0) { + pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", + smp_processor_id(), msr, a, b); + } +} + +/** + * set_fixed_range - checks & updates a fixed-range MTRR if it + * differs from the value it should have + * @msr: MSR address of the MTTR which should be checked and updated + * @changed: pointer which indicates whether the MTRR needed to be changed + * @msrwords: pointer to the MSR values which the MSR should have + */ +static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) +{ + unsigned lo, hi; + + rdmsr(msr, lo, hi); + + if (lo != msrwords[0] || hi != msrwords[1]) { + mtrr_wrmsr(msr, msrwords[0], msrwords[1]); + *changed = true; + } +} + +/** + * generic_get_free_region - Get a free MTRR. + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * @replace_reg: mtrr index to be replaced; set to invalid value if none. + * + * Returns: The index of the region on success, else negative on error. + */ +int +generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) +{ + unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; + + max = num_var_ranges; + if (replace_reg >= 0 && replace_reg < max) + return replace_reg; + + for (i = 0; i < max; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lsize == 0) + return i; + } + + return -ENOSPC; +} + +static void generic_get_mtrr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type *type) +{ + u32 mask_lo, mask_hi, base_lo, base_hi; + unsigned int hi; + u64 tmp, mask; + + /* + * get_mtrr doesn't need to update mtrr_state, also it could be called + * from any cpu, so try to print it out directly. + */ + get_cpu(); + + rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); + + if ((mask_lo & 0x800) == 0) { + /* Invalid (i.e. free) range */ + *base = 0; + *size = 0; + *type = 0; + goto out_put_cpu; + } + + rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); + + /* Work out the shifted address mask: */ + tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; + mask = size_or_mask | tmp; + + /* Expand tmp with high bits to all 1s: */ + hi = fls64(tmp); + if (hi > 0) { + tmp |= ~((1ULL<<(hi - 1)) - 1); + + if (tmp != mask) { + pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + mask = tmp; + } + } + + /* + * This works correctly if size is a power of two, i.e. a + * contiguous range: + */ + *size = -mask; + *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; + *type = base_lo & 0xff; + +out_put_cpu: + put_cpu(); +} + +/** + * set_fixed_ranges - checks & updates the fixed-range MTRRs if they + * differ from the saved set + * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() + */ +static int set_fixed_ranges(mtrr_type *frs) +{ + unsigned long long *saved = (unsigned long long *)frs; + bool changed = false; + int block = -1, range; + + k8_check_syscfg_dram_mod_en(); + + while (fixed_range_blocks[++block].ranges) { + for (range = 0; range < fixed_range_blocks[block].ranges; range++) + set_fixed_range(fixed_range_blocks[block].base_msr + range, + &changed, (unsigned int *)saved++); + } + + return changed; +} + +/* + * Set the MSR pair relating to a var range. + * Returns true if changes are made. + */ +static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) +{ + unsigned int lo, hi; + bool changed = false; + + rdmsr(MTRRphysBase_MSR(index), lo, hi); + if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) + || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != + (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { + + mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); + changed = true; + } + + rdmsr(MTRRphysMask_MSR(index), lo, hi); + + if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL) + || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != + (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { + mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); + changed = true; + } + return changed; +} + +static u32 deftype_lo, deftype_hi; + +/** + * set_mtrr_state - Set the MTRR state for this CPU. + * + * NOTE: The CPU must already be in a safe state for MTRR changes. + * RETURNS: 0 if no changes made, else a mask indicating what was changed. + */ +static unsigned long set_mtrr_state(void) +{ + unsigned long change_mask = 0; + unsigned int i; + + for (i = 0; i < num_var_ranges; i++) { + if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) + change_mask |= MTRR_CHANGE_MASK_VARIABLE; + } + + if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) + change_mask |= MTRR_CHANGE_MASK_FIXED; + + /* + * Set_mtrr_restore restores the old value of MTRRdefType, + * so to set it we fiddle with the saved value: + */ + if ((deftype_lo & 0xff) != mtrr_state.def_type + || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { + + deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | + (mtrr_state.enabled << 10); + change_mask |= MTRR_CHANGE_MASK_DEFTYPE; + } + + return change_mask; +} + + +static unsigned long cr4; +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); + +/* + * Since we are disabling the cache don't allow any interrupts, + * they would run extremely slow and would only increase the pain. + * + * The caller must ensure that local interrupts are disabled and + * are reenabled after post_set() has been called. + */ +static void prepare_set(void) __acquires(set_atomicity_lock) +{ + unsigned long cr0; + + /* + * Note that this is not ideal + * since the cache is only flushed/disabled for this CPU while the + * MTRRs are changed, but changing this requires more invasive + * changes to the way the kernel boots + */ + + raw_spin_lock(&set_atomicity_lock); + + /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ + cr0 = read_cr0() | X86_CR0_CD; + write_cr0(cr0); + + /* + * Cache flushing is the most time-consuming step when programming + * the MTRRs. Fortunately, as per the Intel Software Development + * Manual, we can skip it if the processor supports cache self- + * snooping. + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if (boot_cpu_has(X86_FEATURE_PGE)) { + cr4 = __read_cr4(); + __write_cr4(cr4 & ~X86_CR4_PGE); + } + + /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + flush_tlb_local(); + + /* Save MTRR state */ + rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); + + /* Disable MTRRs, and set the default type to uncached */ + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); + + /* Again, only flush caches if we have to. */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); +} + +static void post_set(void) __releases(set_atomicity_lock) +{ + /* Flush TLBs (no need to flush caches - they are disabled) */ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + flush_tlb_local(); + + /* Intel (P6) standard MTRRs */ + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); + + /* Enable caches */ + write_cr0(read_cr0() & ~X86_CR0_CD); + + /* Restore value of CR4 */ + if (boot_cpu_has(X86_FEATURE_PGE)) + __write_cr4(cr4); + raw_spin_unlock(&set_atomicity_lock); +} + +static void generic_set_all(void) +{ + unsigned long mask, count; + unsigned long flags; + + local_irq_save(flags); + prepare_set(); + + /* Actually set the state */ + mask = set_mtrr_state(); + + /* also set PAT */ + pat_init(); + + post_set(); + local_irq_restore(flags); + + /* Use the atomic bitops to update the global mask */ + for (count = 0; count < sizeof(mask) * 8; ++count) { + if (mask & 0x01) + set_bit(count, &smp_changes_mask); + mask >>= 1; + } + +} + +/** + * generic_set_mtrr - set variable MTRR register on the local CPU. + * + * @reg: The register to set. + * @base: The base address of the region. + * @size: The size of the region. If this is 0 the region is disabled. + * @type: The type of the region. + * + * Returns nothing. + */ +static void generic_set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + unsigned long flags; + struct mtrr_var_range *vr; + + vr = &mtrr_state.var_ranges[reg]; + + local_irq_save(flags); + prepare_set(); + + if (size == 0) { + /* + * The invalid bit is kept in the mask, so we simply + * clear the relevant mask register to disable a range. + */ + mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); + memset(vr, 0, sizeof(struct mtrr_var_range)); + } else { + vr->base_lo = base << PAGE_SHIFT | type; + vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT); + vr->mask_lo = -size << PAGE_SHIFT | 0x800; + vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT); + + mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi); + mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi); + } + + post_set(); + local_irq_restore(flags); +} + +int generic_validate_add_page(unsigned long base, unsigned long size, + unsigned int type) +{ + unsigned long lbase, last; + + /* + * For Intel PPro stepping <= 7 + * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF + */ + if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == 1 && + boot_cpu_data.x86_stepping <= 7) { + if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { + pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); + return -EINVAL; + } + if (!(base + size < 0x70000 || base > 0x7003F) && + (type == MTRR_TYPE_WRCOMB + || type == MTRR_TYPE_WRBACK)) { + pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + return -EINVAL; + } + } + + /* + * Check upper bits of base and last are equal and lower bits are 0 + * for base and 1 for last + */ + last = base + size - 1; + for (lbase = base; !(lbase & 1) && (last & 1); + lbase = lbase >> 1, last = last >> 1) + ; + if (lbase != last) { + pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); + return -EINVAL; + } + return 0; +} + +static int generic_have_wrcomb(void) +{ + unsigned long config, dummy; + rdmsr(MSR_MTRRcap, config, dummy); + return config & (1 << 10); +} + +int positive_have_wrcomb(void) +{ + return 1; +} + +/* + * Generic structure... + */ +const struct mtrr_ops generic_mtrr_ops = { + .use_intel_if = 1, + .set_all = generic_set_all, + .get = generic_get_mtrr, + .get_free_region = generic_get_free_region, + .set = generic_set_mtrr, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = generic_have_wrcomb, +}; diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c new file mode 100644 index 000000000..a5c506f6d --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#define LINE_SIZE 80 + +#include + +#include "mtrr.h" + +#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) + +static const char *const mtrr_strings[MTRR_NUM_TYPES] = +{ + "uncachable", /* 0 */ + "write-combining", /* 1 */ + "?", /* 2 */ + "?", /* 3 */ + "write-through", /* 4 */ + "write-protect", /* 5 */ + "write-back", /* 6 */ +}; + +const char *mtrr_attrib_to_str(int x) +{ + return (x <= 6) ? mtrr_strings[x] : "?"; +} + +#ifdef CONFIG_PROC_FS + +static int +mtrr_file_add(unsigned long base, unsigned long size, + unsigned int type, bool increment, struct file *file, int page) +{ + unsigned int *fcount = FILE_FCOUNT(file); + int reg, max; + + max = num_var_ranges; + if (fcount == NULL) { + fcount = kcalloc(max, sizeof(*fcount), GFP_KERNEL); + if (!fcount) + return -ENOMEM; + FILE_FCOUNT(file) = fcount; + } + if (!page) { + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + return -EINVAL; + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + } + reg = mtrr_add_page(base, size, type, true); + if (reg >= 0) + ++fcount[reg]; + return reg; +} + +static int +mtrr_file_del(unsigned long base, unsigned long size, + struct file *file, int page) +{ + unsigned int *fcount = FILE_FCOUNT(file); + int reg; + + if (!page) { + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + return -EINVAL; + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + } + reg = mtrr_del_page(-1, base, size); + if (reg < 0) + return reg; + if (fcount == NULL) + return reg; + if (fcount[reg] < 1) + return -EINVAL; + --fcount[reg]; + return reg; +} + +/* + * seq_file can seek but we ignore it. + * + * Format of control line: + * "base=%Lx size=%Lx type=%s" or "disable=%d" + */ +static ssize_t +mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) +{ + int i, err; + unsigned long reg; + unsigned long long base, size; + char *ptr; + char line[LINE_SIZE]; + int length; + size_t linelen; + + memset(line, 0, LINE_SIZE); + + len = min_t(size_t, len, LINE_SIZE - 1); + length = strncpy_from_user(line, buf, len); + if (length < 0) + return length; + + linelen = strlen(line); + ptr = line + linelen - 1; + if (linelen && *ptr == '\n') + *ptr = '\0'; + + if (!strncmp(line, "disable=", 8)) { + reg = simple_strtoul(line + 8, &ptr, 0); + err = mtrr_del_page(reg, 0, 0); + if (err < 0) + return err; + return len; + } + + if (strncmp(line, "base=", 5)) + return -EINVAL; + + base = simple_strtoull(line + 5, &ptr, 0); + ptr = skip_spaces(ptr); + + if (strncmp(ptr, "size=", 5)) + return -EINVAL; + + size = simple_strtoull(ptr + 5, &ptr, 0); + if ((base & 0xfff) || (size & 0xfff)) + return -EINVAL; + ptr = skip_spaces(ptr); + + if (strncmp(ptr, "type=", 5)) + return -EINVAL; + ptr = skip_spaces(ptr + 5); + + i = match_string(mtrr_strings, MTRR_NUM_TYPES, ptr); + if (i < 0) + return i; + + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true); + if (err < 0) + return err; + return len; +} + +static long +mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) +{ + int err = 0; + mtrr_type type; + unsigned long base; + unsigned long size; + struct mtrr_sentry sentry; + struct mtrr_gentry gentry; + void __user *arg = (void __user *) __arg; + + memset(&gentry, 0, sizeof(gentry)); + + switch (cmd) { + case MTRRIOC_ADD_ENTRY: + case MTRRIOC_SET_ENTRY: + case MTRRIOC_DEL_ENTRY: + case MTRRIOC_KILL_ENTRY: + case MTRRIOC_ADD_PAGE_ENTRY: + case MTRRIOC_SET_PAGE_ENTRY: + case MTRRIOC_DEL_PAGE_ENTRY: + case MTRRIOC_KILL_PAGE_ENTRY: + if (copy_from_user(&sentry, arg, sizeof(sentry))) + return -EFAULT; + break; + case MTRRIOC_GET_ENTRY: + case MTRRIOC_GET_PAGE_ENTRY: + if (copy_from_user(&gentry, arg, sizeof(gentry))) + return -EFAULT; + break; +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_ENTRY: + case MTRRIOC32_SET_ENTRY: + case MTRRIOC32_DEL_ENTRY: + case MTRRIOC32_KILL_ENTRY: + case MTRRIOC32_ADD_PAGE_ENTRY: + case MTRRIOC32_SET_PAGE_ENTRY: + case MTRRIOC32_DEL_PAGE_ENTRY: + case MTRRIOC32_KILL_PAGE_ENTRY: { + struct mtrr_sentry32 __user *s32; + + s32 = (struct mtrr_sentry32 __user *)__arg; + err = get_user(sentry.base, &s32->base); + err |= get_user(sentry.size, &s32->size); + err |= get_user(sentry.type, &s32->type); + if (err) + return err; + break; + } + case MTRRIOC32_GET_ENTRY: + case MTRRIOC32_GET_PAGE_ENTRY: { + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; + err = get_user(gentry.regnum, &g32->regnum); + err |= get_user(gentry.base, &g32->base); + err |= get_user(gentry.size, &g32->size); + err |= get_user(gentry.type, &g32->type); + if (err) + return err; + break; + } +#endif + } + + switch (cmd) { + default: + return -ENOTTY; + case MTRRIOC_ADD_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_ENTRY: +#endif + err = + mtrr_file_add(sentry.base, sentry.size, sentry.type, true, + file, 0); + break; + case MTRRIOC_SET_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_SET_ENTRY: +#endif + err = mtrr_add(sentry.base, sentry.size, sentry.type, false); + break; + case MTRRIOC_DEL_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_DEL_ENTRY: +#endif + err = mtrr_file_del(sentry.base, sentry.size, file, 0); + break; + case MTRRIOC_KILL_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_KILL_ENTRY: +#endif + err = mtrr_del(-1, sentry.base, sentry.size); + break; + case MTRRIOC_GET_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_ENTRY: +#endif + if (gentry.regnum >= num_var_ranges) + return -EINVAL; + mtrr_if->get(gentry.regnum, &base, &size, &type); + + /* Hide entries that go above 4GB */ + if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) + || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))) + gentry.base = gentry.size = gentry.type = 0; + else { + gentry.base = base << PAGE_SHIFT; + gentry.size = size << PAGE_SHIFT; + gentry.type = type; + } + + break; + case MTRRIOC_ADD_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_PAGE_ENTRY: +#endif + err = + mtrr_file_add(sentry.base, sentry.size, sentry.type, true, + file, 1); + break; + case MTRRIOC_SET_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_SET_PAGE_ENTRY: +#endif + err = + mtrr_add_page(sentry.base, sentry.size, sentry.type, false); + break; + case MTRRIOC_DEL_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_DEL_PAGE_ENTRY: +#endif + err = mtrr_file_del(sentry.base, sentry.size, file, 1); + break; + case MTRRIOC_KILL_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_KILL_PAGE_ENTRY: +#endif + err = mtrr_del_page(-1, sentry.base, sentry.size); + break; + case MTRRIOC_GET_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_PAGE_ENTRY: +#endif + if (gentry.regnum >= num_var_ranges) + return -EINVAL; + mtrr_if->get(gentry.regnum, &base, &size, &type); + /* Hide entries that would overflow */ + if (size != (__typeof__(gentry.size))size) + gentry.base = gentry.size = gentry.type = 0; + else { + gentry.base = base; + gentry.size = size; + gentry.type = type; + } + break; + } + + if (err) + return err; + + switch (cmd) { + case MTRRIOC_GET_ENTRY: + case MTRRIOC_GET_PAGE_ENTRY: + if (copy_to_user(arg, &gentry, sizeof(gentry))) + err = -EFAULT; + break; +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_ENTRY: + case MTRRIOC32_GET_PAGE_ENTRY: { + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; + err = put_user(gentry.base, &g32->base); + err |= put_user(gentry.size, &g32->size); + err |= put_user(gentry.regnum, &g32->regnum); + err |= put_user(gentry.type, &g32->type); + break; + } +#endif + } + return err; +} + +static int mtrr_close(struct inode *ino, struct file *file) +{ + unsigned int *fcount = FILE_FCOUNT(file); + int i, max; + + if (fcount != NULL) { + max = num_var_ranges; + for (i = 0; i < max; ++i) { + while (fcount[i] > 0) { + mtrr_del(i, 0, 0); + --fcount[i]; + } + } + kfree(fcount); + FILE_FCOUNT(file) = NULL; + } + return single_release(ino, file); +} + +static int mtrr_seq_show(struct seq_file *seq, void *offset) +{ + char factor; + int i, max; + mtrr_type type; + unsigned long base, size; + + max = num_var_ranges; + for (i = 0; i < max; i++) { + mtrr_if->get(i, &base, &size, &type); + if (size == 0) { + mtrr_usage_table[i] = 0; + continue; + } + if (size < (0x100000 >> PAGE_SHIFT)) { + /* less than 1MB */ + factor = 'K'; + size <<= PAGE_SHIFT - 10; + } else { + factor = 'M'; + size >>= 20 - PAGE_SHIFT; + } + /* Base can be > 32bit */ + seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", + i, base, base >> (20 - PAGE_SHIFT), + size, factor, + mtrr_usage_table[i], mtrr_attrib_to_str(type)); + } + return 0; +} + +static int mtrr_open(struct inode *inode, struct file *file) +{ + if (!mtrr_if) + return -EIO; + if (!mtrr_if->get) + return -ENXIO; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + return single_open(file, mtrr_seq_show, NULL); +} + +static const struct proc_ops mtrr_proc_ops = { + .proc_open = mtrr_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = mtrr_write, + .proc_ioctl = mtrr_ioctl, +#ifdef CONFIG_COMPAT + .proc_compat_ioctl = mtrr_ioctl, +#endif + .proc_release = mtrr_close, +}; + +static int __init mtrr_if_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if ((!cpu_has(c, X86_FEATURE_MTRR)) && + (!cpu_has(c, X86_FEATURE_K6_MTRR)) && + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) + return -ENODEV; + + proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_proc_ops); + return 0; +} +arch_initcall(mtrr_if_init); +#endif /* CONFIG_PROC_FS */ diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c new file mode 100644 index 000000000..5f436cb4f --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -0,0 +1,888 @@ +/* Generic MTRR (Memory Type Range Register) driver. + + Copyright (C) 1997-2000 Richard Gooch + Copyright (c) 2002 Patrick Mochel + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with this library; if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Richard Gooch may be reached by email at rgooch@atnf.csiro.au + The postal address is: + Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia. + + Source: "Pentium Pro Family Developer's Manual, Volume 3: + Operating System Writer's Guide" (Intel document number 242692), + section 11.11.7 + + This was cleaned and made readable by Patrick Mochel + on 6-7 March 2002. + Source: Intel Architecture Software Developers Manual, Volume 3: + System Programming Guide; Section 9.11. (1997 edition - PPro). +*/ + +#define DEBUG + +#include /* FIXME: kvm_para.h needs this */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "mtrr.h" + +/* arch_phys_wc_add returns an MTRR register index plus this offset. */ +#define MTRR_TO_PHYS_WC_OFFSET 1000 + +u32 num_var_ranges; +static bool __mtrr_enabled; + +static bool mtrr_enabled(void) +{ + return __mtrr_enabled; +} + +unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; +static DEFINE_MUTEX(mtrr_mutex); + +u64 size_or_mask, size_and_mask; +static bool mtrr_aps_delayed_init; + +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init; + +const struct mtrr_ops *mtrr_if; + +static void set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type); + +void __init set_mtrr_ops(const struct mtrr_ops *ops) +{ + if (ops->vendor && ops->vendor < X86_VENDOR_NUM) + mtrr_ops[ops->vendor] = ops; +} + +/* Returns non-zero if we have the write-combining memory type */ +static int have_wrcomb(void) +{ + struct pci_dev *dev; + + dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); + if (dev != NULL) { + /* + * ServerWorks LE chipsets < rev 6 have problems with + * write-combining. Don't allow it and leave room for other + * chipsets to be tagged + */ + if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && + dev->device == PCI_DEVICE_ID_SERVERWORKS_LE && + dev->revision <= 5) { + pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n"); + pci_dev_put(dev); + return 0; + } + /* + * Intel 450NX errata # 23. Non ascending cacheline evictions to + * write combining memory may resulting in data corruption + */ + if (dev->vendor == PCI_VENDOR_ID_INTEL && + dev->device == PCI_DEVICE_ID_INTEL_82451NX) { + pr_info("Intel 450NX MMC detected. Write-combining disabled.\n"); + pci_dev_put(dev); + return 0; + } + pci_dev_put(dev); + } + return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; +} + +/* This function returns the number of variable MTRRs */ +static void __init set_num_var_ranges(void) +{ + unsigned long config = 0, dummy; + + if (use_intel()) + rdmsr(MSR_MTRRcap, config, dummy); + else if (is_cpu(AMD) || is_cpu(HYGON)) + config = 2; + else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) + config = 8; + + num_var_ranges = config & 0xff; +} + +static void __init init_table(void) +{ + int i, max; + + max = num_var_ranges; + for (i = 0; i < max; i++) + mtrr_usage_table[i] = 1; +} + +struct set_mtrr_data { + unsigned long smp_base; + unsigned long smp_size; + unsigned int smp_reg; + mtrr_type smp_type; +}; + +/** + * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed + * by all the CPUs. + * @info: pointer to mtrr configuration data + * + * Returns nothing. + */ +static int mtrr_rendezvous_handler(void *info) +{ + struct set_mtrr_data *data = info; + + /* + * We use this same function to initialize the mtrrs during boot, + * resume, runtime cpu online and on an explicit request to set a + * specific MTRR. + * + * During boot or suspend, the state of the boot cpu's mtrrs has been + * saved, and we want to replicate that across all the cpus that come + * online (either at the end of boot or resume or during a runtime cpu + * online). If we're doing that, @reg is set to something special and on + * all the cpu's we do mtrr_if->set_all() (On the logical cpu that + * started the boot/resume sequence, this might be a duplicate + * set_all()). + */ + if (data->smp_reg != ~0U) { + mtrr_if->set(data->smp_reg, data->smp_base, + data->smp_size, data->smp_type); + } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { + mtrr_if->set_all(); + } + return 0; +} + +static inline int types_compatible(mtrr_type type1, mtrr_type type2) +{ + return type1 == MTRR_TYPE_UNCACHABLE || + type2 == MTRR_TYPE_UNCACHABLE || + (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || + (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH); +} + +/** + * set_mtrr - update mtrrs on all processors + * @reg: mtrr in question + * @base: mtrr base + * @size: mtrr size + * @type: mtrr type + * + * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: + * + * 1. Queue work to do the following on all processors: + * 2. Disable Interrupts + * 3. Wait for all procs to do so + * 4. Enter no-fill cache mode + * 5. Flush caches + * 6. Clear PGE bit + * 7. Flush all TLBs + * 8. Disable all range registers + * 9. Update the MTRRs + * 10. Enable all range registers + * 11. Flush all TLBs and caches again + * 12. Enter normal cache mode and reenable caching + * 13. Set PGE + * 14. Wait for buddies to catch up + * 15. Enable interrupts. + * + * What does that mean for us? Well, stop_machine() will ensure that + * the rendezvous handler is started on each CPU. And in lockstep they + * do the state transition of disabling interrupts, updating MTRR's + * (the CPU vendors may each do it differently, so we call mtrr_if->set() + * callback and let them take care of it.) and enabling interrupts. + * + * Note that the mechanism is the same for UP systems, too; all the SMP stuff + * becomes nops. + */ +static void +set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask); +} + +static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask); +} + +static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data, + cpu_callout_mask); +} + +/** + * mtrr_add_page - Add a memory type region + * @base: Physical base address of region in pages (in units of 4 kB!) + * @size: Physical size of region in pages (4 kB) + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region + * + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. + * + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. + * + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. + * + * The available types are + * + * %MTRR_TYPE_UNCACHABLE - No caching + * + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. + */ +int mtrr_add_page(unsigned long base, unsigned long size, + unsigned int type, bool increment) +{ + unsigned long lbase, lsize; + int i, replace, error; + mtrr_type ltype; + + if (!mtrr_enabled()) + return -ENXIO; + + error = mtrr_if->validate_add_page(base, size, type); + if (error) + return error; + + if (type >= MTRR_NUM_TYPES) { + pr_warn("type: %u invalid\n", type); + return -EINVAL; + } + + /* If the type is WC, check that this processor supports it */ + if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { + pr_warn("your processor doesn't support write-combining\n"); + return -ENOSYS; + } + + if (!size) { + pr_warn("zero sized request\n"); + return -EINVAL; + } + + if ((base | (base + size - 1)) >> + (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) { + pr_warn("base or size exceeds the MTRR width\n"); + return -EINVAL; + } + + error = -EINVAL; + replace = -1; + + /* No CPU hotplug when we change MTRR entries */ + get_online_cpus(); + + /* Search for existing MTRR */ + mutex_lock(&mtrr_mutex); + for (i = 0; i < num_var_ranges; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (!lsize || base > lbase + lsize - 1 || + base + size - 1 < lbase) + continue; + /* + * At this point we know there is some kind of + * overlap/enclosure + */ + if (base < lbase || base + size - 1 > lbase + lsize - 1) { + if (base <= lbase && + base + size - 1 >= lbase + lsize - 1) { + /* New region encloses an existing region */ + if (type == ltype) { + replace = replace == -1 ? i : -2; + continue; + } else if (types_compatible(type, ltype)) + continue; + } + pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase, + lsize); + goto out; + } + /* New region is enclosed by an existing region */ + if (ltype != type) { + if (types_compatible(type, ltype)) + continue; + pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n", + base, size, mtrr_attrib_to_str(ltype), + mtrr_attrib_to_str(type)); + goto out; + } + if (increment) + ++mtrr_usage_table[i]; + error = i; + goto out; + } + /* Search for an empty MTRR */ + i = mtrr_if->get_free_region(base, size, replace); + if (i >= 0) { + set_mtrr_cpuslocked(i, base, size, type); + if (likely(replace < 0)) { + mtrr_usage_table[i] = 1; + } else { + mtrr_usage_table[i] = mtrr_usage_table[replace]; + if (increment) + mtrr_usage_table[i]++; + if (unlikely(replace != i)) { + set_mtrr_cpuslocked(replace, 0, 0, 0); + mtrr_usage_table[replace] = 0; + } + } + } else { + pr_info("no more MTRRs available\n"); + } + error = i; + out: + mutex_unlock(&mtrr_mutex); + put_online_cpus(); + return error; +} + +static int mtrr_check(unsigned long base, unsigned long size) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + pr_warn("size and base must be multiples of 4 kiB\n"); + pr_debug("size: 0x%lx base: 0x%lx\n", size, base); + dump_stack(); + return -1; + } + return 0; +} + +/** + * mtrr_add - Add a memory type region + * @base: Physical base address of region + * @size: Physical size of region + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region + * + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. + * + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. + * + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. + * + * The available types are + * + * %MTRR_TYPE_UNCACHABLE - No caching + * + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. + */ +int mtrr_add(unsigned long base, unsigned long size, unsigned int type, + bool increment) +{ + if (!mtrr_enabled()) + return -ENODEV; + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + increment); +} + +/** + * mtrr_del_page - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region + * + * If register is supplied then base and size are ignored. This is + * how drivers should call it. + * + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. + */ +int mtrr_del_page(int reg, unsigned long base, unsigned long size) +{ + int i, max; + mtrr_type ltype; + unsigned long lbase, lsize; + int error = -EINVAL; + + if (!mtrr_enabled()) + return -ENODEV; + + max = num_var_ranges; + /* No CPU hotplug when we change MTRR entries */ + get_online_cpus(); + mutex_lock(&mtrr_mutex); + if (reg < 0) { + /* Search for existing MTRR */ + for (i = 0; i < max; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lbase == base && lsize == size) { + reg = i; + break; + } + } + if (reg < 0) { + pr_debug("no MTRR for %lx000,%lx000 found\n", + base, size); + goto out; + } + } + if (reg >= max) { + pr_warn("register: %d too big\n", reg); + goto out; + } + mtrr_if->get(reg, &lbase, &lsize, <ype); + if (lsize < 1) { + pr_warn("MTRR %d not used\n", reg); + goto out; + } + if (mtrr_usage_table[reg] < 1) { + pr_warn("reg: %d has count=0\n", reg); + goto out; + } + if (--mtrr_usage_table[reg] < 1) + set_mtrr_cpuslocked(reg, 0, 0, 0); + error = reg; + out: + mutex_unlock(&mtrr_mutex); + put_online_cpus(); + return error; +} + +/** + * mtrr_del - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region + * + * If register is supplied then base and size are ignored. This is + * how drivers should call it. + * + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. + */ +int mtrr_del(int reg, unsigned long base, unsigned long size) +{ + if (!mtrr_enabled()) + return -ENODEV; + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); +} + +/** + * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable + * @base: Physical base address + * @size: Size of region + * + * If PAT is available, this does nothing. If PAT is unavailable, it + * attempts to add a WC MTRR covering size bytes starting at base and + * logs an error if this fails. + * + * The called should provide a power of two size on an equivalent + * power of two boundary. + * + * Drivers must store the return value to pass to mtrr_del_wc_if_needed, + * but drivers should not try to interpret that return value. + */ +int arch_phys_wc_add(unsigned long base, unsigned long size) +{ + int ret; + + if (pat_enabled() || !mtrr_enabled()) + return 0; /* Success! (We don't need to do anything.) */ + + ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); + if (ret < 0) { + pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.", + (void *)base, (void *)(base + size - 1)); + return ret; + } + return ret + MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL(arch_phys_wc_add); + +/* + * arch_phys_wc_del - undoes arch_phys_wc_add + * @handle: Return value from arch_phys_wc_add + * + * This cleans up after mtrr_add_wc_if_needed. + * + * The API guarantees that mtrr_del_wc_if_needed(error code) and + * mtrr_del_wc_if_needed(0) do nothing. + */ +void arch_phys_wc_del(int handle) +{ + if (handle >= 1) { + WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET); + mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0); + } +} +EXPORT_SYMBOL(arch_phys_wc_del); + +/* + * arch_phys_wc_index - translates arch_phys_wc_add's return value + * @handle: Return value from arch_phys_wc_add + * + * This will turn the return value from arch_phys_wc_add into an mtrr + * index suitable for debugging. + * + * Note: There is no legitimate use for this function, except possibly + * in printk line. Alas there is an illegitimate use in some ancient + * drm ioctls. + */ +int arch_phys_wc_index(int handle) +{ + if (handle < MTRR_TO_PHYS_WC_OFFSET) + return -1; + else + return handle - MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL_GPL(arch_phys_wc_index); + +/* + * HACK ALERT! + * These should be called implicitly, but we can't yet until all the initcall + * stuff is done... + */ +static void __init init_ifs(void) +{ +#ifndef CONFIG_X86_64 + amd_init_mtrr(); + cyrix_init_mtrr(); + centaur_init_mtrr(); +#endif +} + +/* The suspend/resume methods are only for CPU without MTRR. CPU using generic + * MTRR driver doesn't require this + */ +struct mtrr_value { + mtrr_type ltype; + unsigned long lbase; + unsigned long lsize; +}; + +static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; + +static int mtrr_save(void) +{ + int i; + + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &mtrr_value[i].lbase, + &mtrr_value[i].lsize, + &mtrr_value[i].ltype); + } + return 0; +} + +static void mtrr_restore(void) +{ + int i; + + for (i = 0; i < num_var_ranges; i++) { + if (mtrr_value[i].lsize) { + set_mtrr(i, mtrr_value[i].lbase, + mtrr_value[i].lsize, + mtrr_value[i].ltype); + } + } +} + + + +static struct syscore_ops mtrr_syscore_ops = { + .suspend = mtrr_save, + .resume = mtrr_restore, +}; + +int __initdata changed_by_mtrr_cleanup; + +#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1)) +/** + * mtrr_bp_init - initialize mtrrs on the boot CPU + * + * This needs to be called early; before any of the other CPUs are + * initialized (i.e. before smp_init()). + * + */ +void __init mtrr_bp_init(void) +{ + u32 phys_addr; + + init_ifs(); + + phys_addr = 32; + + if (boot_cpu_has(X86_FEATURE_MTRR)) { + mtrr_if = &generic_mtrr_ops; + size_or_mask = SIZE_OR_MASK_BITS(36); + size_and_mask = 0x00f00000; + phys_addr = 36; + + /* + * This is an AMD specific MSR, but we assume(hope?) that + * Intel will implement it too when they extend the address + * bus of the Xeon. + */ + if (cpuid_eax(0x80000000) >= 0x80000008) { + phys_addr = cpuid_eax(0x80000008) & 0xff; + /* CPUID workaround for Intel 0F33/0F34 CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 0xF && + boot_cpu_data.x86_model == 0x3 && + (boot_cpu_data.x86_stepping == 0x3 || + boot_cpu_data.x86_stepping == 0x4)) + phys_addr = 36; + + size_or_mask = SIZE_OR_MASK_BITS(phys_addr); + size_and_mask = ~size_or_mask & 0xfffff00000ULL; + } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && + boot_cpu_data.x86 == 6) { + /* + * VIA C* family have Intel style MTRRs, + * but don't support PAE + */ + size_or_mask = SIZE_OR_MASK_BITS(32); + size_and_mask = 0; + phys_addr = 32; + } + } else { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) { + /* Pre-Athlon (K6) AMD CPU MTRRs */ + mtrr_if = mtrr_ops[X86_VENDOR_AMD]; + size_or_mask = SIZE_OR_MASK_BITS(32); + size_and_mask = 0; + } + break; + case X86_VENDOR_CENTAUR: + if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) { + mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR]; + size_or_mask = SIZE_OR_MASK_BITS(32); + size_and_mask = 0; + } + break; + case X86_VENDOR_CYRIX: + if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) { + mtrr_if = mtrr_ops[X86_VENDOR_CYRIX]; + size_or_mask = SIZE_OR_MASK_BITS(32); + size_and_mask = 0; + } + break; + default: + break; + } + } + + if (mtrr_if) { + __mtrr_enabled = true; + set_num_var_ranges(); + init_table(); + if (use_intel()) { + /* BIOS may override */ + __mtrr_enabled = get_mtrr_state(); + + if (mtrr_enabled()) + mtrr_bp_pat_init(); + + if (mtrr_cleanup(phys_addr)) { + changed_by_mtrr_cleanup = 1; + mtrr_if->set_all(); + } + } + } + + if (!mtrr_enabled()) { + pr_info("Disabled\n"); + + /* + * PAT initialization relies on MTRR's rendezvous handler. + * Skip PAT init until the handler can initialize both + * features independently. + */ + pat_disable("MTRRs disabled, skipping PAT initialization too."); + } +} + +void mtrr_ap_init(void) +{ + if (!mtrr_enabled()) + return; + + if (!use_intel() || mtrr_aps_delayed_init) + return; + + /* + * Ideally we should hold mtrr_mutex here to avoid mtrr entries + * changed, but this routine will be called in cpu boot time, + * holding the lock breaks it. + * + * This routine is called in two cases: + * + * 1. very earily time of software resume, when there absolutely + * isn't mtrr entry changes; + * + * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug + * lock to prevent mtrr entry changes + */ + set_mtrr_from_inactive_cpu(~0U, 0, 0, 0); +} + +/** + * Save current fixed-range MTRR state of the first cpu in cpu_online_mask. + */ +void mtrr_save_state(void) +{ + int first_cpu; + + if (!mtrr_enabled()) + return; + + first_cpu = cpumask_first(cpu_online_mask); + smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); +} + +void set_mtrr_aps_delayed_init(void) +{ + if (!mtrr_enabled()) + return; + if (!use_intel()) + return; + + mtrr_aps_delayed_init = true; +} + +/* + * Delayed MTRR initialization for all AP's + */ +void mtrr_aps_init(void) +{ + if (!use_intel() || !mtrr_enabled()) + return; + + /* + * Check if someone has requested the delay of AP MTRR initialization, + * by doing set_mtrr_aps_delayed_init(), prior to this point. If not, + * then we are done. + */ + if (!mtrr_aps_delayed_init) + return; + + set_mtrr(~0U, 0, 0, 0); + mtrr_aps_delayed_init = false; +} + +void mtrr_bp_restore(void) +{ + if (!use_intel() || !mtrr_enabled()) + return; + + mtrr_if->set_all(); +} + +static int __init mtrr_init_finialize(void) +{ + if (!mtrr_enabled()) + return 0; + + if (use_intel()) { + if (!changed_by_mtrr_cleanup) + mtrr_state_warn(); + return 0; + } + + /* + * The CPU has no MTRR and seems to not support SMP. They have + * specific drivers, we use a tricky method to support + * suspend/resume for them. + * + * TBD: is there any system with such CPU which supports + * suspend/resume? If no, we should remove the code. + */ + register_syscore_ops(&mtrr_syscore_ops); + + return 0; +} +subsys_initcall(mtrr_init_finialize); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h new file mode 100644 index 000000000..2ac99e561 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * local MTRR defines. + */ + +#include +#include + +#define MTRR_CHANGE_MASK_FIXED 0x01 +#define MTRR_CHANGE_MASK_VARIABLE 0x02 +#define MTRR_CHANGE_MASK_DEFTYPE 0x04 + +extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; + +struct mtrr_ops { + u32 vendor; + u32 use_intel_if; + void (*set)(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type); + void (*set_all)(void); + + void (*get)(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type *type); + int (*get_free_region)(unsigned long base, unsigned long size, + int replace_reg); + int (*validate_add_page)(unsigned long base, unsigned long size, + unsigned int type); + int (*have_wrcomb)(void); +}; + +extern int generic_get_free_region(unsigned long base, unsigned long size, + int replace_reg); +extern int generic_validate_add_page(unsigned long base, unsigned long size, + unsigned int type); + +extern const struct mtrr_ops generic_mtrr_ops; + +extern int positive_have_wrcomb(void); + +/* library functions for processor-specific routines */ +struct set_mtrr_context { + unsigned long flags; + unsigned long cr4val; + u32 deftype_lo; + u32 deftype_hi; + u32 ccr3; +}; + +void set_mtrr_done(struct set_mtrr_context *ctxt); +void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); +void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); + +void fill_mtrr_var_range(unsigned int index, + u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); +bool get_mtrr_state(void); +void mtrr_bp_pat_init(void); + +extern void __init set_mtrr_ops(const struct mtrr_ops *ops); + +extern u64 size_or_mask, size_and_mask; +extern const struct mtrr_ops *mtrr_if; + +#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) +#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) + +extern unsigned int num_var_ranges; +extern u64 mtrr_tom2; +extern struct mtrr_state_type mtrr_state; + +void mtrr_state_warn(void); +const char *mtrr_attrib_to_str(int x); +void mtrr_wrmsr(unsigned, unsigned, unsigned); + +/* CPU specific mtrr init functions */ +int amd_init_mtrr(void); +int cyrix_init_mtrr(void); +int centaur_init_mtrr(void); + +extern int changed_by_mtrr_cleanup; +extern int mtrr_cleanup(unsigned address_bits); diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c new file mode 100644 index 000000000..a548d9104 --- /dev/null +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * local apic based NMI watchdog for various CPUs. + * + * This file also handles reservation of performance counters for coordination + * with other users (like oprofile). + * + * Note that these events normally don't tick when the CPU idles. This means + * the frequency varies with CPU load. + * + * Original code for K7/P6 written by Keith Owens + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. + * + * It will be the max for all platforms (for now) + */ +#define NMI_MAX_COUNTER_BITS 66 + +/* + * perfctr_nmi_owner tracks the ownership of the perfctr registers: + * evtsel_nmi_owner tracks the ownership of the event selection + * - different performance counters/ event selection may be reserved for + * different subsystems this reservation system just tries to coordinate + * things a little + */ +static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); +static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); + +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the performance counter register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_HYGON: + case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTR) + return (msr - MSR_F15H_PERF_CTR) >> 1; + return msr - MSR_K7_PERFCTR0; + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return msr - MSR_ARCH_PERFMON_PERFCTR0; + + switch (boot_cpu_data.x86) { + case 6: + return msr - MSR_P6_PERFCTR0; + case 11: + return msr - MSR_KNC_PERFCTR0; + case 15: + return msr - MSR_P4_BPU_PERFCTR0; + } + break; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_PERFCTR0; + } + return 0; +} + +/* + * converts an msr to an appropriate reservation bit + * returns the bit offset of the event selection register + */ +static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the event selection register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_HYGON: + case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTL) + return (msr - MSR_F15H_PERF_CTL) >> 1; + return msr - MSR_K7_EVNTSEL0; + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return msr - MSR_ARCH_PERFMON_EVENTSEL0; + + switch (boot_cpu_data.x86) { + case 6: + return msr - MSR_P6_EVNTSEL0; + case 11: + return msr - MSR_KNC_EVNTSEL0; + case 15: + return msr - MSR_P4_BSU_ESCR0; + } + break; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_EVENTSEL0; + } + return 0; + +} + +/* checks for a bit availability (hack for oprofile) */ +int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) +{ + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return !test_bit(counter, perfctr_nmi_owner); +} +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); + +int reserve_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + /* register not managed by the allocator? */ + if (counter > NMI_MAX_COUNTER_BITS) + return 1; + + if (!test_and_set_bit(counter, perfctr_nmi_owner)) + return 1; + return 0; +} +EXPORT_SYMBOL(reserve_perfctr_nmi); + +void release_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + /* register not managed by the allocator? */ + if (counter > NMI_MAX_COUNTER_BITS) + return; + + clear_bit(counter, perfctr_nmi_owner); +} +EXPORT_SYMBOL(release_perfctr_nmi); + +int reserve_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + /* register not managed by the allocator? */ + if (counter > NMI_MAX_COUNTER_BITS) + return 1; + + if (!test_and_set_bit(counter, evntsel_nmi_owner)) + return 1; + return 0; +} +EXPORT_SYMBOL(reserve_evntsel_nmi); + +void release_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + /* register not managed by the allocator? */ + if (counter > NMI_MAX_COUNTER_BITS) + return; + + clear_bit(counter, evntsel_nmi_owner); +} +EXPORT_SYMBOL(release_evntsel_nmi); diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c new file mode 100644 index 000000000..fd6ec2aa0 --- /dev/null +++ b/arch/x86/kernel/cpu/powerflags.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Strings for the various x86 power flags + * + * This file must not contain any executable code. + */ + +#include + +const char *const x86_power_flags[32] = { + "ts", /* temperature sensor */ + "fid", /* frequency id control */ + "vid", /* voltage id control */ + "ttp", /* thermal trip */ + "tm", /* hardware thermal control */ + "stc", /* software thermal control */ + "100mhzsteps", /* 100 MHz multiplier control */ + "hwpstate", /* hardware P-state control */ + "", /* tsc invariant mapped to constant_tsc */ + "cpb", /* core performance boost */ + "eff_freq_ro", /* Readonly aperf/mperf */ + "proc_feedback", /* processor feedback interface */ + "acc_power", /* accumulated power mechanism */ +}; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c new file mode 100644 index 000000000..4eec8889b --- /dev/null +++ b/arch/x86/kernel/cpu/proc.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include "cpu.h" + +#ifdef CONFIG_X86_VMX_FEATURE_NAMES +extern const char * const x86_vmx_flags[NVMXINTS*32]; +#endif + +/* + * Get CPU information for use by the procfs. + */ +static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, + unsigned int cpu) +{ +#ifdef CONFIG_SMP + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); + seq_printf(m, "siblings\t: %d\n", + cpumask_weight(topology_core_cpumask(cpu))); + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); + seq_printf(m, "apicid\t\t: %d\n", c->apicid); + seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); +#endif +} + +#ifdef CONFIG_X86_32 +static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) +{ + seq_printf(m, + "fdiv_bug\t: %s\n" + "f00f_bug\t: %s\n" + "coma_bug\t: %s\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "wp\t\t: yes\n", + boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + c->cpuid_level); +} +#else +static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) +{ + seq_printf(m, + "fpu\t\t: yes\n" + "fpu_exception\t: yes\n" + "cpuid level\t: %d\n" + "wp\t\t: yes\n", + c->cpuid_level); +} +#endif + +static int show_cpuinfo(struct seq_file *m, void *v) +{ + struct cpuinfo_x86 *c = v; + unsigned int cpu; + int i; + + cpu = c->cpu_index; + seq_printf(m, "processor\t: %u\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %u\n" + "model name\t: %s\n", + cpu, + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", + c->x86, + c->x86_model, + c->x86_model_id[0] ? c->x86_model_id : "unknown"); + + if (c->x86_stepping || c->cpuid_level >= 0) + seq_printf(m, "stepping\t: %d\n", c->x86_stepping); + else + seq_puts(m, "stepping\t: unknown\n"); + if (c->microcode) + seq_printf(m, "microcode\t: 0x%x\n", c->microcode); + + if (cpu_has(c, X86_FEATURE_TSC)) { + unsigned int freq = aperfmperf_get_khz(cpu); + + if (!freq) + freq = cpufreq_quick_get(cpu); + if (!freq) + freq = cpu_khz; + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", + freq / 1000, (freq % 1000)); + } + + /* Cache size */ + if (c->x86_cache_size) + seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size); + + show_cpuinfo_core(m, c, cpu); + show_cpuinfo_misc(m, c); + + seq_puts(m, "flags\t\t:"); + for (i = 0; i < 32*NCAPINTS; i++) + if (cpu_has(c, i) && x86_cap_flags[i] != NULL) + seq_printf(m, " %s", x86_cap_flags[i]); + +#ifdef CONFIG_X86_VMX_FEATURE_NAMES + if (cpu_has(c, X86_FEATURE_VMX) && c->vmx_capability[0]) { + seq_puts(m, "\nvmx flags\t:"); + for (i = 0; i < 32*NVMXINTS; i++) { + if (test_bit(i, (unsigned long *)c->vmx_capability) && + x86_vmx_flags[i] != NULL) + seq_printf(m, " %s", x86_vmx_flags[i]); + } + } +#endif + + seq_puts(m, "\nbugs\t\t:"); + for (i = 0; i < 32*NBUGINTS; i++) { + unsigned int bug_bit = 32*NCAPINTS + i; + + if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i]) + seq_printf(m, " %s", x86_bug_flags[i]); + } + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), + (c->loops_per_jiffy/(5000/HZ)) % 100); + +#ifdef CONFIG_X86_64 + if (c->x86_tlbsize > 0) + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); +#endif + seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); + seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", + c->x86_phys_bits, c->x86_virt_bits); + + seq_puts(m, "power management:"); + for (i = 0; i < 32; i++) { + if (c->x86_power & (1 << i)) { + if (i < ARRAY_SIZE(x86_power_flags) && + x86_power_flags[i]) + seq_printf(m, "%s%s", + x86_power_flags[i][0] ? " " : "", + x86_power_flags[i]); + else + seq_printf(m, " [%d]", i); + } + } + + seq_puts(m, "\n\n"); + + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + *pos = cpumask_next(*pos - 1, cpu_online_mask); + if ((*pos) < nr_cpu_ids) + return &cpu_data(*pos); + return NULL; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return c_start(m, pos); +} + +static void c_stop(struct seq_file *m, void *v) +{ +} + +const struct seq_operations cpuinfo_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_cpuinfo, +}; diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c new file mode 100644 index 000000000..c4be62058 --- /dev/null +++ b/arch/x86/kernel/cpu/rdrand.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2011, Intel Corporation + * Authors: Fenghua Yu , + * H. Peter Anvin + */ + +#include +#include +#include + +static int __init x86_rdrand_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_RDRAND); + setup_clear_cpu_cap(X86_FEATURE_RDSEED); + return 1; +} +__setup("nordrand", x86_rdrand_setup); + +/* + * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation. + * Run the instruction a few times as a sanity check. + * If it fails, it is simple to disable RDRAND here. + */ +#define SANITY_CHECK_LOOPS 8 + +#ifdef CONFIG_ARCH_RANDOM +void x86_init_rdrand(struct cpuinfo_x86 *c) +{ + unsigned int changed = 0; + unsigned long tmp, prev; + int i; + + if (!cpu_has(c, X86_FEATURE_RDRAND)) + return; + + for (i = 0; i < SANITY_CHECK_LOOPS; i++) { + if (!rdrand_long(&tmp)) { + clear_cpu_cap(c, X86_FEATURE_RDRAND); + pr_warn_once("rdrand: disabled\n"); + return; + } + } + + /* + * Stupid sanity-check whether RDRAND does *actually* generate + * some at least random-looking data. + */ + prev = tmp; + for (i = 0; i < SANITY_CHECK_LOOPS; i++) { + if (rdrand_long(&tmp)) { + if (prev != tmp) + changed++; + + prev = tmp; + } + } + + if (WARN_ON_ONCE(!changed)) + pr_emerg( +"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\""); + +} +#endif diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile new file mode 100644 index 000000000..4a06c37b9 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o +obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o pseudo_lock.o +CFLAGS_pseudo_lock.o = -I$(src) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c new file mode 100644 index 000000000..4ccb9039f --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -0,0 +1,1047 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu + * Tony Luck + * Vikas Shivappa + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) "resctrl: " fmt + +#include +#include +#include +#include + +#include +#include +#include "internal.h" + +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + +/* + * The cached resctrl_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. Functions which modify the state + * are called with interrupts disabled and no preemption, which + * is sufficient for the protection. + */ +DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state); + +/* + * Used to store the max resource name width and max resource data width + * to display the schemata in a tabular format + */ +int max_name_width, max_data_width; + +/* + * Global boolean for rdt_alloc which is true if any + * resource allocation is enabled. + */ +bool rdt_alloc_capable; + +static void +mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); +static void +mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + +#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) + +struct rdt_resource rdt_resources_all[] = { + [RDT_RESOURCE_L3] = + { + .rid = RDT_RESOURCE_L3, + .name = "L3", + .domains = domain_init(RDT_RESOURCE_L3), + .msr_base = MSR_IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, + [RDT_RESOURCE_L3DATA] = + { + .rid = RDT_RESOURCE_L3DATA, + .name = "L3DATA", + .domains = domain_init(RDT_RESOURCE_L3DATA), + .msr_base = MSR_IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, + [RDT_RESOURCE_L3CODE] = + { + .rid = RDT_RESOURCE_L3CODE, + .name = "L3CODE", + .domains = domain_init(RDT_RESOURCE_L3CODE), + .msr_base = MSR_IA32_L3_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 3, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 1, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, + [RDT_RESOURCE_L2] = + { + .rid = RDT_RESOURCE_L2, + .name = "L2", + .domains = domain_init(RDT_RESOURCE_L2), + .msr_base = MSR_IA32_L2_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 1, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, + [RDT_RESOURCE_L2DATA] = + { + .rid = RDT_RESOURCE_L2DATA, + .name = "L2DATA", + .domains = domain_init(RDT_RESOURCE_L2DATA), + .msr_base = MSR_IA32_L2_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, + [RDT_RESOURCE_L2CODE] = + { + .rid = RDT_RESOURCE_L2CODE, + .name = "L2CODE", + .domains = domain_init(RDT_RESOURCE_L2CODE), + .msr_base = MSR_IA32_L2_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 1, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, + [RDT_RESOURCE_MBA] = + { + .rid = RDT_RESOURCE_MBA, + .name = "MB", + .domains = domain_init(RDT_RESOURCE_MBA), + .cache_level = 3, + .parse_ctrlval = parse_bw, + .format_str = "%d=%*u", + .fflags = RFTYPE_RES_MB, + }, +}; + +static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) +{ + return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset; +} + +/* + * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs + * as they do not have CPUID enumeration support for Cache allocation. + * The check for Vendor/Family/Model is not enough to guarantee that + * the MSRs won't #GP fault because only the following SKUs support + * CAT: + * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz + * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz + * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz + * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz + * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz + * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz + * + * Probe by trying to write the first of the L3 cach mask registers + * and checking that the bits stick. Max CLOSids is always 4 and max cbm length + * is always 20 on hsw server parts. The minimum cache bitmask length + * allowed for HSW server is always 2 bits. Hardcode all of them. + */ +static inline void cache_alloc_hsw_probe(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + u32 l, h, max_cbm = BIT_MASK(20) - 1; + + if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0)) + return; + + rdmsr(MSR_IA32_L3_CBM_BASE, l, h); + + /* If all the bits were set in MSR, return success */ + if (l != max_cbm) + return; + + r->num_closid = 4; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.shareable_bits = 0xc0000; + r->cache.min_cbm_bits = 2; + r->alloc_capable = true; + r->alloc_enabled = true; + + rdt_alloc_capable = true; +} + +bool is_mba_sc(struct rdt_resource *r) +{ + if (!r) + return rdt_resources_all[RDT_RESOURCE_MBA].membw.mba_sc; + + return r->membw.mba_sc; +} + +/* + * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values + * exposed to user interface and the h/w understandable delay values. + * + * The non-linear delay values have the granularity of power of two + * and also the h/w does not guarantee a curve for configured delay + * values vs. actual b/w enforced. + * Hence we need a mapping that is pre calibrated so the user can + * express the memory b/w as a percentage value. + */ +static inline bool rdt_get_mb_table(struct rdt_resource *r) +{ + /* + * There are no Intel SKUs as of now to support non-linear delay. + */ + pr_info("MBA b/w map not implemented for cpu:%d, model:%d", + boot_cpu_data.x86, boot_cpu_data.x86_model); + + return false; +} + +static bool __get_mem_config_intel(struct rdt_resource *r) +{ + union cpuid_0x10_3_eax eax; + union cpuid_0x10_x_edx edx; + u32 ebx, ecx, max_delay; + + cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); + r->num_closid = edx.split.cos_max + 1; + max_delay = eax.split.max_delay + 1; + r->default_ctrl = MAX_MBA_BW; + r->membw.arch_needs_linear = true; + if (ecx & MBA_IS_LINEAR) { + r->membw.delay_linear = true; + r->membw.min_bw = MAX_MBA_BW - max_delay; + r->membw.bw_gran = MAX_MBA_BW - max_delay; + } else { + if (!rdt_get_mb_table(r)) + return false; + r->membw.arch_needs_linear = false; + } + r->data_width = 3; + + if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) + r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; + else + r->membw.throttle_mode = THREAD_THROTTLE_MAX; + thread_throttle_mode_init(); + + r->alloc_capable = true; + r->alloc_enabled = true; + + return true; +} + +static bool __rdt_get_mem_config_amd(struct rdt_resource *r) +{ + union cpuid_0x10_3_eax eax; + union cpuid_0x10_x_edx edx; + u32 ebx, ecx; + + cpuid_count(0x80000020, 1, &eax.full, &ebx, &ecx, &edx.full); + r->num_closid = edx.split.cos_max + 1; + r->default_ctrl = MAX_MBA_BW_AMD; + + /* AMD does not use delay */ + r->membw.delay_linear = false; + r->membw.arch_needs_linear = false; + + /* + * AMD does not use memory delay throttle model to control + * the allocation like Intel does. + */ + r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->membw.min_bw = 0; + r->membw.bw_gran = 1; + /* Max value is 2048, Data width should be 4 in decimal */ + r->data_width = 4; + + r->alloc_capable = true; + r->alloc_enabled = true; + + return true; +} + +static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) +{ + union cpuid_0x10_1_eax eax; + union cpuid_0x10_x_edx edx; + u32 ebx, ecx; + + cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); + r->num_closid = edx.split.cos_max + 1; + r->cache.cbm_len = eax.split.cbm_len + 1; + r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.shareable_bits = ebx & r->default_ctrl; + r->data_width = (r->cache.cbm_len + 3) / 4; + r->alloc_capable = true; + r->alloc_enabled = true; +} + +static void rdt_get_cdp_config(int level, int type) +{ + struct rdt_resource *r_l = &rdt_resources_all[level]; + struct rdt_resource *r = &rdt_resources_all[type]; + + r->num_closid = r_l->num_closid / 2; + r->cache.cbm_len = r_l->cache.cbm_len; + r->default_ctrl = r_l->default_ctrl; + r->cache.shareable_bits = r_l->cache.shareable_bits; + r->data_width = (r->cache.cbm_len + 3) / 4; + r->alloc_capable = true; + /* + * By default, CDP is disabled. CDP can be enabled by mount parameter + * "cdp" during resctrl file system mount time. + */ + r->alloc_enabled = false; +} + +static void rdt_get_cdp_l3_config(void) +{ + rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA); + rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3CODE); +} + +static void rdt_get_cdp_l2_config(void) +{ + rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA); + rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE); +} + +static void +mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + + for (i = m->low; i < m->high; i++) + wrmsrl(r->msr_base + i, d->ctrl_val[i]); +} + +/* + * Map the memory b/w percentage value to delay values + * that can be written to QOS_MSRs. + * There are currently no SKUs which support non linear delay values. + */ +u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) +{ + if (r->membw.delay_linear) + return MAX_MBA_BW - bw; + + pr_warn_once("Non Linear delay-bw map not supported but queried\n"); + return r->default_ctrl; +} + +static void +mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r) +{ + unsigned int i; + + /* Write the delay values for mba. */ + for (i = m->low; i < m->high; i++) + wrmsrl(r->msr_base + i, delay_bw_map(d->ctrl_val[i], r)); +} + +static void +cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +{ + unsigned int i; + + for (i = m->low; i < m->high; i++) + wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); +} + +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + return d; + } + + return NULL; +} + +void rdt_ctrl_update(void *arg) +{ + struct msr_param *m = arg; + struct rdt_resource *r = m->res; + int cpu = smp_processor_id(); + struct rdt_domain *d; + + d = get_domain_from_cpu(cpu, r); + if (d) { + r->msr_update(d, m, r); + return; + } + pr_warn_once("cpu %d not found in any domain for resource %s\n", + cpu, r->name); +} + +/* + * rdt_find_domain - Find a domain in a resource that matches input resource id + * + * Search resource r's domain list to find the resource id. If the resource + * id is found in a domain, return the domain. Otherwise, if requested by + * caller, return the first domain whose id is bigger than the input id. + * The domain list is sorted by id in ascending order. + */ +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) +{ + struct rdt_domain *d; + struct list_head *l; + + if (id < 0) + return ERR_PTR(-ENODEV); + + list_for_each(l, &r->domains) { + d = list_entry(l, struct rdt_domain, list); + /* When id is found, return its domain. */ + if (id == d->id) + return d; + /* Stop searching when finding id's position in sorted list. */ + if (id < d->id) + break; + } + + if (pos) + *pos = l; + + return NULL; +} + +void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) +{ + int i; + + /* + * Initialize the Control MSRs to having no control. + * For Cache Allocation: Set all bits in cbm + * For Memory Allocation: Set b/w requested to 100% + * and the bandwidth in MBps to U32_MAX + */ + for (i = 0; i < r->num_closid; i++, dc++, dm++) { + *dc = r->default_ctrl; + *dm = MBA_MAX_MBPS; + } +} + +static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) +{ + struct msr_param m; + u32 *dc, *dm; + + dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); + if (!dc) + return -ENOMEM; + + dm = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL); + if (!dm) { + kfree(dc); + return -ENOMEM; + } + + d->ctrl_val = dc; + d->mbps_val = dm; + setup_default_ctrlval(r, dc, dm); + + m.low = 0; + m.high = r->num_closid; + r->msr_update(d, &m, r); + return 0; +} + +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + size_t tsize; + + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + bitmap_free(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + } + + return 0; +} + +/* + * domain_add_cpu - Add a cpu to a resource's domain list. + * + * If an existing domain in the resource r's domain list matches the cpu's + * resource id, add the cpu in the domain. + * + * Otherwise, a new domain is allocated and inserted into the right position + * in the domain list sorted by id in ascending order. + * + * The order in the domain list is visible to users when we print entries + * in the schemata file and schemata input is validated to have the same order + * as this list. + */ +static void domain_add_cpu(int cpu, struct rdt_resource *r) +{ + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); + struct list_head *add_pos = NULL; + struct rdt_domain *d; + + d = rdt_find_domain(r, id, &add_pos); + if (IS_ERR(d)) { + pr_warn("Couldn't find cache id for CPU %d\n", cpu); + return; + } + + if (d) { + cpumask_set_cpu(cpu, &d->cpu_mask); + if (r->cache.arch_has_per_cpu_cfg) + rdt_domain_reconfigure_cdp(r); + return; + } + + d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu)); + if (!d) + return; + + d->id = id; + cpumask_set_cpu(cpu, &d->cpu_mask); + + rdt_domain_reconfigure_cdp(r); + + if (r->alloc_capable && domain_setup_ctrlval(r, d)) { + kfree(d); + return; + } + + if (r->mon_capable && domain_setup_mon_state(r, d)) { + kfree(d->ctrl_val); + kfree(d->mbps_val); + kfree(d); + return; + } + + list_add_tail(&d->list, add_pos); + + /* + * If resctrl is mounted, add + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); +} + +static void domain_remove_cpu(int cpu, struct rdt_resource *r) +{ + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); + struct rdt_domain *d; + + d = rdt_find_domain(r, id, NULL); + if (IS_ERR_OR_NULL(d)) { + pr_warn("Couldn't find cache id for CPU %d\n", cpu); + return; + } + + cpumask_clear_cpu(cpu, &d->cpu_mask); + if (cpumask_empty(&d->cpu_mask)) { + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); + list_del(&d->list); + if (r->mon_capable && is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. + */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + + /* + * rdt_domain "d" is going to be freed below, so clear + * its pointer from pseudo_lock_region struct. + */ + if (d->plr) + d->plr->d = NULL; + + kfree(d->ctrl_val); + kfree(d->mbps_val); + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); + kfree(d); + return; + } + + if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(r, d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0); + } + } +} + +static void clear_closid_rmid(int cpu) +{ + struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); + + state->default_closid = 0; + state->default_rmid = 0; + state->cur_closid = 0; + state->cur_rmid = 0; + wrmsr(IA32_PQR_ASSOC, 0, 0); +} + +static int resctrl_online_cpu(unsigned int cpu) +{ + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + for_each_capable_rdt_resource(r) + domain_add_cpu(cpu, r); + /* The cpu is set in default rdtgroup after online. */ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + clear_closid_rmid(cpu); + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { + break; + } + } +} + +static int resctrl_offline_cpu(unsigned int cpu) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + for_each_capable_rdt_resource(r) + domain_remove_cpu(cpu, r); + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); + break; + } + } + clear_closid_rmid(cpu); + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +/* + * Choose a width for the resource name and resource data based on the + * resource that has widest name and cbm. + */ +static __init void rdt_init_padding(void) +{ + struct rdt_resource *r; + int cl; + + for_each_alloc_capable_rdt_resource(r) { + cl = strlen(r->name); + if (cl > max_name_width) + max_name_width = cl; + + if (r->data_width > max_data_width) + max_data_width = r->data_width; + } +} + +enum { + RDT_FLAG_CMT, + RDT_FLAG_MBM_TOTAL, + RDT_FLAG_MBM_LOCAL, + RDT_FLAG_L3_CAT, + RDT_FLAG_L3_CDP, + RDT_FLAG_L2_CAT, + RDT_FLAG_L2_CDP, + RDT_FLAG_MBA, +}; + +#define RDT_OPT(idx, n, f) \ +[idx] = { \ + .name = n, \ + .flag = f \ +} + +struct rdt_options { + char *name; + int flag; + bool force_off, force_on; +}; + +static struct rdt_options rdt_options[] __initdata = { + RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), + RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), + RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), + RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), + RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3), + RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), + RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2), + RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), +}; +#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) + +static int __init set_rdt_options(char *str) +{ + struct rdt_options *o; + bool force_off; + char *tok; + + if (*str == '=') + str++; + while ((tok = strsep(&str, ",")) != NULL) { + force_off = *tok == '!'; + if (force_off) + tok++; + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (strcmp(tok, o->name) == 0) { + if (force_off) + o->force_off = true; + else + o->force_on = true; + break; + } + } + } + return 1; +} +__setup("rdt", set_rdt_options); + +static bool __init rdt_cpu_has(int flag) +{ + bool ret = boot_cpu_has(flag); + struct rdt_options *o; + + if (!ret) + return ret; + + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (flag == o->flag) { + if (o->force_off) + ret = false; + if (o->force_on) + ret = true; + break; + } + } + return ret; +} + +static __init bool get_mem_config(void) +{ + if (!rdt_cpu_has(X86_FEATURE_MBA)) + return false; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return __get_mem_config_intel(&rdt_resources_all[RDT_RESOURCE_MBA]); + else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return __rdt_get_mem_config_amd(&rdt_resources_all[RDT_RESOURCE_MBA]); + + return false; +} + +static __init bool get_rdt_alloc_resources(void) +{ + bool ret = false; + + if (rdt_alloc_capable) + return true; + + if (!boot_cpu_has(X86_FEATURE_RDT_A)) + return false; + + if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (rdt_cpu_has(X86_FEATURE_CDP_L3)) + rdt_get_cdp_l3_config(); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { + /* CPUID 0x10.2 fields are same format at 0x10.1 */ + rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); + if (rdt_cpu_has(X86_FEATURE_CDP_L2)) + rdt_get_cdp_l2_config(); + ret = true; + } + + if (get_mem_config()) + ret = true; + + return ret; +} + +static __init bool get_rdt_mon_resources(void) +{ + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + + if (!rdt_mon_features) + return false; + + return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); +} + +static __init void __check_quirks_intel(void) +{ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_HASWELL_X: + if (!rdt_options[RDT_FLAG_L3_CAT].force_off) + cache_alloc_hsw_probe(); + break; + case INTEL_FAM6_SKYLAKE_X: + if (boot_cpu_data.x86_stepping <= 4) + set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); + else + set_rdt_options("!l3cat"); + } +} + +static __init void check_quirks(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + __check_quirks_intel(); +} + +static __init bool get_rdt_resources(void) +{ + rdt_alloc_capable = get_rdt_alloc_resources(); + rdt_mon_capable = get_rdt_mon_resources(); + + return (rdt_mon_capable || rdt_alloc_capable); +} + +static __init void rdt_init_res_defs_intel(void) +{ + struct rdt_resource *r; + + for_each_rdt_resource(r) { + if (r->rid == RDT_RESOURCE_L3 || + r->rid == RDT_RESOURCE_L3DATA || + r->rid == RDT_RESOURCE_L3CODE || + r->rid == RDT_RESOURCE_L2 || + r->rid == RDT_RESOURCE_L2DATA || + r->rid == RDT_RESOURCE_L2CODE) { + r->cache.arch_has_sparse_bitmaps = false; + r->cache.arch_has_empty_bitmaps = false; + r->cache.arch_has_per_cpu_cfg = false; + } else if (r->rid == RDT_RESOURCE_MBA) { + r->msr_base = MSR_IA32_MBA_THRTL_BASE; + r->msr_update = mba_wrmsr_intel; + } + } +} + +static __init void rdt_init_res_defs_amd(void) +{ + struct rdt_resource *r; + + for_each_rdt_resource(r) { + if (r->rid == RDT_RESOURCE_L3 || + r->rid == RDT_RESOURCE_L3DATA || + r->rid == RDT_RESOURCE_L3CODE || + r->rid == RDT_RESOURCE_L2 || + r->rid == RDT_RESOURCE_L2DATA || + r->rid == RDT_RESOURCE_L2CODE) { + r->cache.arch_has_sparse_bitmaps = true; + r->cache.arch_has_empty_bitmaps = true; + r->cache.arch_has_per_cpu_cfg = true; + } else if (r->rid == RDT_RESOURCE_MBA) { + r->msr_base = MSR_IA32_MBA_BW_BASE; + r->msr_update = mba_wrmsr_amd; + } + } +} + +static __init void rdt_init_res_defs(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + rdt_init_res_defs_intel(); + else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + rdt_init_res_defs_amd(); +} + +static enum cpuhp_state rdt_online; + +/* Runs once on the BSP during boot. */ +void resctrl_cpu_detect(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + c->x86_cache_mbm_width_offset = -1; + return; + } + + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); + + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); + + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + c->x86_cache_mbm_width_offset = eax & 0xff; + + if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset) + c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD; + } +} + +static int __init resctrl_late_init(void) +{ + struct rdt_resource *r; + int state, ret; + + /* + * Initialize functions(or definitions) that are different + * between vendors here. + */ + rdt_init_res_defs(); + + check_quirks(); + + if (!get_rdt_resources()) + return -ENODEV; + + rdt_init_padding(); + + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/resctrl/cat:online:", + resctrl_online_cpu, resctrl_offline_cpu); + if (state < 0) + return state; + + ret = rdtgroup_init(); + if (ret) { + cpuhp_remove_state(state); + return ret; + } + rdt_online = state; + + for_each_alloc_capable_rdt_resource(r) + pr_info("%s allocation detected\n", r->name); + + for_each_mon_capable_rdt_resource(r) + pr_info("%s monitoring detected\n", r->name); + + return 0; +} + +late_initcall(resctrl_late_init); + +static void __exit resctrl_exit(void) +{ + cpuhp_remove_state(rdt_online); + rdtgroup_exit(); +} + +__exitcall(resctrl_exit); diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c new file mode 100644 index 000000000..c877642e8 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -0,0 +1,490 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu + * Tony Luck + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include "internal.h" + +/* + * Check whether MBA bandwidth percentage value is correct. The value is + * checked against the minimum and max bandwidth values specified by the + * hardware. The allocated bandwidth percentage is rounded to the next + * control step available on the hardware. + */ +static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) +{ + unsigned long bw; + int ret; + + /* + * Only linear delay values is supported for current Intel SKUs. + */ + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { + rdt_last_cmd_puts("No support for non-linear MB domains\n"); + return false; + } + + ret = kstrtoul(buf, 10, &bw); + if (ret) { + rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); + return false; + } + + if ((bw < r->membw.min_bw || bw > r->default_ctrl) && + !is_mba_sc(r)) { + rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, + r->membw.min_bw, r->default_ctrl); + return false; + } + + *data = roundup(bw, (unsigned long)r->membw.bw_gran); + return true; +} + +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) +{ + unsigned long bw_val; + + if (d->have_new_ctrl) { + rdt_last_cmd_printf("Duplicate domain %d\n", d->id); + return -EINVAL; + } + + if (!bw_validate(data->buf, &bw_val, r)) + return -EINVAL; + d->new_ctrl = bw_val; + d->have_new_ctrl = true; + + return 0; +} + +/* + * Check whether a cache bit mask is valid. + * For Intel the SDM says: + * Please note that all (and only) contiguous '1' combinations + * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). + * Additionally Haswell requires at least two bits set. + * AMD allows non-contiguous bitmasks. + */ +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) +{ + unsigned long first_bit, zero_bit, val; + unsigned int cbm_len = r->cache.cbm_len; + int ret; + + ret = kstrtoul(buf, 16, &val); + if (ret) { + rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); + return false; + } + + if ((!r->cache.arch_has_empty_bitmaps && val == 0) || + val > r->default_ctrl) { + rdt_last_cmd_puts("Mask out of range\n"); + return false; + } + + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); + + /* Are non-contiguous bitmaps allowed? */ + if (!r->cache.arch_has_sparse_bitmaps && + (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { + rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); + return false; + } + + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("Need at least %d bits in the mask\n", + r->cache.min_cbm_bits); + return false; + } + + *data = val; + return true; +} + +/* + * Read one cache bit mask (hex). Check that it is valid for the current + * resource type. + */ +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) +{ + struct rdtgroup *rdtgrp = data->rdtgrp; + u32 cbm_val; + + if (d->have_new_ctrl) { + rdt_last_cmd_printf("Duplicate domain %d\n", d->id); + return -EINVAL; + } + + /* + * Cannot set up more than one pseudo-locked region in a cache + * hierarchy. + */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + rdtgroup_pseudo_locked_in_hierarchy(d)) { + rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); + return -EINVAL; + } + + if (!cbm_validate(data->buf, &cbm_val, r)) + return -EINVAL; + + if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_SHAREABLE) && + rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { + rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); + return -EINVAL; + } + + /* + * The CBM may not overlap with the CBM of another closid if + * either is exclusive. + */ + if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, true)) { + rdt_last_cmd_puts("Overlaps with exclusive group\n"); + return -EINVAL; + } + + if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, false)) { + if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + rdt_last_cmd_puts("Overlaps with other group\n"); + return -EINVAL; + } + } + + d->new_ctrl = cbm_val; + d->have_new_ctrl = true; + + return 0; +} + +/* + * For each domain in this resource we expect to find a series of: + * id=mask + * separated by ";". The "id" is in decimal, and must match one of + * the "id"s for this resource. + */ +static int parse_line(char *line, struct rdt_resource *r, + struct rdtgroup *rdtgrp) +{ + struct rdt_parse_data data; + char *dom = NULL, *id; + struct rdt_domain *d; + unsigned long dom_id; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + r->rid == RDT_RESOURCE_MBA) { + rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); + return -EINVAL; + } + +next: + if (!line || line[0] == '\0') + return 0; + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); + return -EINVAL; + } + dom = strim(dom); + list_for_each_entry(d, &r->domains, list) { + if (d->id == dom_id) { + data.buf = dom; + data.rdtgrp = rdtgrp; + if (r->parse_ctrlval(&data, r, d)) + return -EINVAL; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * In pseudo-locking setup mode and just + * parsed a valid CBM that should be + * pseudo-locked. Only one locked region per + * resource group and domain so just do + * the required initialization for single + * region and return. + */ + rdtgrp->plr->r = r; + rdtgrp->plr->d = d; + rdtgrp->plr->cbm = d->new_ctrl; + d->plr = rdtgrp->plr; + return 0; + } + goto next; + } + } + return -EINVAL; +} + +int update_domains(struct rdt_resource *r, int closid) +{ + struct msr_param msr_param; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + bool mba_sc; + u32 *dc; + int cpu; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.low = closid; + msr_param.high = msr_param.low + 1; + msr_param.res = r; + + mba_sc = is_mba_sc(r); + list_for_each_entry(d, &r->domains, list) { + dc = !mba_sc ? d->ctrl_val : d->mbps_val; + if (d->have_new_ctrl && d->new_ctrl != dc[closid]) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + dc[closid] = d->new_ctrl; + } + } + + /* + * Avoid writing the control msr with control values when + * MBA software controller is enabled + */ + if (cpumask_empty(cpu_mask) || mba_sc) + goto done; + cpu = get_cpu(); + /* Update resource control msr on this CPU if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_ctrl_update(&msr_param); + /* Update resource control msr on other CPUs. */ + smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); + put_cpu(); + +done: + free_cpumask_var(cpu_mask); + + return 0; +} + +static int rdtgroup_parse_resource(char *resname, char *tok, + struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r; + + for_each_alloc_enabled_rdt_resource(r) { + if (!strcmp(resname, r->name) && rdtgrp->closid < r->num_closid) + return parse_line(tok, r, rdtgrp); + } + rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); + return -EINVAL; +} + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + struct rdt_domain *dom; + struct rdt_resource *r; + char *tok, *resname; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + cpus_read_unlock(); + return -ENOENT; + } + rdt_last_cmd_clear(); + + /* + * No changes to pseudo-locked region allowed. It has to be removed + * and re-created instead. + */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = -EINVAL; + rdt_last_cmd_puts("Resource group is pseudo-locked\n"); + goto out; + } + + for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(dom, &r->domains, list) + dom->have_new_ctrl = false; + } + + while ((tok = strsep(&buf, "\n")) != NULL) { + resname = strim(strsep(&tok, ":")); + if (!tok) { + rdt_last_cmd_puts("Missing ':'\n"); + ret = -EINVAL; + goto out; + } + if (tok[0] == '\0') { + rdt_last_cmd_printf("Missing '%s' value\n", resname); + ret = -EINVAL; + goto out; + } + ret = rdtgroup_parse_resource(resname, tok, rdtgrp); + if (ret) + goto out; + } + + for_each_alloc_enabled_rdt_resource(r) { + ret = update_domains(r, rdtgrp->closid); + if (ret) + goto out; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * If pseudo-locking fails we keep the resource group in + * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service + * active and updated for just the domain the pseudo-locked + * region was requested for. + */ + ret = rdtgroup_pseudo_lock_create(rdtgrp); + } + +out: + rdtgroup_kn_unlock(of->kn); + cpus_read_unlock(); + return ret ?: nbytes; +} + +static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) +{ + struct rdt_domain *dom; + bool sep = false; + u32 ctrl_val; + + seq_printf(s, "%*s:", max_name_width, r->name); + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_puts(s, ";"); + + ctrl_val = (!is_mba_sc(r) ? dom->ctrl_val[closid] : + dom->mbps_val[closid]); + seq_printf(s, r->format_str, dom->id, max_data_width, + ctrl_val); + sep = true; + } + seq_puts(s, "\n"); +} + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + int ret = 0; + u32 closid; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + for_each_alloc_enabled_rdt_resource(r) + seq_printf(s, "%s:uninitialized\n", r->name); + } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%s:%d=%x\n", + rdtgrp->plr->r->name, + rdtgrp->plr->d->id, + rdtgrp->plr->cbm); + } + } else { + closid = rdtgrp->closid; + for_each_alloc_enabled_rdt_resource(r) { + if (closid < r->num_closid) + show_doms(s, r, closid); + } + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + return ret; +} + +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first) +{ + /* + * setup the parameters to send to the IPI to read the data. + */ + rr->rgrp = rdtgrp; + rr->evtid = evtid; + rr->r = r; + rr->d = d; + rr->val = 0; + rr->first = first; + + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); +} + +int rdtgroup_mondata_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + u32 resid, evtid, domid; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + union mon_data_bits md; + struct rdt_domain *d; + struct rmid_read rr; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out; + } + + md.priv = of->kn->priv; + resid = md.u.rid; + domid = md.u.domid; + evtid = md.u.evtid; + + r = &rdt_resources_all[resid]; + d = rdt_find_domain(r, domid, NULL); + if (IS_ERR_OR_NULL(d)) { + ret = -ENOENT; + goto out; + } + + mon_event_read(&rr, r, d, rdtgrp, evtid, false); + + if (rr.val & RMID_VAL_ERROR) + seq_puts(m, "Error\n"); + else if (rr.val & RMID_VAL_UNAVAIL) + seq_puts(m, "Unavailable\n"); + else + seq_printf(m, "%llu\n", rr.val * r->mon_scale); + +out: + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h new file mode 100644 index 000000000..f65d3c0db --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -0,0 +1,632 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_RESCTRL_INTERNAL_H +#define _ASM_X86_RESCTRL_INTERNAL_H + +#include +#include +#include +#include + +#define MSR_IA32_L3_QOS_CFG 0xc81 +#define MSR_IA32_L2_QOS_CFG 0xc82 +#define MSR_IA32_L3_CBM_BASE 0xc90 +#define MSR_IA32_L2_CBM_BASE 0xd10 +#define MSR_IA32_MBA_THRTL_BASE 0xd50 +#define MSR_IA32_MBA_BW_BASE 0xc0000200 + +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + +#define L3_QOS_CDP_ENABLE 0x01ULL + +#define L2_QOS_CDP_ENABLE 0x01ULL + +/* + * Event IDs are used to program IA32_QM_EVTSEL before reading event + * counter from IA32_QM_CTR + */ +#define QOS_L3_OCCUP_EVENT_ID 0x01 +#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 +#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 + +#define CQM_LIMBOCHECK_INTERVAL 1000 + +#define MBM_CNTR_WIDTH_BASE 24 +#define MBM_OVERFLOW_INTERVAL 1000 +#define MAX_MBA_BW 100u +#define MBA_IS_LINEAR 0x4 +#define MBA_MAX_MBPS U32_MAX +#define MAX_MBA_BW_AMD 0x800 +#define MBM_CNTR_WIDTH_OFFSET_AMD 20 + +#define RMID_VAL_ERROR BIT_ULL(63) +#define RMID_VAL_UNAVAIL BIT_ULL(62) +/* + * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for + * data to be returned. The counter width is discovered from the hardware + * as an offset from MBM_CNTR_WIDTH_BASE. + */ +#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) + + +struct rdt_fs_context { + struct kernfs_fs_context kfc; + bool enable_cdpl2; + bool enable_cdpl3; + bool enable_mba_mbps; +}; + +static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + return container_of(kfc, struct rdt_fs_context, kfc); +} + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); + +/** + * struct mon_evt - Entry in the event list of a resource + * @evtid: event id + * @name: name of the event + */ +struct mon_evt { + u32 evtid; + char *name; + struct list_head list; +}; + +/** + * struct mon_data_bits - Monitoring details for each event file + * @rid: Resource id associated with the event file. + * @evtid: Event id associated with the event file + * @domid: The domain to which the event file belongs + */ +union mon_data_bits { + void *priv; + struct { + unsigned int rid : 10; + unsigned int evtid : 8; + unsigned int domid : 14; + } u; +}; + +struct rmid_read { + struct rdtgroup *rgrp; + struct rdt_resource *r; + struct rdt_domain *d; + int evtid; + bool first; + u64 val; +}; + +extern unsigned int resctrl_cqm_threshold; +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; +extern unsigned int rdt_mon_features; + +enum rdt_group_type { + RDTCTRL_GROUP = 0, + RDTMON_GROUP, + RDT_NUM_GROUP, +}; + +/** + * enum rdtgrp_mode - Mode of a RDT resource group + * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations + * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed + * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking + * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations + * allowed AND the allocations are Cache Pseudo-Locked + * + * The mode of a resource group enables control over the allowed overlap + * between allocations associated with different resource groups (classes + * of service). User is able to modify the mode of a resource group by + * writing to the "mode" resctrl file associated with the resource group. + * + * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by + * writing the appropriate text to the "mode" file. A resource group enters + * "pseudo-locked" mode after the schemata is written while the resource + * group is in "pseudo-locksetup" mode. + */ +enum rdtgrp_mode { + RDT_MODE_SHAREABLE = 0, + RDT_MODE_EXCLUSIVE, + RDT_MODE_PSEUDO_LOCKSETUP, + RDT_MODE_PSEUDO_LOCKED, + + /* Must be last */ + RDT_NUM_MODES, +}; + +/** + * struct mongroup - store mon group's data in resctrl fs. + * @mon_data_kn kernlfs node for the mon_data directory + * @parent: parent rdtgrp + * @crdtgrp_list: child rdtgroup node list + * @rmid: rmid for this rdtgroup + */ +struct mongroup { + struct kernfs_node *mon_data_kn; + struct rdtgroup *parent; + struct list_head crdtgrp_list; + u32 rmid; +}; + +/** + * struct pseudo_lock_region - pseudo-lock region information + * @r: RDT resource to which this pseudo-locked region + * belongs + * @d: RDT domain to which this pseudo-locked region + * belongs + * @cbm: bitmask of the pseudo-locked region + * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread + * completion + * @thread_done: variable used by waitqueue to test if pseudo-locking + * thread completed + * @cpu: core associated with the cache on which the setup code + * will be run + * @line_size: size of the cache lines + * @size: size of pseudo-locked region in bytes + * @kmem: the kernel memory associated with pseudo-locked region + * @minor: minor number of character device associated with this + * region + * @debugfs_dir: pointer to this region's directory in the debugfs + * filesystem + * @pm_reqs: Power management QoS requests related to this region + */ +struct pseudo_lock_region { + struct rdt_resource *r; + struct rdt_domain *d; + u32 cbm; + wait_queue_head_t lock_thread_wq; + int thread_done; + int cpu; + unsigned int line_size; + unsigned int size; + void *kmem; + unsigned int minor; + struct dentry *debugfs_dir; + struct list_head pm_reqs; +}; + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. + * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + * @type: indicates type of this rdtgroup - either + * monitor only or ctrl_mon group + * @mon: mongroup related data + * @mode: mode of resource group + * @plr: pseudo-locked region + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; + enum rdtgrp_mode mode; + struct pseudo_lock_region *plr; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + +/* + * Define the file type flags for base and info directories. + */ +#define RFTYPE_INFO BIT(0) +#define RFTYPE_BASE BIT(1) +#define RF_CTRLSHIFT 4 +#define RF_MONSHIFT 5 +#define RF_TOPSHIFT 6 +#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) +#define RFTYPE_MON BIT(RF_MONSHIFT) +#define RFTYPE_TOP BIT(RF_TOPSHIFT) +#define RFTYPE_RES_CACHE BIT(8) +#define RFTYPE_RES_MB BIT(9) +#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) +#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +extern int max_name_width, max_data_width; + +int __init rdtgroup_init(void); +void __exit rdtgroup_exit(void); + +/** + * struct rftype - describe each file in the resctrl file system + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @fflags: File specific RF_* or RFTYPE_* flags + * @seq_show: Show content of the file + * @write: Write to the file + */ +struct rftype { + char *name; + umode_t mode; + struct kernfs_ops *kf_ops; + unsigned long flags; + unsigned long fflags; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct mbm_state - status for each MBM counter in each domain + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting + * @prev_bw The most recent bandwidth in MBps + * @delta_bw Difference between the current and previous bandwidth + * @delta_comp Indicates whether to compute the delta_bw + */ +struct mbm_state { + u64 chunks; + u64 prev_msr; + u64 prev_bw_msr; + u32 prev_bw; + u32 delta_bw; + bool delta_comp; +}; + +/** + * struct rdt_domain - group of cpus sharing an RDT resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which cpus share this resource + * @rmid_busy_llc: + * bitmap of which limbo RMIDs are above threshold + * @mbm_total: saved state for MBM total bandwidth + * @mbm_local: saved state for MBM local bandwidth + * @mbm_over: worker to periodically read MBM h/w counters + * @cqm_limbo: worker to periodically read CQM h/w counters + * @mbm_work_cpu: + * worker cpu for MBM h/w counters + * @cqm_work_cpu: + * worker cpu for CQM h/w counters + * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: did user provide new_ctrl for this domain + * @plr: pseudo-locked region (if any) associated with domain + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + unsigned long *rmid_busy_llc; + struct mbm_state *mbm_total; + struct mbm_state *mbm_local; + struct delayed_work mbm_over; + struct delayed_work cqm_limbo; + int mbm_work_cpu; + int cqm_work_cpu; + u32 *ctrl_val; + u32 *mbps_val; + u32 new_ctrl; + bool have_new_ctrl; + struct pseudo_lock_region *plr; +}; + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + int low; + int high; +}; + +/** + * struct rdt_cache - Cache allocation related data + * @cbm_len: Length of the cache bit mask + * @min_cbm_bits: Minimum number of consecutive bits to be set + * @cbm_idx_mult: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + * in a cache bit mask + * @shareable_bits: Bitmask of shareable resource with other + * executing entities + * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid. + * @arch_has_empty_bitmaps: True if the '0' bitmap is valid. + * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache + * level has CPU scope. + */ +struct rdt_cache { + unsigned int cbm_len; + unsigned int min_cbm_bits; + unsigned int cbm_idx_mult; + unsigned int cbm_idx_offset; + unsigned int shareable_bits; + bool arch_has_sparse_bitmaps; + bool arch_has_empty_bitmaps; + bool arch_has_per_cpu_cfg; +}; + +/** + * enum membw_throttle_mode - System's memory bandwidth throttling mode + * @THREAD_THROTTLE_UNDEFINED: Not relevant to the system + * @THREAD_THROTTLE_MAX: Memory bandwidth is throttled at the core + * always using smallest bandwidth percentage + * assigned to threads, aka "max throttling" + * @THREAD_THROTTLE_PER_THREAD: Memory bandwidth is throttled at the thread + */ +enum membw_throttle_mode { + THREAD_THROTTLE_UNDEFINED = 0, + THREAD_THROTTLE_MAX, + THREAD_THROTTLE_PER_THREAD, +}; + +/** + * struct rdt_membw - Memory bandwidth allocation related data + * @min_bw: Minimum memory bandwidth percentage user can request + * @bw_gran: Granularity at which the memory bandwidth is allocated + * @delay_linear: True if memory B/W delay is in linear scale + * @arch_needs_linear: True if we can't configure non-linear resources + * @throttle_mode: Bandwidth throttling mode when threads request + * different memory bandwidths + * @mba_sc: True if MBA software controller(mba_sc) is enabled + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + */ +struct rdt_membw { + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + bool arch_needs_linear; + enum membw_throttle_mode throttle_mode; + bool mba_sc; + u32 *mb_map; +}; + +static inline bool is_llc_occupancy_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); +} + +static inline bool is_mbm_total_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); +} + +static inline bool is_mbm_local_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); +} + +static inline bool is_mbm_enabled(void) +{ + return (is_mbm_total_enabled() || is_mbm_local_enabled()); +} + +static inline bool is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +struct rdt_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + +/** + * struct rdt_resource - attributes of an RDT resource + * @rid: The index of the resource + * @alloc_enabled: Is allocation enabled on this machine + * @mon_enabled: Is monitoring enabled for this feature + * @alloc_capable: Is allocation available on this machine + * @mon_capable: Is monitor feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @cache_level: Which cache level defines scope of this resource + * @default_ctrl: Specifies default cache cbm or memory B/W percent. + * @msr_base: Base MSR address for CBMs + * @msr_update: Function pointer to update QOS MSRs + * @data_width: Character width of data when displaying + * @domains: All domains for this resource + * @cache: Cache allocation related data + * @format_str: Per resource format string to show domain value + * @parse_ctrlval: Per resource function pointer to parse control values + * @evt_list: List of monitoring events + * @num_rmid: Number of RMIDs available + * @mon_scale: cqm counter * mon_scale = occupancy in bytes + * @fflags: flags to choose base and info files + */ +struct rdt_resource { + int rid; + bool alloc_enabled; + bool mon_enabled; + bool alloc_capable; + bool mon_capable; + char *name; + int num_closid; + int cache_level; + u32 default_ctrl; + unsigned int msr_base; + void (*msr_update) (struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + int data_width; + struct list_head domains; + struct rdt_cache cache; + struct rdt_membw membw; + const char *format_str; + int (*parse_ctrlval)(struct rdt_parse_data *data, + struct rdt_resource *r, + struct rdt_domain *d); + struct list_head evt_list; + int num_rmid; + unsigned int mon_scale; + unsigned int mbm_width; + unsigned long fflags; +}; + +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); + +extern struct mutex rdtgroup_mutex; + +extern struct rdt_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +extern struct dentry *debugfs_resctrl; + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + RDT_RESOURCE_L2DATA, + RDT_RESOURCE_L2CODE, + RDT_RESOURCE_MBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +#define for_each_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) + +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable || r->mon_capable) + +#define for_each_alloc_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable) + +#define for_each_mon_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_capable) + +#define for_each_alloc_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_enabled) + +#define for_each_mon_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_enabled) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ +union cpuid_0x10_3_eax { + struct { + unsigned int max_delay:12; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID).EDX */ +union cpuid_0x10_x_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; + +void rdt_last_cmd_clear(void); +void rdt_last_cmd_puts(const char *s); +void rdt_last_cmd_printf(const char *fmt, ...); + +void rdt_ctrl_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask); +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); +bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + unsigned long cbm, int closid, bool exclusive); +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, + unsigned long cbm); +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); +int rdtgroup_tasks_assigned(struct rdtgroup *r); +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm); +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d); +int rdt_pseudo_lock_init(void); +void rdt_pseudo_lock_release(void); +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +int update_domains(struct rdt_resource *r, int closid); +int closids_supported(void); +void closid_free(int closid); +int alloc_rmid(void); +void free_rmid(u32 rmid); +int rdt_get_mon_l3_config(struct rdt_resource *r); +void mon_event_count(void *info); +int rdtgroup_mondata_show(struct seq_file *m, void *arg); +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id); +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d); +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first); +void mbm_setup_overflow_handler(struct rdt_domain *dom, + unsigned long delay_ms); +void mbm_handle_overflow(struct work_struct *work); +bool is_mba_sc(struct rdt_resource *r); +void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm); +u32 delay_bw_map(unsigned long bw, struct rdt_resource *r); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_handle_limbo(struct work_struct *work); +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +void __check_limbo(struct rdt_domain *d, bool force_free); +void rdt_domain_reconfigure_cdp(struct rdt_resource *r); +void __init thread_throttle_mode_init(void); + +#endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c new file mode 100644 index 000000000..576f16a50 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -0,0 +1,649 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology(RDT) + * - Monitoring code + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Vikas Shivappa + * + * This replaces the cqm.c based on perf but we reuse a lot of + * code and datastructures originally from Peter Zijlstra and Matt Fleming. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#include +#include +#include +#include "internal.h" + +struct rmid_entry { + u32 rmid; + int busy; + struct list_head list; +}; + +/** + * @rmid_free_lru A least recently used list of free RMIDs + * These RMIDs are guaranteed to have an occupancy less than the + * threshold occupancy + */ +static LIST_HEAD(rmid_free_lru); + +/** + * @rmid_limbo_count count of currently unused but (potentially) + * dirty RMIDs. + * This counts RMIDs that no one is currently using but that + * may have a occupancy value > intel_cqm_threshold. User can change + * the threshold occupancy value. + */ +static unsigned int rmid_limbo_count; + +/** + * @rmid_entry - The entry in the limbo and free lists. + */ +static struct rmid_entry *rmid_ptrs; + +/* + * Global boolean for rdt_monitor which is true if any + * resource monitoring is enabled. + */ +bool rdt_mon_capable; + +/* + * Global to indicate which monitoring events are enabled. + */ +unsigned int rdt_mon_features; + +/* + * This is the threshold cache occupancy at which we will consider an + * RMID available for re-allocation. + */ +unsigned int resctrl_cqm_threshold; + +static inline struct rmid_entry *__rmid_entry(u32 rmid) +{ + struct rmid_entry *entry; + + entry = &rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} + +static u64 __rmid_read(u32 rmid, u32 eventid) +{ + u64 val; + + /* + * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured + * with a valid event code for supported resource type and the bits + * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, + * IA32_QM_CTR.data (bits 61:0) reports the monitored data. + * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) + * are error bits. + */ + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + return val; +} + +static bool rmid_dirty(struct rmid_entry *entry) +{ + u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + + return val >= resctrl_cqm_threshold; +} + +/* + * Check the RMIDs that are marked as busy for this domain. If the + * reported LLC occupancy is below the threshold clear the busy bit and + * decrement the count. If the busy count gets to zero on an RMID, we + * free the RMID + */ +void __check_limbo(struct rdt_domain *d, bool force_free) +{ + struct rmid_entry *entry; + struct rdt_resource *r; + u32 crmid = 1, nrmid; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + /* + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that + * are marked as busy for occupancy < threshold. If the occupancy + * is less than the threshold decrement the busy counter of the + * RMID and move it to the free list when the counter reaches 0. + */ + for (;;) { + nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); + if (nrmid >= r->num_rmid) + break; + + entry = __rmid_entry(nrmid); + if (force_free || !rmid_dirty(entry)) { + clear_bit(entry->rmid, d->rmid_busy_llc); + if (!--entry->busy) { + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + } + } + crmid = nrmid + 1; + } +} + +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +{ + return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; +} + +/* + * As of now the RMIDs allocation is global. + * However we keep track of which packages the RMIDs + * are used to optimize the limbo list management. + */ +int alloc_rmid(void) +{ + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? -EBUSY : -ENOSPC; + + entry = list_first_entry(&rmid_free_lru, + struct rmid_entry, list); + list_del(&entry->list); + + return entry->rmid; +} + +static void add_rmid_to_limbo(struct rmid_entry *entry) +{ + struct rdt_resource *r; + struct rdt_domain *d; + int cpu; + u64 val; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + entry->busy = 0; + cpu = get_cpu(); + list_for_each_entry(d, &r->domains, list) { + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + if (val <= resctrl_cqm_threshold) + continue; + } + + /* + * For the first limbo RMID in the domain, + * setup up the limbo worker. + */ + if (!has_busy_rmid(r, d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); + set_bit(entry->rmid, d->rmid_busy_llc); + entry->busy++; + } + put_cpu(); + + if (entry->busy) + rmid_limbo_count++; + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +void free_rmid(u32 rmid) +{ + struct rmid_entry *entry; + + if (!rmid) + return; + + lockdep_assert_held(&rdtgroup_mutex); + + entry = __rmid_entry(rmid); + + if (is_llc_occupancy_enabled()) + add_rmid_to_limbo(entry); + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) +{ + u64 shift = 64 - width, chunks; + + chunks = (cur_msr << shift) - (prev_msr << shift); + return chunks >>= shift; +} + +static u64 __mon_event_count(u32 rmid, struct rmid_read *rr) +{ + struct mbm_state *m; + u64 chunks, tval; + + tval = __rmid_read(rmid, rr->evtid); + if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { + return tval; + } + switch (rr->evtid) { + case QOS_L3_OCCUP_EVENT_ID: + rr->val += tval; + return 0; + case QOS_L3_MBM_TOTAL_EVENT_ID: + m = &rr->d->mbm_total[rmid]; + break; + case QOS_L3_MBM_LOCAL_EVENT_ID: + m = &rr->d->mbm_local[rmid]; + break; + default: + /* + * Code would never reach here because an invalid + * event id would fail the __rmid_read. + */ + return RMID_VAL_ERROR; + } + + if (rr->first) { + memset(m, 0, sizeof(struct mbm_state)); + m->prev_bw_msr = m->prev_msr = tval; + return 0; + } + + chunks = mbm_overflow_count(m->prev_msr, tval, rr->r->mbm_width); + m->chunks += chunks; + m->prev_msr = tval; + + rr->val += m->chunks; + return 0; +} + +/* + * Supporting function to calculate the memory bandwidth + * and delta bandwidth in MBps. + */ +static void mbm_bw_count(u32 rmid, struct rmid_read *rr) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + struct mbm_state *m = &rr->d->mbm_local[rmid]; + u64 tval, cur_bw, chunks; + + tval = __rmid_read(rmid, rr->evtid); + if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + + chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); + cur_bw = (chunks * r->mon_scale) >> 20; + + if (m->delta_comp) + m->delta_bw = abs(cur_bw - m->prev_bw); + m->delta_comp = false; + m->prev_bw = cur_bw; + m->prev_bw_msr = tval; +} + +/* + * This is called via IPI to read the CQM/MBM counters + * on a domain. + */ +void mon_event_count(void *info) +{ + struct rdtgroup *rdtgrp, *entry; + struct rmid_read *rr = info; + struct list_head *head; + u64 ret_val; + + rdtgrp = rr->rgrp; + + ret_val = __mon_event_count(rdtgrp->mon.rmid, rr); + + /* + * For Ctrl groups read data from child monitor groups and + * add them together. Count events which are read successfully. + * Discard the rmid_read's reporting errors. + */ + head = &rdtgrp->mon.crdtgrp_list; + + if (rdtgrp->type == RDTCTRL_GROUP) { + list_for_each_entry(entry, head, mon.crdtgrp_list) { + if (__mon_event_count(entry->mon.rmid, rr) == 0) + ret_val = 0; + } + } + + /* Report error if none of rmid_reads are successful */ + if (ret_val) + rr->val = ret_val; +} + +/* + * Feedback loop for MBA software controller (mba_sc) + * + * mba_sc is a feedback loop where we periodically read MBM counters and + * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so + * that: + * + * current bandwdith(cur_bw) < user specified bandwidth(user_bw) + * + * This uses the MBM counters to measure the bandwidth and MBA throttle + * MSRs to control the bandwidth for a particular rdtgrp. It builds on the + * fact that resctrl rdtgroups have both monitoring and control. + * + * The frequency of the checks is 1s and we just tag along the MBM overflow + * timer. Having 1s interval makes the calculation of bandwidth simpler. + * + * Although MBA's goal is to restrict the bandwidth to a maximum, there may + * be a need to increase the bandwidth to avoid uncecessarily restricting + * the L2 <-> L3 traffic. + * + * Since MBA controls the L2 external bandwidth where as MBM measures the + * L3 external bandwidth the following sequence could lead to such a + * situation. + * + * Consider an rdtgroup which had high L3 <-> memory traffic in initial + * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but + * after some time rdtgroup has mostly L2 <-> L3 traffic. + * + * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its + * throttle MSRs already have low percentage values. To avoid + * unnecessarily restricting such rdtgroups, we also increase the bandwidth. + */ +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) +{ + u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val; + struct mbm_state *pmbm_data, *cmbm_data; + u32 cur_bw, delta_bw, user_bw; + struct rdt_resource *r_mba; + struct rdt_domain *dom_mba; + struct list_head *head; + struct rdtgroup *entry; + + if (!is_mbm_local_enabled()) + return; + + r_mba = &rdt_resources_all[RDT_RESOURCE_MBA]; + closid = rgrp->closid; + rmid = rgrp->mon.rmid; + pmbm_data = &dom_mbm->mbm_local[rmid]; + + dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba); + if (!dom_mba) { + pr_warn_once("Failure to get domain for MBA update\n"); + return; + } + + cur_bw = pmbm_data->prev_bw; + user_bw = dom_mba->mbps_val[closid]; + delta_bw = pmbm_data->delta_bw; + cur_msr_val = dom_mba->ctrl_val[closid]; + + /* + * For Ctrl groups read data from child monitor groups. + */ + head = &rgrp->mon.crdtgrp_list; + list_for_each_entry(entry, head, mon.crdtgrp_list) { + cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; + cur_bw += cmbm_data->prev_bw; + delta_bw += cmbm_data->delta_bw; + } + + /* + * Scale up/down the bandwidth linearly for the ctrl group. The + * bandwidth step is the bandwidth granularity specified by the + * hardware. + * + * The delta_bw is used when increasing the bandwidth so that we + * dont alternately increase and decrease the control values + * continuously. + * + * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if + * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep + * switching between 90 and 110 continuously if we only check + * cur_bw < user_bw. + */ + if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { + new_msr_val = cur_msr_val - r_mba->membw.bw_gran; + } else if (cur_msr_val < MAX_MBA_BW && + (user_bw > (cur_bw + delta_bw))) { + new_msr_val = cur_msr_val + r_mba->membw.bw_gran; + } else { + return; + } + + cur_msr = r_mba->msr_base + closid; + wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba)); + dom_mba->ctrl_val[closid] = new_msr_val; + + /* + * Delta values are updated dynamically package wise for each + * rdtgrp everytime the throttle MSR changes value. + * + * This is because (1)the increase in bandwidth is not perfectly + * linear and only "approximately" linear even when the hardware + * says it is linear.(2)Also since MBA is a core specific + * mechanism, the delta values vary based on number of cores used + * by the rdtgrp. + */ + pmbm_data->delta_comp = true; + list_for_each_entry(entry, head, mon.crdtgrp_list) { + cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; + cmbm_data->delta_comp = true; + } +} + +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) +{ + struct rmid_read rr; + + rr.first = false; + rr.r = r; + rr.d = d; + + /* + * This is protected from concurrent reads from user + * as both the user and we hold the global mutex. + */ + if (is_mbm_total_enabled()) { + rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } + if (is_mbm_local_enabled()) { + rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + __mon_event_count(rmid, &rr); + + /* + * Call the MBA software controller only for the + * control groups and when user has enabled + * the software controller explicitly. + */ + if (is_mba_sc(NULL)) + mbm_bw_count(rmid, &rr); + } +} + +/* + * Handler to scan the limbo list and move the RMIDs + * to free list whose occupancy < threshold_occupancy. + */ +void cqm_handle_limbo(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); + int cpu = smp_processor_id(); + struct rdt_resource *r; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + d = container_of(work, struct rdt_domain, cqm_limbo.work); + + __check_limbo(d, false); + + if (has_busy_rmid(r, d)) + schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + + mutex_unlock(&rdtgroup_mutex); +} + +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + cpu = cpumask_any(&dom->cpu_mask); + dom->cqm_work_cpu = cpu; + + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); +} + +void mbm_handle_overflow(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + struct rdtgroup *prgrp, *crgrp; + int cpu = smp_processor_id(); + struct list_head *head; + struct rdt_resource *r; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + if (!static_branch_likely(&rdt_mon_enable_key)) + goto out_unlock; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + d = container_of(work, struct rdt_domain, mbm_over.work); + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mbm_update(r, d, prgrp->mon.rmid); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) + mbm_update(r, d, crgrp->mon.rmid); + + if (is_mba_sc(NULL)) + update_mba_bw(prgrp, d); + } + + schedule_delayed_work_on(cpu, &d->mbm_over, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + if (!static_branch_likely(&rdt_mon_enable_key)) + return; + cpu = cpumask_any(&dom->cpu_mask); + dom->mbm_work_cpu = cpu; + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); +} + +static int dom_data_init(struct rdt_resource *r) +{ + struct rmid_entry *entry = NULL; + int i, nr_rmids; + + nr_rmids = r->num_rmid; + rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) + return -ENOMEM; + + for (i = 0; i < nr_rmids; i++) { + entry = &rmid_ptrs[i]; + INIT_LIST_HEAD(&entry->list); + + entry->rmid = i; + list_add_tail(&entry->list, &rmid_free_lru); + } + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + entry = __rmid_entry(0); + list_del(&entry->list); + + return 0; +} + +static struct mon_evt llc_occupancy_event = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, +}; + +static struct mon_evt mbm_total_event = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +}; + +static struct mon_evt mbm_local_event = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +}; + +/* + * Initialize the event list for the resource. + * + * Note that MBM events are also part of RDT_RESOURCE_L3 resource + * because as per the SDM the total and local memory bandwidth + * are enumerated as part of L3 monitoring. + */ +static void l3_mon_evt_init(struct rdt_resource *r) +{ + INIT_LIST_HEAD(&r->evt_list); + + if (is_llc_occupancy_enabled()) + list_add_tail(&llc_occupancy_event.list, &r->evt_list); + if (is_mbm_total_enabled()) + list_add_tail(&mbm_total_event.list, &r->evt_list); + if (is_mbm_local_enabled()) + list_add_tail(&mbm_local_event.list, &r->evt_list); +} + +int rdt_get_mon_l3_config(struct rdt_resource *r) +{ + unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; + unsigned int cl_size = boot_cpu_data.x86_cache_size; + int ret; + + r->mon_scale = boot_cpu_data.x86_cache_occ_scale; + r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + r->mbm_width = MBM_CNTR_WIDTH_BASE; + + if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) + r->mbm_width += mbm_offset; + else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) + pr_warn("Ignoring impossible MBM counter offset\n"); + + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. + * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. + */ + resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid; + + /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ + resctrl_cqm_threshold /= r->mon_scale; + + ret = dom_data_init(r); + if (ret) + return ret; + + l3_mon_evt_init(r); + + r->mon_capable = true; + r->mon_enabled = true; + + return 0; +} diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c new file mode 100644 index 000000000..465dce141 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -0,0 +1,1594 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Resource Director Technology (RDT) + * + * Pseudo-locking support built on top of Cache Allocation Technology (CAT) + * + * Copyright (C) 2018 Intel Corporation + * + * Author: Reinette Chatre + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../../events/perf_event.h" /* For X86_CONFIG() */ +#include "internal.h" + +#define CREATE_TRACE_POINTS +#include "pseudo_lock_event.h" + +/* + * The bits needed to disable hardware prefetching varies based on the + * platform. During initialization we will discover which bits to use. + */ +static u64 prefetch_disable_bits; + +/* + * Major number assigned to and shared by all devices exposing + * pseudo-locked regions. + */ +static unsigned int pseudo_lock_major; +static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); +static struct class *pseudo_lock_class; + +/** + * get_prefetch_disable_bits - prefetch disable bits of supported platforms + * + * Capture the list of platforms that have been validated to support + * pseudo-locking. This includes testing to ensure pseudo-locked regions + * with low cache miss rates can be created under variety of load conditions + * as well as that these pseudo-locked regions can maintain their low cache + * miss rates under variety of load conditions for significant lengths of time. + * + * After a platform has been validated to support pseudo-locking its + * hardware prefetch disable bits are included here as they are documented + * in the SDM. + * + * When adding a platform here also add support for its cache events to + * measure_cycles_perf_fn() + * + * Return: + * If platform is supported, the bits to disable hardware prefetchers, 0 + * if platform is not supported. + */ +static u64 get_prefetch_disable_bits(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return 0; + + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_BROADWELL_X: + /* + * SDM defines bits of MSR_MISC_FEATURE_CONTROL register + * as: + * 0 L2 Hardware Prefetcher Disable (R/W) + * 1 L2 Adjacent Cache Line Prefetcher Disable (R/W) + * 2 DCU Hardware Prefetcher Disable (R/W) + * 3 DCU IP Prefetcher Disable (R/W) + * 63:4 Reserved + */ + return 0xF; + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + /* + * SDM defines bits of MSR_MISC_FEATURE_CONTROL register + * as: + * 0 L2 Hardware Prefetcher Disable (R/W) + * 1 Reserved + * 2 DCU Hardware Prefetcher Disable (R/W) + * 63:3 Reserved + */ + return 0x5; + } + + return 0; +} + +/** + * pseudo_lock_minor_get - Obtain available minor number + * @minor: Pointer to where new minor number will be stored + * + * A bitmask is used to track available minor numbers. Here the next free + * minor number is marked as unavailable and returned. + * + * Return: 0 on success, <0 on failure. + */ +static int pseudo_lock_minor_get(unsigned int *minor) +{ + unsigned long first_bit; + + first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); + + if (first_bit == MINORBITS) + return -ENOSPC; + + __clear_bit(first_bit, &pseudo_lock_minor_avail); + *minor = first_bit; + + return 0; +} + +/** + * pseudo_lock_minor_release - Return minor number to available + * @minor: The minor number made available + */ +static void pseudo_lock_minor_release(unsigned int minor) +{ + __set_bit(minor, &pseudo_lock_minor_avail); +} + +/** + * region_find_by_minor - Locate a pseudo-lock region by inode minor number + * @minor: The minor number of the device representing pseudo-locked region + * + * When the character device is accessed we need to determine which + * pseudo-locked region it belongs to. This is done by matching the minor + * number of the device to the pseudo-locked region it belongs. + * + * Minor numbers are assigned at the time a pseudo-locked region is associated + * with a cache instance. + * + * Return: On success return pointer to resource group owning the pseudo-locked + * region, NULL on failure. + */ +static struct rdtgroup *region_find_by_minor(unsigned int minor) +{ + struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->plr && rdtgrp->plr->minor == minor) { + rdtgrp_match = rdtgrp; + break; + } + } + return rdtgrp_match; +} + +/** + * pseudo_lock_pm_req - A power management QoS request list entry + * @list: Entry within the @pm_reqs list for a pseudo-locked region + * @req: PM QoS request + */ +struct pseudo_lock_pm_req { + struct list_head list; + struct dev_pm_qos_request req; +}; + +static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) +{ + struct pseudo_lock_pm_req *pm_req, *next; + + list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { + dev_pm_qos_remove_request(&pm_req->req); + list_del(&pm_req->list); + kfree(pm_req); + } +} + +/** + * pseudo_lock_cstates_constrain - Restrict cores from entering C6 + * + * To prevent the cache from being affected by power management entering + * C6 has to be avoided. This is accomplished by requesting a latency + * requirement lower than lowest C6 exit latency of all supported + * platforms as found in the cpuidle state tables in the intel_idle driver. + * At this time it is possible to do so with a single latency requirement + * for all supported platforms. + * + * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, + * the ACPI latencies need to be considered while keeping in mind that C2 + * may be set to map to deeper sleep states. In this case the latency + * requirement needs to prevent entering C2 also. + */ +static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) +{ + struct pseudo_lock_pm_req *pm_req; + int cpu; + int ret; + + for_each_cpu(cpu, &plr->d->cpu_mask) { + pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); + if (!pm_req) { + rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); + ret = -ENOMEM; + goto out_err; + } + ret = dev_pm_qos_add_request(get_cpu_device(cpu), + &pm_req->req, + DEV_PM_QOS_RESUME_LATENCY, + 30); + if (ret < 0) { + rdt_last_cmd_printf("Failed to add latency req CPU%d\n", + cpu); + kfree(pm_req); + ret = -1; + goto out_err; + } + list_add(&pm_req->list, &plr->pm_reqs); + } + + return 0; + +out_err: + pseudo_lock_cstates_relax(plr); + return ret; +} + +/** + * pseudo_lock_region_clear - Reset pseudo-lock region data + * @plr: pseudo-lock region + * + * All content of the pseudo-locked region is reset - any memory allocated + * freed. + * + * Return: void + */ +static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) +{ + plr->size = 0; + plr->line_size = 0; + kfree(plr->kmem); + plr->kmem = NULL; + plr->r = NULL; + if (plr->d) + plr->d->plr = NULL; + plr->d = NULL; + plr->cbm = 0; + plr->debugfs_dir = NULL; +} + +/** + * pseudo_lock_region_init - Initialize pseudo-lock region information + * @plr: pseudo-lock region + * + * Called after user provided a schemata to be pseudo-locked. From the + * schemata the &struct pseudo_lock_region is on entry already initialized + * with the resource, domain, and capacity bitmask. Here the information + * required for pseudo-locking is deduced from this data and &struct + * pseudo_lock_region initialized further. This information includes: + * - size in bytes of the region to be pseudo-locked + * - cache line size to know the stride with which data needs to be accessed + * to be pseudo-locked + * - a cpu associated with the cache instance on which the pseudo-locking + * flow can be executed + * + * Return: 0 on success, <0 on failure. Descriptive error will be written + * to last_cmd_status buffer. + */ +static int pseudo_lock_region_init(struct pseudo_lock_region *plr) +{ + struct cpu_cacheinfo *ci; + int ret; + int i; + + /* Pick the first cpu we find that is associated with the cache. */ + plr->cpu = cpumask_first(&plr->d->cpu_mask); + + if (!cpu_online(plr->cpu)) { + rdt_last_cmd_printf("CPU %u associated with cache not online\n", + plr->cpu); + ret = -ENODEV; + goto out_region; + } + + ci = get_cpu_cacheinfo(plr->cpu); + + plr->size = rdtgroup_cbm_to_size(plr->r, plr->d, plr->cbm); + + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == plr->r->cache_level) { + plr->line_size = ci->info_list[i].coherency_line_size; + return 0; + } + } + + ret = -1; + rdt_last_cmd_puts("Unable to determine cache line size\n"); +out_region: + pseudo_lock_region_clear(plr); + return ret; +} + +/** + * pseudo_lock_init - Initialize a pseudo-lock region + * @rdtgrp: resource group to which new pseudo-locked region will belong + * + * A pseudo-locked region is associated with a resource group. When this + * association is created the pseudo-locked region is initialized. The + * details of the pseudo-locked region are not known at this time so only + * allocation is done and association established. + * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_init(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr; + + plr = kzalloc(sizeof(*plr), GFP_KERNEL); + if (!plr) + return -ENOMEM; + + init_waitqueue_head(&plr->lock_thread_wq); + INIT_LIST_HEAD(&plr->pm_reqs); + rdtgrp->plr = plr; + return 0; +} + +/** + * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked + * @plr: pseudo-lock region + * + * Initialize the details required to set up the pseudo-locked region and + * allocate the contiguous memory that will be pseudo-locked to the cache. + * + * Return: 0 on success, <0 on failure. Descriptive error will be written + * to last_cmd_status buffer. + */ +static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) +{ + int ret; + + ret = pseudo_lock_region_init(plr); + if (ret < 0) + return ret; + + /* + * We do not yet support contiguous regions larger than + * KMALLOC_MAX_SIZE. + */ + if (plr->size > KMALLOC_MAX_SIZE) { + rdt_last_cmd_puts("Requested region exceeds maximum size\n"); + ret = -E2BIG; + goto out_region; + } + + plr->kmem = kzalloc(plr->size, GFP_KERNEL); + if (!plr->kmem) { + rdt_last_cmd_puts("Unable to allocate memory\n"); + ret = -ENOMEM; + goto out_region; + } + + ret = 0; + goto out; +out_region: + pseudo_lock_region_clear(plr); +out: + return ret; +} + +/** + * pseudo_lock_free - Free a pseudo-locked region + * @rdtgrp: resource group to which pseudo-locked region belonged + * + * The pseudo-locked region's resources have already been released, or not + * yet created at this point. Now it can be freed and disassociated from the + * resource group. + * + * Return: void + */ +static void pseudo_lock_free(struct rdtgroup *rdtgrp) +{ + pseudo_lock_region_clear(rdtgrp->plr); + kfree(rdtgrp->plr); + rdtgrp->plr = NULL; +} + +/** + * pseudo_lock_fn - Load kernel memory into cache + * @_rdtgrp: resource group to which pseudo-lock region belongs + * + * This is the core pseudo-locking flow. + * + * First we ensure that the kernel memory cannot be found in the cache. + * Then, while taking care that there will be as little interference as + * possible, the memory to be loaded is accessed while core is running + * with class of service set to the bitmask of the pseudo-locked region. + * After this is complete no future CAT allocations will be allowed to + * overlap with this bitmask. + * + * Local register variables are utilized to ensure that the memory region + * to be locked is the only memory access made during the critical locking + * loop. + * + * Return: 0. Waiter on waitqueue will be woken on completion. + */ +static int pseudo_lock_fn(void *_rdtgrp) +{ + struct rdtgroup *rdtgrp = _rdtgrp; + struct pseudo_lock_region *plr = rdtgrp->plr; + u32 rmid_p, closid_p; + unsigned long i; + u64 saved_msr; +#ifdef CONFIG_KASAN + /* + * The registers used for local register variables are also used + * when KASAN is active. When KASAN is active we use a regular + * variable to ensure we always use a valid pointer, but the cost + * is that this variable will enter the cache through evicting the + * memory we are trying to lock into the cache. Thus expect lower + * pseudo-locking success rate when KASAN is active. + */ + unsigned int line_size; + unsigned int size; + void *mem_r; +#else + register unsigned int line_size asm("esi"); + register unsigned int size asm("edi"); + register void *mem_r asm(_ASM_BX); +#endif /* CONFIG_KASAN */ + + /* + * Make sure none of the allocated memory is cached. If it is we + * will get a cache hit in below loop from outside of pseudo-locked + * region. + * wbinvd (as opposed to clflush/clflushopt) is required to + * increase likelihood that allocated cache portion will be filled + * with associated memory. + */ + native_wbinvd(); + + /* + * Always called with interrupts enabled. By disabling interrupts + * ensure that we will not be preempted during this critical section. + */ + local_irq_disable(); + + /* + * Call wrmsr and rdmsr as directly as possible to avoid tracing + * clobbering local register variables or affecting cache accesses. + * + * Disable the hardware prefetcher so that when the end of the memory + * being pseudo-locked is reached the hardware will not read beyond + * the buffer and evict pseudo-locked memory read earlier from the + * cache. + */ + saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL); + __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + closid_p = this_cpu_read(pqr_state.cur_closid); + rmid_p = this_cpu_read(pqr_state.cur_rmid); + mem_r = plr->kmem; + size = plr->size; + line_size = plr->line_size; + /* + * Critical section begin: start by writing the closid associated + * with the capacity bitmask of the cache region being + * pseudo-locked followed by reading of kernel memory to load it + * into the cache. + */ + __wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); + /* + * Cache was flushed earlier. Now access kernel memory to read it + * into cache region associated with just activated plr->closid. + * Loop over data twice: + * - In first loop the cache region is shared with the page walker + * as it populates the paging structure caches (including TLB). + * - In the second loop the paging structure caches are used and + * cache region is populated with the memory being referenced. + */ + for (i = 0; i < size; i += PAGE_SIZE) { + /* + * Add a barrier to prevent speculative execution of this + * loop reading beyond the end of the buffer. + */ + rmb(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + for (i = 0; i < size; i += line_size) { + /* + * Add a barrier to prevent speculative execution of this + * loop reading beyond the end of the buffer. + */ + rmb(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + /* + * Critical section end: restore closid with capacity bitmask that + * does not overlap with pseudo-locked region. + */ + __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p); + + /* Re-enable the hardware prefetcher(s) */ + wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); + local_irq_enable(); + + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +/** + * rdtgroup_monitor_in_progress - Test if monitoring in progress + * @r: resource group being queried + * + * Return: 1 if monitor groups have been created for this resource + * group, 0 otherwise. + */ +static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) +{ + return !list_empty(&rdtgrp->mon.crdtgrp_list); +} + +/** + * rdtgroup_locksetup_user_restrict - Restrict user access to group + * @rdtgrp: resource group needing access restricted + * + * A resource group used for cache pseudo-locking cannot have cpus or tasks + * assigned to it. This is communicated to the user by restricting access + * to all the files that can be used to make such changes. + * + * Permissions restored with rdtgroup_locksetup_user_restore() + * + * Return: 0 on success, <0 on failure. If a failure occurs during the + * restriction of access an attempt will be made to restore permissions but + * the state of the mode of these files will be uncertain when a failure + * occurs. + */ +static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) +{ + int ret; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); + if (ret) + return ret; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); + if (ret) + goto err_tasks; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); + if (ret) + goto err_cpus; + + if (rdt_mon_capable) { + ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); + if (ret) + goto err_cpus_list; + } + + ret = 0; + goto out; + +err_cpus_list: + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); +err_cpus: + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); +err_tasks: + rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); +out: + return ret; +} + +/** + * rdtgroup_locksetup_user_restore - Restore user access to group + * @rdtgrp: resource group needing access restored + * + * Restore all file access previously removed using + * rdtgroup_locksetup_user_restrict() + * + * Return: 0 on success, <0 on failure. If a failure occurs during the + * restoration of access an attempt will be made to restrict permissions + * again but the state of the mode of these files will be uncertain when + * a failure occurs. + */ +static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) +{ + int ret; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); + if (ret) + return ret; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); + if (ret) + goto err_tasks; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); + if (ret) + goto err_cpus; + + if (rdt_mon_capable) { + ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); + if (ret) + goto err_cpus_list; + } + + ret = 0; + goto out; + +err_cpus_list: + rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); +err_cpus: + rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); +err_tasks: + rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); +out: + return ret; +} + +/** + * rdtgroup_locksetup_enter - Resource group enters locksetup mode + * @rdtgrp: resource group requested to enter locksetup mode + * + * A resource group enters locksetup mode to reflect that it would be used + * to represent a pseudo-locked region and is in the process of being set + * up to do so. A resource group used for a pseudo-locked region would + * lose the closid associated with it so we cannot allow it to have any + * tasks or cpus assigned nor permit tasks or cpus to be assigned in the + * future. Monitoring of a pseudo-locked region is not allowed either. + * + * The above and more restrictions on a pseudo-locked region are checked + * for and enforced before the resource group enters the locksetup mode. + * + * Returns: 0 if the resource group successfully entered locksetup mode, <0 + * on failure. On failure the last_cmd_status buffer is updated with text to + * communicate details of failure to the user. + */ +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) +{ + int ret; + + /* + * The default resource group can neither be removed nor lose the + * default closid associated with it. + */ + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); + return -EINVAL; + } + + /* + * Cache Pseudo-locking not supported when CDP is enabled. + * + * Some things to consider if you would like to enable this + * support (using L3 CDP as example): + * - When CDP is enabled two separate resources are exposed, + * L3DATA and L3CODE, but they are actually on the same cache. + * The implication for pseudo-locking is that if a + * pseudo-locked region is created on a domain of one + * resource (eg. L3CODE), then a pseudo-locked region cannot + * be created on that same domain of the other resource + * (eg. L3DATA). This is because the creation of a + * pseudo-locked region involves a call to wbinvd that will + * affect all cache allocations on particular domain. + * - Considering the previous, it may be possible to only + * expose one of the CDP resources to pseudo-locking and + * hide the other. For example, we could consider to only + * expose L3DATA and since the L3 cache is unified it is + * still possible to place instructions there are execute it. + * - If only one region is exposed to pseudo-locking we should + * still keep in mind that availability of a portion of cache + * for pseudo-locking should take into account both resources. + * Similarly, if a pseudo-locked region is created in one + * resource, the portion of cache used by it should be made + * unavailable to all future allocations from both resources. + */ + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled || + rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) { + rdt_last_cmd_puts("CDP enabled\n"); + return -EINVAL; + } + + /* + * Not knowing the bits to disable prefetching implies that this + * platform does not support Cache Pseudo-Locking. + */ + prefetch_disable_bits = get_prefetch_disable_bits(); + if (prefetch_disable_bits == 0) { + rdt_last_cmd_puts("Pseudo-locking not supported\n"); + return -EINVAL; + } + + if (rdtgroup_monitor_in_progress(rdtgrp)) { + rdt_last_cmd_puts("Monitoring in progress\n"); + return -EINVAL; + } + + if (rdtgroup_tasks_assigned(rdtgrp)) { + rdt_last_cmd_puts("Tasks assigned to resource group\n"); + return -EINVAL; + } + + if (!cpumask_empty(&rdtgrp->cpu_mask)) { + rdt_last_cmd_puts("CPUs assigned to resource group\n"); + return -EINVAL; + } + + if (rdtgroup_locksetup_user_restrict(rdtgrp)) { + rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); + return -EIO; + } + + ret = pseudo_lock_init(rdtgrp); + if (ret) { + rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); + goto out_release; + } + + /* + * If this system is capable of monitoring a rmid would have been + * allocated when the control group was created. This is not needed + * anymore when this group would be used for pseudo-locking. This + * is safe to call on platforms not capable of monitoring. + */ + free_rmid(rdtgrp->mon.rmid); + + ret = 0; + goto out; + +out_release: + rdtgroup_locksetup_user_restore(rdtgrp); +out: + return ret; +} + +/** + * rdtgroup_locksetup_exit - resource group exist locksetup mode + * @rdtgrp: resource group + * + * When a resource group exits locksetup mode the earlier restrictions are + * lifted. + * + * Return: 0 on success, <0 on failure + */ +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) +{ + int ret; + + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) { + rdt_last_cmd_puts("Out of RMIDs\n"); + return ret; + } + rdtgrp->mon.rmid = ret; + } + + ret = rdtgroup_locksetup_user_restore(rdtgrp); + if (ret) { + free_rmid(rdtgrp->mon.rmid); + return ret; + } + + pseudo_lock_free(rdtgrp); + return 0; +} + +/** + * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked + * @d: RDT domain + * @cbm: CBM to test + * + * @d represents a cache instance and @cbm a capacity bitmask that is + * considered for it. Determine if @cbm overlaps with any existing + * pseudo-locked region on @d. + * + * @cbm is unsigned long, even if only 32 bits are used, to make the + * bitmap functions work correctly. + * + * Return: true if @cbm overlaps with pseudo-locked region on @d, false + * otherwise. + */ +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm) +{ + unsigned int cbm_len; + unsigned long cbm_b; + + if (d->plr) { + cbm_len = d->plr->r->cache.cbm_len; + cbm_b = d->plr->cbm; + if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) + return true; + } + return false; +} + +/** + * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy + * @d: RDT domain under test + * + * The setup of a pseudo-locked region affects all cache instances within + * the hierarchy of the region. It is thus essential to know if any + * pseudo-locked regions exist within a cache hierarchy to prevent any + * attempts to create new pseudo-locked regions in the same hierarchy. + * + * Return: true if a pseudo-locked region exists in the hierarchy of @d or + * if it is not possible to test due to memory allocation issue, + * false otherwise. + */ +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) +{ + cpumask_var_t cpu_with_psl; + struct rdt_resource *r; + struct rdt_domain *d_i; + bool ret = false; + + if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) + return true; + + /* + * First determine which cpus have pseudo-locked regions + * associated with them. + */ + for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(d_i, &r->domains, list) { + if (d_i->plr) + cpumask_or(cpu_with_psl, cpu_with_psl, + &d_i->cpu_mask); + } + } + + /* + * Next test if new pseudo-locked region would intersect with + * existing region. + */ + if (cpumask_intersects(&d->cpu_mask, cpu_with_psl)) + ret = true; + + free_cpumask_var(cpu_with_psl); + return ret; +} + +/** + * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory + * @_plr: pseudo-lock region to measure + * + * There is no deterministic way to test if a memory region is cached. One + * way is to measure how long it takes to read the memory, the speed of + * access is a good way to learn how close to the cpu the data was. Even + * more, if the prefetcher is disabled and the memory is read at a stride + * of half the cache line, then a cache miss will be easy to spot since the + * read of the first half would be significantly slower than the read of + * the second half. + * + * Return: 0. Waiter on waitqueue will be woken on completion. + */ +static int measure_cycles_lat_fn(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + u32 saved_low, saved_high; + unsigned long i; + u64 start, end; + void *mem_r; + + local_irq_disable(); + /* + * Disable hardware prefetchers. + */ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); + wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + mem_r = READ_ONCE(plr->kmem); + /* + * Dummy execute of the time measurement to load the needed + * instructions into the L1 instruction cache. + */ + start = rdtsc_ordered(); + for (i = 0; i < plr->size; i += 32) { + start = rdtsc_ordered(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + end = rdtsc_ordered(); + trace_pseudo_lock_mem_latency((u32)(end - start)); + } + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); + local_irq_enable(); + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +/* + * Create a perf_event_attr for the hit and miss perf events that will + * be used during the performance measurement. A perf_event maintains + * a pointer to its perf_event_attr so a unique attribute structure is + * created for each perf_event. + * + * The actual configuration of the event is set right before use in order + * to use the X86_CONFIG macro. + */ +static struct perf_event_attr perf_miss_attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + .exclude_user = 1, +}; + +static struct perf_event_attr perf_hit_attr = { + .type = PERF_TYPE_RAW, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 0, + .exclude_user = 1, +}; + +struct residency_counts { + u64 miss_before, hits_before; + u64 miss_after, hits_after; +}; + +static int measure_residency_fn(struct perf_event_attr *miss_attr, + struct perf_event_attr *hit_attr, + struct pseudo_lock_region *plr, + struct residency_counts *counts) +{ + u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0; + struct perf_event *miss_event, *hit_event; + int hit_pmcnum, miss_pmcnum; + u32 saved_low, saved_high; + unsigned int line_size; + unsigned int size; + unsigned long i; + void *mem_r; + u64 tmp; + + miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu, + NULL, NULL, NULL); + if (IS_ERR(miss_event)) + goto out; + + hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu, + NULL, NULL, NULL); + if (IS_ERR(hit_event)) + goto out_miss; + + local_irq_disable(); + /* + * Check any possible error state of events used by performing + * one local read. + */ + if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) { + local_irq_enable(); + goto out_hit; + } + if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) { + local_irq_enable(); + goto out_hit; + } + + /* + * Disable hardware prefetchers. + */ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); + wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + + /* Initialize rest of local variables */ + /* + * Performance event has been validated right before this with + * interrupts disabled - it is thus safe to read the counter index. + */ + miss_pmcnum = x86_perf_rdpmc_index(miss_event); + hit_pmcnum = x86_perf_rdpmc_index(hit_event); + line_size = READ_ONCE(plr->line_size); + mem_r = READ_ONCE(plr->kmem); + size = READ_ONCE(plr->size); + + /* + * Read counter variables twice - first to load the instructions + * used in L1 cache, second to capture accurate value that does not + * include cache misses incurred because of instruction loads. + */ + rdpmcl(hit_pmcnum, hits_before); + rdpmcl(miss_pmcnum, miss_before); + /* + * From SDM: Performing back-to-back fast reads are not guaranteed + * to be monotonic. + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + rdpmcl(hit_pmcnum, hits_before); + rdpmcl(miss_pmcnum, miss_before); + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + for (i = 0; i < size; i += line_size) { + /* + * Add a barrier to prevent speculative execution of this + * loop reading beyond the end of the buffer. + */ + rmb(); + asm volatile("mov (%0,%1,1), %%eax\n\t" + : + : "r" (mem_r), "r" (i) + : "%eax", "memory"); + } + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + rdpmcl(hit_pmcnum, hits_after); + rdpmcl(miss_pmcnum, miss_after); + /* + * Use LFENCE to ensure all previous instructions are retired + * before proceeding. + */ + rmb(); + /* Re-enable hardware prefetchers */ + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); + local_irq_enable(); +out_hit: + perf_event_release_kernel(hit_event); +out_miss: + perf_event_release_kernel(miss_event); +out: + /* + * All counts will be zero on failure. + */ + counts->miss_before = miss_before; + counts->hits_before = hits_before; + counts->miss_after = miss_after; + counts->hits_after = hits_after; + return 0; +} + +static int measure_l2_residency(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + struct residency_counts counts = {0}; + + /* + * Non-architectural event for the Goldmont Microarchitecture + * from Intel x86 Architecture Software Developer Manual (SDM): + * MEM_LOAD_UOPS_RETIRED D1H (event number) + * Umask values: + * L2_HIT 02H + * L2_MISS 10H + */ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + perf_miss_attr.config = X86_CONFIG(.event = 0xd1, + .umask = 0x10); + perf_hit_attr.config = X86_CONFIG(.event = 0xd1, + .umask = 0x2); + break; + default: + goto out; + } + + measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); + /* + * If a failure prevented the measurements from succeeding + * tracepoints will still be written and all counts will be zero. + */ + trace_pseudo_lock_l2(counts.hits_after - counts.hits_before, + counts.miss_after - counts.miss_before); +out: + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +static int measure_l3_residency(void *_plr) +{ + struct pseudo_lock_region *plr = _plr; + struct residency_counts counts = {0}; + + /* + * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event + * has two "no fix" errata associated with it: BDM35 and BDM100. On + * this platform the following events are used instead: + * LONGEST_LAT_CACHE 2EH (Documented in SDM) + * REFERENCE 4FH + * MISS 41H + */ + + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_BROADWELL_X: + /* On BDW the hit event counts references, not hits */ + perf_hit_attr.config = X86_CONFIG(.event = 0x2e, + .umask = 0x4f); + perf_miss_attr.config = X86_CONFIG(.event = 0x2e, + .umask = 0x41); + break; + default: + goto out; + } + + measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); + /* + * If a failure prevented the measurements from succeeding + * tracepoints will still be written and all counts will be zero. + */ + + counts.miss_after -= counts.miss_before; + if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) { + /* + * On BDW references and misses are counted, need to adjust. + * Sometimes the "hits" counter is a bit more than the + * references, for example, x references but x + 1 hits. + * To not report invalid hit values in this case we treat + * that as misses equal to references. + */ + /* First compute the number of cache references measured */ + counts.hits_after -= counts.hits_before; + /* Next convert references to cache hits */ + counts.hits_after -= min(counts.miss_after, counts.hits_after); + } else { + counts.hits_after -= counts.hits_before; + } + + trace_pseudo_lock_l3(counts.hits_after, counts.miss_after); +out: + plr->thread_done = 1; + wake_up_interruptible(&plr->lock_thread_wq); + return 0; +} + +/** + * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region + * + * The measurement of latency to access a pseudo-locked region should be + * done from a cpu that is associated with that pseudo-locked region. + * Determine which cpu is associated with this region and start a thread on + * that cpu to perform the measurement, wait for that thread to complete. + * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + struct task_struct *thread; + unsigned int cpu; + int ret = -1; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + if (rdtgrp->flags & RDT_DELETED) { + ret = -ENODEV; + goto out; + } + + if (!plr->d) { + ret = -ENODEV; + goto out; + } + + plr->thread_done = 0; + cpu = cpumask_first(&plr->d->cpu_mask); + if (!cpu_online(cpu)) { + ret = -ENODEV; + goto out; + } + + plr->cpu = cpu; + + if (sel == 1) + thread = kthread_create_on_node(measure_cycles_lat_fn, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", + cpu); + else if (sel == 2) + thread = kthread_create_on_node(measure_l2_residency, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", + cpu); + else if (sel == 3) + thread = kthread_create_on_node(measure_l3_residency, plr, + cpu_to_node(cpu), + "pseudo_lock_measure/%u", + cpu); + else + goto out; + + if (IS_ERR(thread)) { + ret = PTR_ERR(thread); + goto out; + } + kthread_bind(thread, cpu); + wake_up_process(thread); + + ret = wait_event_interruptible(plr->lock_thread_wq, + plr->thread_done == 1); + if (ret < 0) + goto out; + + ret = 0; + +out: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return ret; +} + +static ssize_t pseudo_lock_measure_trigger(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct rdtgroup *rdtgrp = file->private_data; + size_t buf_size; + char buf[32]; + int ret; + int sel; + + buf_size = min(count, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + ret = kstrtoint(buf, 10, &sel); + if (ret == 0) { + if (sel != 1 && sel != 2 && sel != 3) + return -EINVAL; + ret = debugfs_file_get(file->f_path.dentry); + if (ret) + return ret; + ret = pseudo_lock_measure_cycles(rdtgrp, sel); + if (ret == 0) + ret = count; + debugfs_file_put(file->f_path.dentry); + } + + return ret; +} + +static const struct file_operations pseudo_measure_fops = { + .write = pseudo_lock_measure_trigger, + .open = simple_open, + .llseek = default_llseek, +}; + +/** + * rdtgroup_pseudo_lock_create - Create a pseudo-locked region + * @rdtgrp: resource group to which pseudo-lock region belongs + * + * Called when a resource group in the pseudo-locksetup mode receives a + * valid schemata that should be pseudo-locked. Since the resource group is + * in pseudo-locksetup mode the &struct pseudo_lock_region has already been + * allocated and initialized with the essential information. If a failure + * occurs the resource group remains in the pseudo-locksetup mode with the + * &struct pseudo_lock_region associated with it, but cleared from all + * information and ready for the user to re-attempt pseudo-locking by + * writing the schemata again. + * + * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 + * on failure. Descriptive error will be written to last_cmd_status buffer. + */ +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + struct task_struct *thread; + unsigned int new_minor; + struct device *dev; + int ret; + + ret = pseudo_lock_region_alloc(plr); + if (ret < 0) + return ret; + + ret = pseudo_lock_cstates_constrain(plr); + if (ret < 0) { + ret = -EINVAL; + goto out_region; + } + + plr->thread_done = 0; + + thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp, + cpu_to_node(plr->cpu), + "pseudo_lock/%u", plr->cpu); + if (IS_ERR(thread)) { + ret = PTR_ERR(thread); + rdt_last_cmd_printf("Locking thread returned error %d\n", ret); + goto out_cstates; + } + + kthread_bind(thread, plr->cpu); + wake_up_process(thread); + + ret = wait_event_interruptible(plr->lock_thread_wq, + plr->thread_done == 1); + if (ret < 0) { + /* + * If the thread does not get on the CPU for whatever + * reason and the process which sets up the region is + * interrupted then this will leave the thread in runnable + * state and once it gets on the CPU it will derefence + * the cleared, but not freed, plr struct resulting in an + * empty pseudo-locking loop. + */ + rdt_last_cmd_puts("Locking thread interrupted\n"); + goto out_cstates; + } + + ret = pseudo_lock_minor_get(&new_minor); + if (ret < 0) { + rdt_last_cmd_puts("Unable to obtain a new minor number\n"); + goto out_cstates; + } + + /* + * Unlock access but do not release the reference. The + * pseudo-locked region will still be here on return. + * + * The mutex has to be released temporarily to avoid a potential + * deadlock with the mm->mmap_lock which is obtained in the + * device_create() and debugfs_create_dir() callpath below as well as + * before the mmap() callback is called. + */ + mutex_unlock(&rdtgroup_mutex); + + if (!IS_ERR_OR_NULL(debugfs_resctrl)) { + plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name, + debugfs_resctrl); + if (!IS_ERR_OR_NULL(plr->debugfs_dir)) + debugfs_create_file("pseudo_lock_measure", 0200, + plr->debugfs_dir, rdtgrp, + &pseudo_measure_fops); + } + + dev = device_create(pseudo_lock_class, NULL, + MKDEV(pseudo_lock_major, new_minor), + rdtgrp, "%s", rdtgrp->kn->name); + + mutex_lock(&rdtgroup_mutex); + + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + rdt_last_cmd_printf("Failed to create character device: %d\n", + ret); + goto out_debugfs; + } + + /* We released the mutex - check if group was removed while we did so */ + if (rdtgrp->flags & RDT_DELETED) { + ret = -ENODEV; + goto out_device; + } + + plr->minor = new_minor; + + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; + closid_free(rdtgrp->closid); + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); + + ret = 0; + goto out; + +out_device: + device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); +out_debugfs: + debugfs_remove_recursive(plr->debugfs_dir); + pseudo_lock_minor_release(new_minor); +out_cstates: + pseudo_lock_cstates_relax(plr); +out_region: + pseudo_lock_region_clear(plr); +out: + return ret; +} + +/** + * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region + * @rdtgrp: resource group to which the pseudo-locked region belongs + * + * The removal of a pseudo-locked region can be initiated when the resource + * group is removed from user space via a "rmdir" from userspace or the + * unmount of the resctrl filesystem. On removal the resource group does + * not go back to pseudo-locksetup mode before it is removed, instead it is + * removed directly. There is thus assymmetry with the creation where the + * &struct pseudo_lock_region is removed here while it was not created in + * rdtgroup_pseudo_lock_create(). + * + * Return: void + */ +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * Default group cannot be a pseudo-locked region so we can + * free closid here. + */ + closid_free(rdtgrp->closid); + goto free; + } + + pseudo_lock_cstates_relax(plr); + debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); + device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); + pseudo_lock_minor_release(plr->minor); + +free: + pseudo_lock_free(rdtgrp); +} + +static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = region_find_by_minor(iminor(inode)); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + filp->private_data = rdtgrp; + atomic_inc(&rdtgrp->waitcount); + /* Perform a non-seekable open - llseek is not supported */ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + filp->private_data = NULL; + atomic_dec(&rdtgrp->waitcount); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static int pseudo_lock_dev_mremap(struct vm_area_struct *area) +{ + /* Not supported */ + return -EINVAL; +} + +static const struct vm_operations_struct pseudo_mmap_ops = { + .mremap = pseudo_lock_dev_mremap, +}; + +static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + unsigned long vsize = vma->vm_end - vma->vm_start; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + struct pseudo_lock_region *plr; + struct rdtgroup *rdtgrp; + unsigned long physical; + unsigned long psize; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + plr = rdtgrp->plr; + + if (!plr->d) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + /* + * Task is required to run with affinity to the cpus associated + * with the pseudo-locked region. If this is not the case the task + * may be scheduled elsewhere and invalidate entries in the + * pseudo-locked region. + */ + if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) { + mutex_unlock(&rdtgroup_mutex); + return -EINVAL; + } + + physical = __pa(plr->kmem) >> PAGE_SHIFT; + psize = plr->size - off; + + if (off > plr->size) { + mutex_unlock(&rdtgroup_mutex); + return -ENOSPC; + } + + /* + * Ensure changes are carried directly to the memory being mapped, + * do not allow copy-on-write mapping. + */ + if (!(vma->vm_flags & VM_SHARED)) { + mutex_unlock(&rdtgroup_mutex); + return -EINVAL; + } + + if (vsize > psize) { + mutex_unlock(&rdtgroup_mutex); + return -ENOSPC; + } + + memset(plr->kmem + off, 0, vsize); + + if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, + vsize, vma->vm_page_prot)) { + mutex_unlock(&rdtgroup_mutex); + return -EAGAIN; + } + vma->vm_ops = &pseudo_mmap_ops; + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static const struct file_operations pseudo_lock_dev_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = NULL, + .write = NULL, + .open = pseudo_lock_dev_open, + .release = pseudo_lock_dev_release, + .mmap = pseudo_lock_dev_mmap, +}; + +static char *pseudo_lock_devnode(struct device *dev, umode_t *mode) +{ + struct rdtgroup *rdtgrp; + + rdtgrp = dev_get_drvdata(dev); + if (mode) + *mode = 0600; + return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); +} + +int rdt_pseudo_lock_init(void) +{ + int ret; + + ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); + if (ret < 0) + return ret; + + pseudo_lock_major = ret; + + pseudo_lock_class = class_create(THIS_MODULE, "pseudo_lock"); + if (IS_ERR(pseudo_lock_class)) { + ret = PTR_ERR(pseudo_lock_class); + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); + return ret; + } + + pseudo_lock_class->devnode = pseudo_lock_devnode; + return 0; +} + +void rdt_pseudo_lock_release(void) +{ + class_destroy(pseudo_lock_class); + pseudo_lock_class = NULL; + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); + pseudo_lock_major = 0; +} diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h new file mode 100644 index 000000000..428ebbd42 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM resctrl + +#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_PSEUDO_LOCK_H + +#include + +TRACE_EVENT(pseudo_lock_mem_latency, + TP_PROTO(u32 latency), + TP_ARGS(latency), + TP_STRUCT__entry(__field(u32, latency)), + TP_fast_assign(__entry->latency = latency), + TP_printk("latency=%u", __entry->latency) + ); + +TRACE_EVENT(pseudo_lock_l2, + TP_PROTO(u64 l2_hits, u64 l2_miss), + TP_ARGS(l2_hits, l2_miss), + TP_STRUCT__entry(__field(u64, l2_hits) + __field(u64, l2_miss)), + TP_fast_assign(__entry->l2_hits = l2_hits; + __entry->l2_miss = l2_miss;), + TP_printk("hits=%llu miss=%llu", + __entry->l2_hits, __entry->l2_miss)); + +TRACE_EVENT(pseudo_lock_l3, + TP_PROTO(u64 l3_hits, u64 l3_miss), + TP_ARGS(l3_hits, l3_miss), + TP_STRUCT__entry(__field(u64, l3_hits) + __field(u64, l3_miss)), + TP_fast_assign(__entry->l3_hits = l3_hits; + __entry->l3_miss = l3_miss;), + TP_printk("hits=%llu miss=%llu", + __entry->l3_hits, __entry->l3_miss)); + +#endif /* _TRACE_PSEUDO_LOCK_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE pseudo_lock_event +#include diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c new file mode 100644 index 000000000..1e73b6fae --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -0,0 +1,3263 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * User interface for Resource Alloction in Resource Director Technology(RDT) + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Fenghua Yu + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "internal.h" + +DEFINE_STATIC_KEY_FALSE(rdt_enable_key); +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +static struct kernfs_root *rdt_root; +struct rdtgroup rdtgroup_default; +LIST_HEAD(rdt_all_groups); + +/* Kernel fs node for "info" directory under root */ +static struct kernfs_node *kn_info; + +/* Kernel fs node for "mon_groups" directory under root */ +static struct kernfs_node *kn_mongrp; + +/* Kernel fs node for "mon_data" directory under root */ +static struct kernfs_node *kn_mondata; + +static struct seq_buf last_cmd_status; +static char last_cmd_status_buf[512]; + +struct dentry *debugfs_resctrl; + +void rdt_last_cmd_clear(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_clear(&last_cmd_status); +} + +void rdt_last_cmd_puts(const char *s) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_puts(&last_cmd_status, s); +} + +void rdt_last_cmd_printf(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_vprintf(&last_cmd_status, fmt, ap); + va_end(ap); +} + +/* + * Trivial allocator for CLOSIDs. Since h/w only supports a small number, + * we can keep a bitmap of free CLOSIDs in a single integer. + * + * Using a global CLOSID across all resources has some advantages and + * some drawbacks: + * + We can simply set "current->closid" to assign a task to a resource + * group. + * + Context switch code can avoid extra memory references deciding which + * CLOSID to load into the PQR_ASSOC MSR + * - We give up some options in configuring resource groups across multi-socket + * systems. + * - Our choices on how to configure each resource become progressively more + * limited as the number of resources grows. + */ +static int closid_free_map; +static int closid_free_map_len; + +int closids_supported(void) +{ + return closid_free_map_len; +} + +static void closid_init(void) +{ + struct rdt_resource *r; + int rdt_min_closid = 32; + + /* Compute rdt_min_closid across all resources */ + for_each_alloc_enabled_rdt_resource(r) + rdt_min_closid = min(rdt_min_closid, r->num_closid); + + closid_free_map = BIT_MASK(rdt_min_closid) - 1; + + /* CLOSID 0 is always reserved for the default group */ + closid_free_map &= ~1; + closid_free_map_len = rdt_min_closid; +} + +static int closid_alloc(void) +{ + u32 closid = ffs(closid_free_map); + + if (closid == 0) + return -ENOSPC; + closid--; + closid_free_map &= ~(1 << closid); + + return closid; +} + +void closid_free(int closid) +{ + closid_free_map |= 1 << closid; +} + +/** + * closid_allocated - test if provided closid is in use + * @closid: closid to be tested + * + * Return: true if @closid is currently associated with a resource group, + * false if @closid is free + */ +static bool closid_allocated(unsigned int closid) +{ + return (closid_free_map & (1 << closid)) == 0; +} + +/** + * rdtgroup_mode_by_closid - Return mode of resource group with closid + * @closid: closid if the resource group + * + * Each resource group is associated with a @closid. Here the mode + * of a resource group can be queried by searching for it using its closid. + * + * Return: mode as &enum rdtgrp_mode of resource group with closid @closid + */ +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) +{ + struct rdtgroup *rdtgrp; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->closid == closid) + return rdtgrp->mode; + } + + return RDT_NUM_MODES; +} + +static const char * const rdt_mode_str[] = { + [RDT_MODE_SHAREABLE] = "shareable", + [RDT_MODE_EXCLUSIVE] = "exclusive", + [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", + [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", +}; + +/** + * rdtgroup_mode_str - Return the string representation of mode + * @mode: the resource group mode as &enum rdtgroup_mode + * + * Return: string representation of valid mode, "unknown" otherwise + */ +static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) +{ + if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) + return "unknown"; + + return rdt_mode_str[mode]; +} + +/* set uid and gid of rdtgroup dirs and files to that of the creator */ +static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; + + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; + + return kernfs_setattr(kn, &iattr); +} + +static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) +{ + struct kernfs_node *kn; + int ret; + + kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + 0, rft->kf_ops, rft, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return 0; +} + +static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + struct rftype *rft = of->kn->priv; + + if (rft->seq_show) + return rft->seq_show(of, m, arg); + return 0; +} + +static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rftype *rft = of->kn->priv; + + if (rft->write) + return rft->write(of, buf, nbytes, off); + + return -EINVAL; +} + +static struct kernfs_ops rdtgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = rdtgroup_file_write, + .seq_show = rdtgroup_seqfile_show, +}; + +static struct kernfs_ops kf_mondata_ops = { + .atomic_write_len = PAGE_SIZE, + .seq_show = rdtgroup_mondata_show, +}; + +static bool is_cpu_list(struct kernfs_open_file *of) +{ + struct rftype *rft = of->kn->priv; + + return rft->flags & RFTYPE_FLAGS_CPUS_LIST; +} + +static int rdtgroup_cpus_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct cpumask *mask; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + mask = &rdtgrp->plr->d->cpu_mask; + seq_printf(s, is_cpu_list(of) ? + "%*pbl\n" : "%*pb\n", + cpumask_pr_args(mask)); + } + } else { + seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", + cpumask_pr_args(&rdtgrp->cpu_mask)); + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* + * This is safe against resctrl_sched_in() called from __switch_to() + * because __switch_to() is executed with interrupts disabled. A local call + * from update_closid_rmid() is proteced against __switch_to() because + * preemption is disabled. + */ +static void update_cpu_closid_rmid(void *info) +{ + struct rdtgroup *r = info; + + if (r) { + this_cpu_write(pqr_state.default_closid, r->closid); + this_cpu_write(pqr_state.default_rmid, r->mon.rmid); + } + + /* + * We cannot unconditionally write the MSR because the current + * executing task might have its own closid selected. Just reuse + * the context switch code. + */ + resctrl_sched_in(current); +} + +/* + * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, + * + * Per task closids/rmids must have been set up before calling this function. + */ +static void +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) +{ + int cpu = get_cpu(); + + if (cpumask_test_cpu(cpu, cpu_mask)) + update_cpu_closid_rmid(r); + smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1); + put_cpu(); +} + +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; + struct list_head *head; + + /* Check whether cpus belong to parent ctrl group */ + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); + return -EINVAL; + } + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Give any dropped cpus to parent rdtgroup */ + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); + update_closid_rmid(tmpmask, prgrp); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu rmid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + if (crgrp == rdtgrp) + continue; + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, + tmpmask); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) +{ + struct rdtgroup *crgrp; + + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); + /* update the child mon group masks as well*/ + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); +} + +static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) +{ + struct rdtgroup *r, *crgrp; + struct list_head *head; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("Can't drop CPUs from default group\n"); + return -EINVAL; + } + + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + update_closid_rmid(tmpmask, &rdtgroup_default); + } + + /* + * If we added cpus, remove them from previous group and + * the prev group's child groups that owned them + * and update per-cpu closid/rmid. + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); + if (cpumask_weight(tmpmask1)) + cpumask_rdtgrp_clear(r, tmpmask1); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + /* + * Clear child mon group masks since there is a new parent mask + * now and update the rmid for the cpus the child lost. + */ + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); + update_closid_rmid(tmpmask, rdtgrp); + cpumask_clear(&crgrp->cpu_mask); + } + + return 0; +} + +static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + cpumask_var_t tmpmask, newmask, tmpmask1; + struct rdtgroup *rdtgrp; + int ret; + + if (!buf) + return -EINVAL; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + return -ENOMEM; + } + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + return -ENOMEM; + } + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto unlock; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto unlock; + } + + if (is_cpu_list(of)) + ret = cpulist_parse(buf, newmask); + else + ret = cpumask_parse(buf, newmask); + + if (ret) { + rdt_last_cmd_puts("Bad CPU list/mask\n"); + goto unlock; + } + + /* check that user didn't specify any offline cpus */ + cpumask_andnot(tmpmask, newmask, cpu_online_mask); + if (cpumask_weight(tmpmask)) { + ret = -EINVAL; + rdt_last_cmd_puts("Can only assign online CPUs\n"); + goto unlock; + } + + if (rdtgrp->type == RDTCTRL_GROUP) + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); + else if (rdtgrp->type == RDTMON_GROUP) + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); + else + ret = -EINVAL; + +unlock: + rdtgroup_kn_unlock(of->kn); + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + free_cpumask_var(tmpmask1); + + return ret ?: nbytes; +} + +/** + * rdtgroup_remove - the helper to remove resource group safely + * @rdtgrp: resource group to remove + * + * On resource group creation via a mkdir, an extra kernfs_node reference is + * taken to ensure that the rdtgroup structure remains accessible for the + * rdtgroup_kn_unlock() calls where it is removed. + * + * Drop the extra reference here, then free the rdtgroup structure. + * + * Return: void + */ +static void rdtgroup_remove(struct rdtgroup *rdtgrp) +{ + kernfs_put(rdtgrp->kn); + kfree(rdtgrp); +} + +static void _update_task_closid_rmid(void *task) +{ + /* + * If the task is still current on this CPU, update PQR_ASSOC MSR. + * Otherwise, the MSR is updated when the task is scheduled in. + */ + if (task == current) + resctrl_sched_in(task); +} + +static void update_task_closid_rmid(struct task_struct *t) +{ + if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) + smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); + else + _update_task_closid_rmid(t); +} + +static int __rdtgroup_move_task(struct task_struct *tsk, + struct rdtgroup *rdtgrp) +{ + /* If the task is already in rdtgrp, no need to move the task. */ + if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid && + tsk->rmid == rdtgrp->mon.rmid) || + (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid && + tsk->closid == rdtgrp->mon.parent->closid)) + return 0; + + /* + * Set the task's closid/rmid before the PQR_ASSOC MSR can be + * updated by them. + * + * For ctrl_mon groups, move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. + */ + + if (rdtgrp->type == RDTCTRL_GROUP) { + WRITE_ONCE(tsk->closid, rdtgrp->closid); + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) { + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); + } else { + rdt_last_cmd_puts("Can't move task to different control group\n"); + return -EINVAL; + } + } + + /* + * Ensure the task's closid and rmid are written before determining if + * the task is current that will decide if it will be interrupted. + * This pairs with the full barrier between the rq->curr update and + * resctrl_sched_in() during context switch. + */ + smp_mb(); + + /* + * By now, the task's closid and rmid are set. If the task is current + * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource + * group go into effect. If the task is not current, the MSR will be + * updated when the task is scheduled in. + */ + update_task_closid_rmid(tsk); + + return 0; +} + +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + +/** + * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group + * @r: Resource group + * + * Return: 1 if tasks have been assigned to @r, 0 otherwise + */ +int rdtgroup_tasks_assigned(struct rdtgroup *r) +{ + struct task_struct *p, *t; + int ret = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + rcu_read_lock(); + for_each_process_thread(p, t) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { + ret = 1; + break; + } + } + rcu_read_unlock(); + + return ret; +} + +static int rdtgroup_task_write_permission(struct task_struct *task, + struct kernfs_open_file *of) +{ + const struct cred *tcred = get_task_cred(task); + const struct cred *cred = current_cred(); + int ret = 0; + + /* + * Even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) { + rdt_last_cmd_printf("No permission to move task %d\n", task->pid); + ret = -EPERM; + } + + put_cred(tcred); + return ret; +} + +static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, + struct kernfs_open_file *of) +{ + struct task_struct *tsk; + int ret; + + rcu_read_lock(); + if (pid) { + tsk = find_task_by_vpid(pid); + if (!tsk) { + rcu_read_unlock(); + rdt_last_cmd_printf("No task %d\n", pid); + return -ESRCH; + } + } else { + tsk = current; + } + + get_task_struct(tsk); + rcu_read_unlock(); + + ret = rdtgroup_task_write_permission(tsk, of); + if (!ret) + ret = __rdtgroup_move_task(tsk, rdtgrp); + + put_task_struct(tsk); + return ret; +} + +static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + pid_t pid; + + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto unlock; + } + + ret = rdtgroup_move_task(pid, rdtgrp, of); + +unlock: + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) +{ + struct task_struct *p, *t; + pid_t pid; + + rcu_read_lock(); + for_each_process_thread(p, t) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { + pid = task_pid_vnr(t); + if (pid) + seq_printf(s, "%d\n", pid); + } + } + rcu_read_unlock(); +} + +static int rdtgroup_tasks_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + show_rdt_tasks(rdtgrp, s); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +#ifdef CONFIG_PROC_CPU_RESCTRL + +/* + * A task can only be part of one resctrl control group and of one monitor + * group which is associated to that control group. + * + * 1) res: + * mon: + * + * resctrl is not available. + * + * 2) res:/ + * mon: + * + * Task is part of the root resctrl control group, and it is not associated + * to any monitor group. + * + * 3) res:/ + * mon:mon0 + * + * Task is part of the root resctrl control group and monitor group mon0. + * + * 4) res:group0 + * mon: + * + * Task is part of resctrl control group group0, and it is not associated + * to any monitor group. + * + * 5) res:group0 + * mon:mon1 + * + * Task is part of resctrl control group group0 and monitor group mon1. + */ +int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + struct rdtgroup *rdtg; + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + + /* Return empty if resctrl has not been mounted. */ + if (!static_branch_unlikely(&rdt_enable_key)) { + seq_puts(s, "res:\nmon:\n"); + goto unlock; + } + + list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { + struct rdtgroup *crg; + + /* + * Task information is only relevant for shareable + * and exclusive groups. + */ + if (rdtg->mode != RDT_MODE_SHAREABLE && + rdtg->mode != RDT_MODE_EXCLUSIVE) + continue; + + if (rdtg->closid != tsk->closid) + continue; + + seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", + rdtg->kn->name); + seq_puts(s, "mon:"); + list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, + mon.crdtgrp_list) { + if (tsk->rmid != crg->mon.rmid) + continue; + seq_printf(s, "%s", crg->kn->name); + break; + } + seq_putc(s, '\n'); + goto unlock; + } + /* + * The above search should succeed. Otherwise return + * with an error. + */ + ret = -ENOENT; +unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} +#endif + +static int rdt_last_cmd_status_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + int len; + + mutex_lock(&rdtgroup_mutex); + len = seq_buf_used(&last_cmd_status); + if (len) + seq_printf(seq, "%.*s", len, last_cmd_status_buf); + else + seq_puts(seq, "ok\n"); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static int rdt_num_closids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_closid); + return 0; +} + +static int rdt_default_ctrl_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->default_ctrl); + return 0; +} + +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", r->cache.min_cbm_bits); + return 0; +} + +static int rdt_shareable_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->cache.shareable_bits); + return 0; +} + +/** + * rdt_bit_usage_show - Display current usage of resources + * + * A domain is a shared resource that can now be allocated differently. Here + * we display the current regions of the domain as an annotated bitmask. + * For each domain of this resource its allocation bitmask + * is annotated as below to indicate the current usage of the corresponding bit: + * 0 - currently unused + * X - currently available for sharing and used by software and hardware + * H - currently used by hardware only but available for software use + * S - currently used and shareable by software only + * E - currently used exclusively by one resource group + * P - currently pseudo-locked by one resource group + */ +static int rdt_bit_usage_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + /* + * Use unsigned long even though only 32 bits are used to ensure + * test_bit() is used safely. + */ + unsigned long sw_shareable = 0, hw_shareable = 0; + unsigned long exclusive = 0, pseudo_locked = 0; + struct rdt_domain *dom; + int i, hwb, swb, excl, psl; + enum rdtgrp_mode mode; + bool sep = false; + u32 *ctrl; + + mutex_lock(&rdtgroup_mutex); + hw_shareable = r->cache.shareable_bits; + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_putc(seq, ';'); + ctrl = dom->ctrl_val; + sw_shareable = 0; + exclusive = 0; + seq_printf(seq, "%d=", dom->id); + for (i = 0; i < closids_supported(); i++, ctrl++) { + if (!closid_allocated(i)) + continue; + mode = rdtgroup_mode_by_closid(i); + switch (mode) { + case RDT_MODE_SHAREABLE: + sw_shareable |= *ctrl; + break; + case RDT_MODE_EXCLUSIVE: + exclusive |= *ctrl; + break; + case RDT_MODE_PSEUDO_LOCKSETUP: + /* + * RDT_MODE_PSEUDO_LOCKSETUP is possible + * here but not included since the CBM + * associated with this CLOSID in this mode + * is not initialized and no task or cpu can be + * assigned this CLOSID. + */ + break; + case RDT_MODE_PSEUDO_LOCKED: + case RDT_NUM_MODES: + WARN(1, + "invalid mode for closid %d\n", i); + break; + } + } + for (i = r->cache.cbm_len - 1; i >= 0; i--) { + pseudo_locked = dom->plr ? dom->plr->cbm : 0; + hwb = test_bit(i, &hw_shareable); + swb = test_bit(i, &sw_shareable); + excl = test_bit(i, &exclusive); + psl = test_bit(i, &pseudo_locked); + if (hwb && swb) + seq_putc(seq, 'X'); + else if (hwb && !swb) + seq_putc(seq, 'H'); + else if (!hwb && swb) + seq_putc(seq, 'S'); + else if (excl) + seq_putc(seq, 'E'); + else if (psl) + seq_putc(seq, 'P'); + else /* Unused bits remain */ + seq_putc(seq, '0'); + } + sep = true; + } + seq_putc(seq, '\n'); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static int rdt_min_bw_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", r->membw.min_bw); + return 0; +} + +static int rdt_num_rmids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_rmid); + + return 0; +} + +static int rdt_mon_features_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + struct mon_evt *mevt; + + list_for_each_entry(mevt, &r->evt_list, list) + seq_printf(seq, "%s\n", mevt->name); + + return 0; +} + +static int rdt_bw_gran_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", r->membw.bw_gran); + return 0; +} + +static int rdt_delay_linear_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", r->membw.delay_linear); + return 0; +} + +static int max_threshold_occ_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", resctrl_cqm_threshold * r->mon_scale); + + return 0; +} + +static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD) + seq_puts(seq, "per-thread\n"); + else + seq_puts(seq, "max\n"); + + return 0; +} + +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdt_resource *r = of->kn->parent->priv; + unsigned int bytes; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + if (bytes > (boot_cpu_data.x86_cache_size * 1024)) + return -EINVAL; + + resctrl_cqm_threshold = bytes / r->mon_scale; + + return nbytes; +} + +/* + * rdtgroup_mode_show - Display mode of this resource group + */ +static int rdtgroup_mode_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); + + rdtgroup_kn_unlock(of->kn); + return 0; +} + +/** + * rdt_cdp_peer_get - Retrieve CDP peer if it exists + * @r: RDT resource to which RDT domain @d belongs + * @d: Cache instance for which a CDP peer is requested + * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer) + * Used to return the result. + * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer) + * Used to return the result. + * + * RDT resources are managed independently and by extension the RDT domains + * (RDT resource instances) are managed independently also. The Code and + * Data Prioritization (CDP) RDT resources, while managed independently, + * could refer to the same underlying hardware. For example, + * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache. + * + * When provided with an RDT resource @r and an instance of that RDT + * resource @d rdt_cdp_peer_get() will return if there is a peer RDT + * resource and the exact instance that shares the same hardware. + * + * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists. + * If a CDP peer was found, @r_cdp will point to the peer RDT resource + * and @d_cdp will point to the peer RDT domain. + */ +static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, + struct rdt_resource **r_cdp, + struct rdt_domain **d_cdp) +{ + struct rdt_resource *_r_cdp = NULL; + struct rdt_domain *_d_cdp = NULL; + int ret = 0; + + switch (r->rid) { + case RDT_RESOURCE_L3DATA: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE]; + break; + case RDT_RESOURCE_L3CODE: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3DATA]; + break; + case RDT_RESOURCE_L2DATA: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2CODE]; + break; + case RDT_RESOURCE_L2CODE: + _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2DATA]; + break; + default: + ret = -ENOENT; + goto out; + } + + /* + * When a new CPU comes online and CDP is enabled then the new + * RDT domains (if any) associated with both CDP RDT resources + * are added in the same CPU online routine while the + * rdtgroup_mutex is held. It should thus not happen for one + * RDT domain to exist and be associated with its RDT CDP + * resource but there is no RDT domain associated with the + * peer RDT CDP resource. Hence the WARN. + */ + _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL); + if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) { + _r_cdp = NULL; + _d_cdp = NULL; + ret = -EINVAL; + } + +out: + *r_cdp = _r_cdp; + *d_cdp = _d_cdp; + + return ret; +} + +/** + * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other + * @r: Resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Checks if provided @cbm intended to be used for @closid on domain + * @d overlaps with any other closids or other hardware usage associated + * with this domain. If @exclusive is true then only overlaps with + * resource groups in exclusive mode will be considered. If @exclusive + * is false then overlaps with any resource group or hardware entities + * will be considered. + * + * @cbm is unsigned long, even if only 32 bits are used, to make the + * bitmap functions work correctly. + * + * Return: false if CBM does not overlap, true if it does. + */ +static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + unsigned long cbm, int closid, bool exclusive) +{ + enum rdtgrp_mode mode; + unsigned long ctrl_b; + u32 *ctrl; + int i; + + /* Check for any overlap with regions used by hardware directly */ + if (!exclusive) { + ctrl_b = r->cache.shareable_bits; + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) + return true; + } + + /* Check for overlap with other resource groups */ + ctrl = d->ctrl_val; + for (i = 0; i < closids_supported(); i++, ctrl++) { + ctrl_b = *ctrl; + mode = rdtgroup_mode_by_closid(i); + if (closid_allocated(i) && i != closid && + mode != RDT_MODE_PSEUDO_LOCKSETUP) { + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { + if (exclusive) { + if (mode == RDT_MODE_EXCLUSIVE) + return true; + continue; + } + return true; + } + } + } + + return false; +} + +/** + * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware + * @r: Resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Resources that can be allocated using a CBM can use the CBM to control + * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test + * for overlap. Overlap test is not limited to the specific resource for + * which the CBM is intended though - when dealing with CDP resources that + * share the underlying hardware the overlap check should be performed on + * the CDP resource sharing the hardware also. + * + * Refer to description of __rdtgroup_cbm_overlaps() for the details of the + * overlap test. + * + * Return: true if CBM overlap detected, false if there is no overlap + */ +bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, + unsigned long cbm, int closid, bool exclusive) +{ + struct rdt_resource *r_cdp; + struct rdt_domain *d_cdp; + + if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive)) + return true; + + if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0) + return false; + + return __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive); +} + +/** + * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive + * + * An exclusive resource group implies that there should be no sharing of + * its allocated resources. At the time this group is considered to be + * exclusive this test can determine if its current schemata supports this + * setting by testing for overlap with all other resource groups. + * + * Return: true if resource group can be exclusive, false if there is overlap + * with allocations of other resource groups and thus this resource group + * cannot be exclusive. + */ +static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) +{ + int closid = rdtgrp->closid; + struct rdt_resource *r; + bool has_cache = false; + struct rdt_domain *d; + + for_each_alloc_enabled_rdt_resource(r) { + if (r->rid == RDT_RESOURCE_MBA) + continue; + has_cache = true; + list_for_each_entry(d, &r->domains, list) { + if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid], + rdtgrp->closid, false)) { + rdt_last_cmd_puts("Schemata overlaps\n"); + return false; + } + } + } + + if (!has_cache) { + rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); + return false; + } + + return true; +} + +/** + * rdtgroup_mode_write - Modify the resource group's mode + * + */ +static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + enum rdtgrp_mode mode; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + rdt_last_cmd_clear(); + + mode = rdtgrp->mode; + + if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || + (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || + (!strcmp(buf, "pseudo-locksetup") && + mode == RDT_MODE_PSEUDO_LOCKSETUP) || + (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) + goto out; + + if (mode == RDT_MODE_PSEUDO_LOCKED) { + rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); + ret = -EINVAL; + goto out; + } + + if (!strcmp(buf, "shareable")) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } + rdtgrp->mode = RDT_MODE_SHAREABLE; + } else if (!strcmp(buf, "exclusive")) { + if (!rdtgroup_mode_test_exclusive(rdtgrp)) { + ret = -EINVAL; + goto out; + } + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } + rdtgrp->mode = RDT_MODE_EXCLUSIVE; + } else if (!strcmp(buf, "pseudo-locksetup")) { + ret = rdtgroup_locksetup_enter(rdtgrp); + if (ret) + goto out; + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; + } else { + rdt_last_cmd_puts("Unknown or unsupported mode\n"); + ret = -EINVAL; + } + +out: + rdtgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +/** + * rdtgroup_cbm_to_size - Translate CBM to size in bytes + * @r: RDT resource to which @d belongs. + * @d: RDT domain instance. + * @cbm: bitmask for which the size should be computed. + * + * The bitmask provided associated with the RDT domain instance @d will be + * translated into how many bytes it represents. The size in bytes is + * computed by first dividing the total cache size by the CBM length to + * determine how many bytes each bit in the bitmask represents. The result + * is multiplied with the number of bits set in the bitmask. + * + * @cbm is unsigned long, even if only 32 bits are used to make the + * bitmap functions work correctly. + */ +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, + struct rdt_domain *d, unsigned long cbm) +{ + struct cpu_cacheinfo *ci; + unsigned int size = 0; + int num_b, i; + + num_b = bitmap_weight(&cbm, r->cache.cbm_len); + ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask)); + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == r->cache_level) { + size = ci->info_list[i].size / r->cache.cbm_len * num_b; + break; + } + } + + return size; +} + +/** + * rdtgroup_size_show - Display size in bytes of allocated regions + * + * The "size" file mirrors the layout of the "schemata" file, printing the + * size in bytes of each region instead of the capacity bitmask. + * + */ +static int rdtgroup_size_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + struct rdt_domain *d; + unsigned int size; + int ret = 0; + bool sep; + u32 ctrl; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%*s:", max_name_width, + rdtgrp->plr->r->name); + size = rdtgroup_cbm_to_size(rdtgrp->plr->r, + rdtgrp->plr->d, + rdtgrp->plr->cbm); + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); + } + goto out; + } + + for_each_alloc_enabled_rdt_resource(r) { + sep = false; + seq_printf(s, "%*s:", max_name_width, r->name); + list_for_each_entry(d, &r->domains, list) { + if (sep) + seq_putc(s, ';'); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + size = 0; + } else { + ctrl = (!is_mba_sc(r) ? + d->ctrl_val[rdtgrp->closid] : + d->mbps_val[rdtgrp->closid]); + if (r->rid == RDT_RESOURCE_MBA) + size = ctrl; + else + size = rdtgroup_cbm_to_size(r, d, ctrl); + } + seq_printf(s, "%d=%u", d->id, size); + sep = true; + } + seq_putc(s, '\n'); + } + +out: + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* rdtgroup information files for one cache resource. */ +static struct rftype res_common_files[] = { + { + .name = "last_cmd_status", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_last_cmd_status_show, + .fflags = RF_TOP_INFO, + }, + { + .name = "num_closids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_closids_show, + .fflags = RF_CTRL_INFO, + }, + { + .name = "mon_features", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mon_features_show, + .fflags = RF_MON_INFO, + }, + { + .name = "num_rmids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_rmids_show, + .fflags = RF_MON_INFO, + }, + { + .name = "cbm_mask", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_default_ctrl_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "min_cbm_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_cbm_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "shareable_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_shareable_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "bit_usage", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bit_usage_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "min_bandwidth", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_bw_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "bandwidth_gran", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bw_gran_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "delay_linear", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_delay_linear_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + /* + * Platform specific which (if any) capabilities are provided by + * thread_throttle_mode. Defer "fflags" initialization to platform + * discovery. + */ + { + .name = "thread_throttle_mode", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_thread_throttle_mode_show, + }, + { + .name = "max_threshold_occupancy", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = max_threshold_occ_write, + .seq_show = max_threshold_occ_show, + .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + .fflags = RFTYPE_BASE, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + .fflags = RF_CTRL_BASE, + }, + { + .name = "mode", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_mode_write, + .seq_show = rdtgroup_mode_show, + .fflags = RF_CTRL_BASE, + }, + { + .name = "size", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_size_show, + .fflags = RF_CTRL_BASE, + }, + +}; + +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) +{ + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) { + if ((fflags & rft->fflags) == rft->fflags) + kernfs_remove_by_name(kn, rft->name); + } + return ret; +} + +static struct rftype *rdtgroup_get_rftype_by_name(const char *name) +{ + struct rftype *rfts, *rft; + int len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + return rft; + } + + return NULL; +} + +void __init thread_throttle_mode_init(void) +{ + struct rftype *rft; + + rft = rdtgroup_get_rftype_by_name("thread_throttle_mode"); + if (!rft) + return; + + rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; +} + +/** + * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * + * The permissions of named resctrl file, directory, or link are modified + * to not allow read, write, or execute by any user. + * + * WARNING: This function is intended to communicate to the user that the + * resctrl file has been locked down - that it is not relevant to the + * particular state the system finds itself in. It should not be relied + * on to protect from user access because after the file's permissions + * are restricted the user can still change the permissions using chmod + * from the command line. + * + * Return: 0 on success, <0 on failure. + */ +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn; + int ret = 0; + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + iattr.ia_mode = S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode = S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode = S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + +/** + * rdtgroup_kn_mode_restore - Restore user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * @mask: Mask of permissions that should be restored + * + * Restore the permissions of the named file. If @name is a directory the + * permissions of its parent will be used. + * + * Return: 0 on success, <0 on failure. + */ +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn, *parent; + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + iattr.ia_mode = rft->mode & mask; + } + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + parent = kernfs_get_parent(kn); + if (parent) { + iattr.ia_mode |= parent->mode; + kernfs_put(parent); + } + iattr.ia_mode |= S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode |= S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode |= S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + +static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, + unsigned long fflags) +{ + struct kernfs_node *kn_subdir; + int ret; + + kn_subdir = kernfs_create_dir(kn_info, name, + kn_info->mode, r); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + ret = rdtgroup_add_files(kn_subdir, fflags); + if (!ret) + kernfs_activate(kn_subdir); + + return ret; +} + +static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) +{ + struct rdt_resource *r; + unsigned long fflags; + char name[32]; + int ret; + + /* create the directory */ + kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); + if (IS_ERR(kn_info)) + return PTR_ERR(kn_info); + + ret = rdtgroup_add_files(kn_info, RF_TOP_INFO); + if (ret) + goto out_destroy; + + for_each_alloc_enabled_rdt_resource(r) { + fflags = r->fflags | RF_CTRL_INFO; + ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); + if (ret) + goto out_destroy; + } + + for_each_mon_enabled_rdt_resource(r) { + fflags = r->fflags | RF_MON_INFO; + sprintf(name, "%s_MON", r->name); + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); + if (ret) + goto out_destroy; + } + + ret = rdtgroup_kn_set_ugid(kn_info); + if (ret) + goto out_destroy; + + kernfs_activate(kn_info); + + return 0; + +out_destroy: + kernfs_remove(kn_info); + return ret; +} + +static int +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, + char *name, struct kernfs_node **dest_kn) +{ + struct kernfs_node *kn; + int ret; + + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + if (dest_kn) + *dest_kn = kn; + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +static void l3_qos_cfg_update(void *arg) +{ + bool *enable = arg; + + wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); +} + +static void l2_qos_cfg_update(void *arg) +{ + bool *enable = arg; + + wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); +} + +static inline bool is_mba_linear(void) +{ + return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear; +} + +static int set_cache_qos_cfg(int level, bool enable) +{ + void (*update)(void *arg); + struct rdt_resource *r_l; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int cpu; + + if (level == RDT_RESOURCE_L3) + update = l3_qos_cfg_update; + else if (level == RDT_RESOURCE_L2) + update = l2_qos_cfg_update; + else + return -EINVAL; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + r_l = &rdt_resources_all[level]; + list_for_each_entry(d, &r_l->domains, list) { + if (r_l->cache.arch_has_per_cpu_cfg) + /* Pick all the CPUs in the domain instance */ + for_each_cpu(cpu, &d->cpu_mask) + cpumask_set_cpu(cpu, cpu_mask); + else + /* Pick one CPU from each domain instance to update MSR */ + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + } + cpu = get_cpu(); + /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + update(&enable); + /* Update QOS_CFG MSR on all other cpus in cpu_mask. */ + smp_call_function_many(cpu_mask, update, &enable, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +/* Restore the qos cfg state when a domain comes online */ +void rdt_domain_reconfigure_cdp(struct rdt_resource *r) +{ + if (!r->alloc_capable) + return; + + if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA]) + l2_qos_cfg_update(&r->alloc_enabled); + + if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA]) + l3_qos_cfg_update(&r->alloc_enabled); +} + +/* + * Enable or disable the MBA software controller + * which helps user specify bandwidth in MBps. + * MBA software controller is supported only if + * MBM is supported and MBA is in linear scale. + */ +static int set_mba_sc(bool mba_sc) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA]; + struct rdt_domain *d; + + if (!is_mbm_enabled() || !is_mba_linear() || + mba_sc == is_mba_sc(r)) + return -EINVAL; + + r->membw.mba_sc = mba_sc; + list_for_each_entry(d, &r->domains, list) + setup_default_ctrlval(r, d->ctrl_val, d->mbps_val); + + return 0; +} + +static int cdp_enable(int level, int data_type, int code_type) +{ + struct rdt_resource *r_ldata = &rdt_resources_all[data_type]; + struct rdt_resource *r_lcode = &rdt_resources_all[code_type]; + struct rdt_resource *r_l = &rdt_resources_all[level]; + int ret; + + if (!r_l->alloc_capable || !r_ldata->alloc_capable || + !r_lcode->alloc_capable) + return -EINVAL; + + ret = set_cache_qos_cfg(level, true); + if (!ret) { + r_l->alloc_enabled = false; + r_ldata->alloc_enabled = true; + r_lcode->alloc_enabled = true; + } + return ret; +} + +static int cdpl3_enable(void) +{ + return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE); +} + +static int cdpl2_enable(void) +{ + return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, + RDT_RESOURCE_L2CODE); +} + +static void cdp_disable(int level, int data_type, int code_type) +{ + struct rdt_resource *r = &rdt_resources_all[level]; + + r->alloc_enabled = r->alloc_capable; + + if (rdt_resources_all[data_type].alloc_enabled) { + rdt_resources_all[data_type].alloc_enabled = false; + rdt_resources_all[code_type].alloc_enabled = false; + set_cache_qos_cfg(level, false); + } +} + +static void cdpl3_disable(void) +{ + cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE); +} + +static void cdpl2_disable(void) +{ + cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE); +} + +static void cdp_disable_all(void) +{ + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) + cdpl3_disable(); + if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) + cdpl2_disable(); +} + +/* + * We don't allow rdtgroup directories to be created anywhere + * except the root directory. Thus when looking for the rdtgroup + * structure for a kernfs node we are either looking at a directory, + * in which case the rdtgroup structure is pointed at by the "priv" + * field, otherwise we have a file, and need only look to the parent + * to find the rdtgroup. + */ +static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) { + /* + * All the resource directories use "kn->priv" + * to point to the "struct rdtgroup" for the + * resource. "info" and its subdirectories don't + * have rdtgroup structures, so return NULL here. + */ + if (kn == kn_info || kn->parent == kn_info) + return NULL; + else + return kn->priv; + } else { + return kn->parent->priv; + } +} + +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return NULL; + + atomic_inc(&rdtgrp->waitcount); + kernfs_break_active_protection(kn); + + mutex_lock(&rdtgroup_mutex); + + /* Was this group deleted while we waited? */ + if (rdtgrp->flags & RDT_DELETED) + return NULL; + + return rdtgrp; +} + +void rdtgroup_kn_unlock(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return; + + mutex_unlock(&rdtgroup_mutex); + + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + kernfs_unbreak_active_protection(kn); + rdtgroup_remove(rdtgrp); + } else { + kernfs_unbreak_active_protection(kn); + } +} + +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **mon_data_kn); + +static int rdt_enable_ctx(struct rdt_fs_context *ctx) +{ + int ret = 0; + + if (ctx->enable_cdpl2) + ret = cdpl2_enable(); + + if (!ret && ctx->enable_cdpl3) + ret = cdpl3_enable(); + + if (!ret && ctx->enable_mba_mbps) + ret = set_mba_sc(true); + + return ret; +} + +static int rdt_get_tree(struct fs_context *fc) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + struct rdt_domain *dom; + struct rdt_resource *r; + int ret; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + /* + * resctrl file system can only be mounted once. + */ + if (static_branch_unlikely(&rdt_enable_key)) { + ret = -EBUSY; + goto out; + } + + ret = rdt_enable_ctx(ctx); + if (ret < 0) + goto out_cdp; + + closid_init(); + + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); + if (ret < 0) + goto out_mba; + + if (rdt_mon_capable) { + ret = mongroup_create_dir(rdtgroup_default.kn, + &rdtgroup_default, "mon_groups", + &kn_mongrp); + if (ret < 0) + goto out_info; + + ret = mkdir_mondata_all(rdtgroup_default.kn, + &rdtgroup_default, &kn_mondata); + if (ret < 0) + goto out_mongrp; + rdtgroup_default.mon.mon_data_kn = kn_mondata; + } + + ret = rdt_pseudo_lock_init(); + if (ret) + goto out_mondata; + + ret = kernfs_get_tree(fc); + if (ret < 0) + goto out_psl; + + if (rdt_alloc_capable) + static_branch_enable_cpuslocked(&rdt_alloc_enable_key); + if (rdt_mon_capable) + static_branch_enable_cpuslocked(&rdt_mon_enable_key); + + if (rdt_alloc_capable || rdt_mon_capable) + static_branch_enable_cpuslocked(&rdt_enable_key); + + if (is_mbm_enabled()) { + r = &rdt_resources_all[RDT_RESOURCE_L3]; + list_for_each_entry(dom, &r->domains, list) + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + } + + goto out; + +out_psl: + rdt_pseudo_lock_release(); +out_mondata: + if (rdt_mon_capable) + kernfs_remove(kn_mondata); +out_mongrp: + if (rdt_mon_capable) + kernfs_remove(kn_mongrp); +out_info: + kernfs_remove(kn_info); +out_mba: + if (ctx->enable_mba_mbps) + set_mba_sc(false); +out_cdp: + cdp_disable_all(); +out: + rdt_last_cmd_clear(); + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return ret; +} + +enum rdt_param { + Opt_cdp, + Opt_cdpl2, + Opt_mba_mbps, + nr__rdt_params +}; + +static const struct fs_parameter_spec rdt_fs_parameters[] = { + fsparam_flag("cdp", Opt_cdp), + fsparam_flag("cdpl2", Opt_cdpl2), + fsparam_flag("mba_MBps", Opt_mba_mbps), + {} +}; + +static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, rdt_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_cdp: + ctx->enable_cdpl3 = true; + return 0; + case Opt_cdpl2: + ctx->enable_cdpl2 = true; + return 0; + case Opt_mba_mbps: + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -EINVAL; + ctx->enable_mba_mbps = true; + return 0; + } + + return -EINVAL; +} + +static void rdt_fs_context_free(struct fs_context *fc) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + + kernfs_free_fs_context(fc); + kfree(ctx); +} + +static const struct fs_context_operations rdt_fs_context_ops = { + .free = rdt_fs_context_free, + .parse_param = rdt_parse_param, + .get_tree = rdt_get_tree, +}; + +static int rdt_init_fs_context(struct fs_context *fc) +{ + struct rdt_fs_context *ctx; + + ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->kfc.root = rdt_root; + ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; + fc->fs_private = &ctx->kfc; + fc->ops = &rdt_fs_context_ops; + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(&init_user_ns); + fc->global = true; + return 0; +} + +static int reset_all_ctrls(struct rdt_resource *r) +{ + struct msr_param msr_param; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int i, cpu; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.res = r; + msr_param.low = 0; + msr_param.high = r->num_closid; + + /* + * Disable resource control for this resource by setting all + * CBMs in all domains to the maximum mask value. Pick one CPU + * from each domain to update the MSRs below. + */ + list_for_each_entry(d, &r->domains, list) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + + for (i = 0; i < r->num_closid; i++) + d->ctrl_val[i] = r->default_ctrl; + } + cpu = get_cpu(); + /* Update CBM on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_ctrl_update(&msr_param); + /* Update CBM on all other cpus in cpu_mask. */ + smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +/* + * Move tasks from one to the other group. If @from is NULL, then all tasks + * in the systems are moved unconditionally (used for teardown). + * + * If @mask is not NULL the cpus on which moved tasks are running are set + * in that mask so the update smp function call is restricted to affected + * cpus. + */ +static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, + struct cpumask *mask) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + if (!from || is_closid_match(t, from) || + is_rmid_match(t, from)) { + WRITE_ONCE(t->closid, to->closid); + WRITE_ONCE(t->rmid, to->mon.rmid); + + /* + * Order the closid/rmid stores above before the loads + * in task_curr(). This pairs with the full barrier + * between the rq->curr update and resctrl_sched_in() + * during context switch. + */ + smp_mb(); + + /* + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but + * there is no other side effect. + */ + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) + cpumask_set_cpu(task_cpu(t), mask); + } + } + read_unlock(&tasklist_lock); +} + +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) +{ + struct rdtgroup *sentry, *stmp; + struct list_head *head; + + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); + + if (atomic_read(&sentry->waitcount) != 0) + sentry->flags = RDT_DELETED; + else + rdtgroup_remove(sentry); + } +} + +/* + * Forcibly remove all of subdirectories under root. + */ +static void rmdir_all_sub(void) +{ + struct rdtgroup *rdtgrp, *tmp; + + /* Move all tasks to the default resource group */ + rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); + + list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Free any child rmids */ + free_all_child_rdtgrp(rdtgrp); + + /* Remove each rdtgroup other than root */ + if (rdtgrp == &rdtgroup_default) + continue; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + + /* + * Give any CPUs back to the default group. We cannot copy + * cpu_online_mask because a CPU might have executed the + * offline callback already, but is still marked online. + */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + free_rmid(rdtgrp->mon.rmid); + + kernfs_remove(rdtgrp->kn); + list_del(&rdtgrp->rdtgroup_list); + + if (atomic_read(&rdtgrp->waitcount) != 0) + rdtgrp->flags = RDT_DELETED; + else + rdtgroup_remove(rdtgrp); + } + /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ + update_closid_rmid(cpu_online_mask, &rdtgroup_default); + + kernfs_remove(kn_info); + kernfs_remove(kn_mongrp); + kernfs_remove(kn_mondata); +} + +static void rdt_kill_sb(struct super_block *sb) +{ + struct rdt_resource *r; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + set_mba_sc(false); + + /*Put everything back to default values. */ + for_each_alloc_enabled_rdt_resource(r) + reset_all_ctrls(r); + cdp_disable_all(); + rmdir_all_sub(); + rdt_pseudo_lock_release(); + rdtgroup_default.mode = RDT_MODE_SHAREABLE; + static_branch_disable_cpuslocked(&rdt_alloc_enable_key); + static_branch_disable_cpuslocked(&rdt_mon_enable_key); + static_branch_disable_cpuslocked(&rdt_enable_key); + kernfs_kill_sb(sb); + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); +} + +static struct file_system_type rdt_fs_type = { + .name = "resctrl", + .init_fs_context = rdt_init_fs_context, + .parameters = rdt_fs_parameters, + .kill_sb = rdt_kill_sb, +}; + +static int mon_addfile(struct kernfs_node *parent_kn, const char *name, + void *priv) +{ + struct kernfs_node *kn; + int ret = 0; + + kn = __kernfs_create_file(parent_kn, name, 0444, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, + &kf_mondata_ops, priv, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return ret; +} + +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups with given domain id. + */ +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + sprintf(name, "mon_%s_%02d", r->name, dom_id); + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); + } +} + +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + union mon_data_bits priv; + struct kernfs_node *kn; + struct mon_evt *mevt; + struct rmid_read rr; + char name[32]; + int ret; + + sprintf(name, "mon_%s_%02d", r->name, d->id); + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + if (WARN_ON(list_empty(&r->evt_list))) { + ret = -EPERM; + goto out_destroy; + } + + priv.u.rid = r->rid; + priv.u.domid = d->id; + list_for_each_entry(mevt, &r->evt_list, list) { + priv.u.evtid = mevt->evtid; + ret = mon_addfile(kn, mevt->name, priv.priv); + if (ret) + goto out_destroy; + + if (is_mbm_event(mevt->evtid)) + mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); + } + kernfs_activate(kn); + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/* + * Add all subdirectories of mon_data for "ctrl_mon" groups + * and "monitor" groups with given domain id. + */ +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *prgrp, *crgrp; + struct list_head *head; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + parent_kn = prgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, prgrp); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + parent_kn = crgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, crgrp); + } + } +} + +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, + struct rdt_resource *r, + struct rdtgroup *prgrp) +{ + struct rdt_domain *dom; + int ret; + + list_for_each_entry(dom, &r->domains, list) { + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + if (ret) + return ret; + } + + return 0; +} + +/* + * This creates a directory mon_data which contains the monitored data. + * + * mon_data has one directory for each domain whic are named + * in the format mon__. For ex: A mon_data + * with L3 domain looks as below: + * ./mon_data: + * mon_L3_00 + * mon_L3_01 + * mon_L3_02 + * ... + * + * Each domain directory has one file per event: + * ./mon_L3_00/: + * llc_occupancy + * + */ +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **dest_kn) +{ + struct rdt_resource *r; + struct kernfs_node *kn; + int ret; + + /* + * Create the mon_data directory first. + */ + ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); + if (ret) + return ret; + + if (dest_kn) + *dest_kn = kn; + + /* + * Create the subdirectories for each domain. Note that all events + * in a domain like L3 are grouped into a resource whose domain is L3 + */ + for_each_mon_enabled_rdt_resource(r) { + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); + if (ret) + goto out_destroy; + } + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/** + * cbm_ensure_valid - Enforce validity on provided CBM + * @_val: Candidate CBM + * @r: RDT resource to which the CBM belongs + * + * The provided CBM represents all cache portions available for use. This + * may be represented by a bitmap that does not consist of contiguous ones + * and thus be an invalid CBM. + * Here the provided CBM is forced to be a valid CBM by only considering + * the first set of contiguous bits as valid and clearing all bits. + * The intention here is to provide a valid default CBM with which a new + * resource group is initialized. The user can follow this with a + * modification to the CBM if the default does not satisfy the + * requirements. + */ +static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) +{ + unsigned int cbm_len = r->cache.cbm_len; + unsigned long first_bit, zero_bit; + unsigned long val = _val; + + if (!val) + return 0; + + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); + + /* Clear any remaining bits to ensure contiguous region */ + bitmap_clear(&val, zero_bit, cbm_len - zero_bit); + return (u32)val; +} + +/* + * Initialize cache resources per RDT domain + * + * Set the RDT domain up to start off with all usable allocations. That is, + * all shareable and unused bits. All-zero CBM is invalid. + */ +static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, + u32 closid) +{ + struct rdt_resource *r_cdp = NULL; + struct rdt_domain *d_cdp = NULL; + u32 used_b = 0, unused_b = 0; + unsigned long tmp_cbm; + enum rdtgrp_mode mode; + u32 peer_ctl, *ctrl; + int i; + + rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); + d->have_new_ctrl = false; + d->new_ctrl = r->cache.shareable_bits; + used_b = r->cache.shareable_bits; + ctrl = d->ctrl_val; + for (i = 0; i < closids_supported(); i++, ctrl++) { + if (closid_allocated(i) && i != closid) { + mode = rdtgroup_mode_by_closid(i); + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) + /* + * ctrl values for locksetup aren't relevant + * until the schemata is written, and the mode + * becomes RDT_MODE_PSEUDO_LOCKED. + */ + continue; + /* + * If CDP is active include peer domain's + * usage to ensure there is no overlap + * with an exclusive group. + */ + if (d_cdp) + peer_ctl = d_cdp->ctrl_val[i]; + else + peer_ctl = 0; + used_b |= *ctrl | peer_ctl; + if (mode == RDT_MODE_SHAREABLE) + d->new_ctrl |= *ctrl | peer_ctl; + } + } + if (d->plr && d->plr->cbm > 0) + used_b |= d->plr->cbm; + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; + d->new_ctrl |= unused_b; + /* + * Force the initial CBM to be valid, user can + * modify the CBM based on system availability. + */ + d->new_ctrl = cbm_ensure_valid(d->new_ctrl, r); + /* + * Assign the u32 CBM to an unsigned long to ensure that + * bitmap_weight() does not access out-of-bound memory. + */ + tmp_cbm = d->new_ctrl; + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("No space on %s:%d\n", r->name, d->id); + return -ENOSPC; + } + d->have_new_ctrl = true; + + return 0; +} + +/* + * Initialize cache resources with default values. + * + * A new RDT group is being created on an allocation capable (CAT) + * supporting system. Set this group up to start off with all usable + * allocations. + * + * If there are no more shareable bits available on any domain then + * the entire allocation will fail. + */ +static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid) +{ + struct rdt_domain *d; + int ret; + + list_for_each_entry(d, &r->domains, list) { + ret = __init_one_rdt_domain(d, r, closid); + if (ret < 0) + return ret; + } + + return 0; +} + +/* Initialize MBA resource with default values. */ +static void rdtgroup_init_mba(struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + d->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; + d->have_new_ctrl = true; + } +} + +/* Initialize the RDT group's allocations. */ +static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r; + int ret; + + for_each_alloc_enabled_rdt_resource(r) { + if (r->rid == RDT_RESOURCE_MBA) { + rdtgroup_init_mba(r); + } else { + ret = rdtgroup_init_cat(r, rdtgrp->closid); + if (ret < 0) + return ret; + } + + ret = update_domains(r, rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("Failed to initialize allocations\n"); + return ret; + } + + } + + rdtgrp->mode = RDT_MODE_SHAREABLE; + + return 0; +} + +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + const char *name, umode_t mode, + enum rdt_group_type rtype, struct rdtgroup **r) +{ + struct rdtgroup *prdtgrp, *rdtgrp; + struct kernfs_node *kn; + uint files = 0; + int ret; + + prdtgrp = rdtgroup_kn_lock_live(parent_kn); + if (!prdtgrp) { + ret = -ENODEV; + goto out_unlock; + } + + if (rtype == RDTMON_GROUP && + (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto out_unlock; + } + + /* allocate the rdtgroup. */ + rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); + if (!rdtgrp) { + ret = -ENOSPC; + rdt_last_cmd_puts("Kernel out of memory\n"); + goto out_unlock; + } + *r = rdtgrp; + rdtgrp->mon.parent = prdtgrp; + rdtgrp->type = rtype; + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); + + /* kernfs creates the directory for rdtgrp */ + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + rdt_last_cmd_puts("kernfs create error\n"); + goto out_free_rgrp; + } + rdtgrp->kn = kn; + + /* + * kernfs_remove() will drop the reference count on "kn" which + * will free it. But we still need it to stick around for the + * rdtgroup_kn_unlock(kn) call. Take one extra reference here, + * which will be dropped by kernfs_put() in rdtgroup_remove(). + */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + rdt_last_cmd_puts("kernfs perm error\n"); + goto out_destroy; + } + + files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + ret = rdtgroup_add_files(kn, files); + if (ret) { + rdt_last_cmd_puts("kernfs fill error\n"); + goto out_destroy; + } + + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) { + rdt_last_cmd_puts("Out of RMIDs\n"); + goto out_destroy; + } + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + goto out_idfree; + } + } + kernfs_activate(kn); + + /* + * The caller unlocks the parent_kn upon success. + */ + return 0; + +out_idfree: + free_rmid(rdtgrp->mon.rmid); +out_destroy: + kernfs_put(rdtgrp->kn); + kernfs_remove(rdtgrp->kn); +out_free_rgrp: + kfree(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) +{ + kernfs_remove(rgrp->kn); + free_rmid(rgrp->mon.rmid); + rdtgroup_remove(rgrp); +} + +/* + * Create a monitor group under "mon_groups" directory of a control + * and monitor group(ctrl_mon). This is a resource group + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. + */ +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + const char *name, umode_t mode) +{ + struct rdtgroup *rdtgrp, *prgrp; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); + if (ret) + return ret; + + prgrp = rdtgrp->mon.parent; + rdtgrp->closid = prgrp->closid; + + /* + * Add the rdtgrp to the list of rdtgrps the parent + * ctrl_mon group has to track. + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +/* + * These are rdtgroups created under the root directory. Can be used + * to allocate and monitor resources. + */ +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + const char *name, umode_t mode) +{ + struct rdtgroup *rdtgrp; + struct kernfs_node *kn; + u32 closid; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); + if (ret) + return ret; + + kn = rdtgrp->kn; + ret = closid_alloc(); + if (ret < 0) { + rdt_last_cmd_puts("Out of CLOSIDs\n"); + goto out_common_fail; + } + closid = ret; + ret = 0; + + rdtgrp->closid = closid; + ret = rdtgroup_init_alloc(rdtgrp); + if (ret < 0) + goto out_id_free; + + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + if (rdt_mon_capable) { + /* + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ + ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + goto out_del_list; + } + } + + goto out_unlock; + +out_del_list: + list_del(&rdtgrp->rdtgroup_list); +out_id_free: + closid_free(closid); +out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(kn->name, "mon_groups") && + strcmp(name, "mon_groups")); +} + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + /* Do not accept '\n' to avoid unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + /* + * If the parent directory is the root directory and RDT + * allocation is supported, add a control and monitoring + * subdirectory + */ + if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); + + /* + * If RDT monitoring is supported and the parent directory is a valid + * "mon_groups" directory, add a monitoring subdirectory. + */ + if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + return rdtgroup_mkdir_mon(parent_kn, name, mode); + + return -EPERM; +} + +static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + int cpu; + + /* Give any tasks back to the parent group */ + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); + + /* Update per cpu rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + free_rmid(rdtgrp->mon.rmid); + + /* + * Remove the rdtgrp from the parent ctrl_mon group's list + */ + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_del(&rdtgrp->mon.crdtgrp_list); + + kernfs_remove(rdtgrp->kn); + + return 0; +} + +static int rdtgroup_ctrl_remove(struct kernfs_node *kn, + struct rdtgroup *rdtgrp) +{ + rdtgrp->flags = RDT_DELETED; + list_del(&rdtgrp->rdtgroup_list); + + kernfs_remove(rdtgrp->kn); + return 0; +} + +static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + int cpu; + + /* Give any tasks back to the default group */ + rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); + + /* Give any CPUs back to the default group */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + /* Update per cpu closid and rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) { + per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; + per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; + } + + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + closid_free(rdtgrp->closid); + free_rmid(rdtgrp->mon.rmid); + + rdtgroup_ctrl_remove(kn, rdtgrp); + + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + + return 0; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct kernfs_node *parent_kn = kn->parent; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret = 0; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + + /* + * If the rdtgroup is a ctrl_mon group and parent directory + * is the root directory, remove the ctrl_mon group. + * + * If the rdtgroup is a mon group and parent directory + * is a valid "mon_groups" directory, remove the mon group. + */ + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && + rdtgrp != &rdtgroup_default) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = rdtgroup_ctrl_remove(kn, rdtgrp); + } else { + ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + } + } else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, kn->name)) { + ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + } else { + ret = -EPERM; + } + +out: + rdtgroup_kn_unlock(kn); + free_cpumask_var(tmpmask); + return ret; +} + +static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) +{ + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) + seq_puts(seq, ",cdp"); + + if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) + seq_puts(seq, ",cdpl2"); + + if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA])) + seq_puts(seq, ",mba_MBps"); + + return 0; +} + +static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { + .mkdir = rdtgroup_mkdir, + .rmdir = rdtgroup_rmdir, + .show_options = rdtgroup_show_options, +}; + +static int __init rdtgroup_setup_root(void) +{ + int ret; + + rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + &rdtgroup_default); + if (IS_ERR(rdt_root)) + return PTR_ERR(rdt_root); + + mutex_lock(&rdtgroup_mutex); + + rdtgroup_default.closid = 0; + rdtgroup_default.mon.rmid = 0; + rdtgroup_default.type = RDTCTRL_GROUP; + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); + + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); + + ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); + if (ret) { + kernfs_destroy_root(rdt_root); + goto out; + } + + rdtgroup_default.kn = rdt_root->kn; + kernfs_activate(rdtgroup_default.kn); + +out: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +/* + * rdtgroup_init - rdtgroup initialization + * + * Setup resctrl file system including set up root, create mount point, + * register rdtgroup filesystem, and initialize files under root directory. + * + * Return: 0 on success or -errno + */ +int __init rdtgroup_init(void) +{ + int ret = 0; + + seq_buf_init(&last_cmd_status, last_cmd_status_buf, + sizeof(last_cmd_status_buf)); + + ret = rdtgroup_setup_root(); + if (ret) + return ret; + + ret = sysfs_create_mount_point(fs_kobj, "resctrl"); + if (ret) + goto cleanup_root; + + ret = register_filesystem(&rdt_fs_type); + if (ret) + goto cleanup_mountpoint; + + /* + * Adding the resctrl debugfs directory here may not be ideal since + * it would let the resctrl debugfs directory appear on the debugfs + * filesystem before the resctrl filesystem is mounted. + * It may also be ok since that would enable debugging of RDT before + * resctrl is mounted. + * The reason why the debugfs directory is created here and not in + * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and + * during the debugfs directory creation also &sb->s_type->i_mutex_key + * (the lockdep class of inode->i_rwsem). Other filesystem + * interactions (eg. SyS_getdents) have the lock ordering: + * &sb->s_type->i_mutex_key --> &mm->mmap_lock + * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex + * is taken, thus creating dependency: + * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause + * issues considering the other two lock dependencies. + * By creating the debugfs directory here we avoid a dependency + * that may cause deadlock (even though file operations cannot + * occur until the filesystem is mounted, but I do not know how to + * tell lockdep that). + */ + debugfs_resctrl = debugfs_create_dir("resctrl", NULL); + + return 0; + +cleanup_mountpoint: + sysfs_remove_mount_point(fs_kobj, "resctrl"); +cleanup_root: + kernfs_destroy_root(rdt_root); + + return ret; +} + +void __exit rdtgroup_exit(void) +{ + debugfs_remove_recursive(debugfs_resctrl); + unregister_filesystem(&rdt_fs_type); + sysfs_remove_mount_point(fs_kobj, "resctrl"); + kernfs_destroy_root(rdt_root); +} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c new file mode 100644 index 000000000..f1cd1b6fb --- /dev/null +++ b/arch/x86/kernel/cpu/scattered.c @@ -0,0 +1,68 @@ +/* + * Routines to identify additional cpu features that are scattered in + * cpuid space. + */ +#include + +#include +#include +#include + +#include "cpu.h" + +struct cpuid_bit { + u16 feature; + u8 reg; + u8 bit; + u32 level; + u32 sub_leaf; +}; + +/* + * Please keep the leaf sorted by cpuid_bit.level for faster search. + * X86_FEATURE_MBA is supported by both Intel and AMD. But the CPUID + * levels are different and there is a separate entry for each. + */ +static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, + { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, + { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, + { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, + { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, + { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, + { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, + { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { 0, 0, 0, 0, 0 } +}; + +void init_scattered_cpuid_features(struct cpuinfo_x86 *c) +{ + u32 max_level; + u32 regs[4]; + const struct cpuid_bit *cb; + + for (cb = cpuid_bits; cb->feature; cb++) { + + /* Verify that the level is valid */ + max_level = cpuid_eax(cb->level & 0xffff0000); + if (max_level < cb->level || + max_level > (cb->level | 0xffff)) + continue; + + cpuid_count(cb->level, cb->sub_leaf, ®s[CPUID_EAX], + ®s[CPUID_EBX], ®s[CPUID_ECX], + ®s[CPUID_EDX]); + + if (regs[cb->reg] & (1 << cb->bit)) + set_cpu_cap(c, cb->feature); + } +} diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c new file mode 100644 index 000000000..58d17c01d --- /dev/null +++ b/arch/x86/kernel/cpu/topology.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ + +#include +#include +#include +#include + +#include "cpu.h" + +/* leaf 0xb SMT level */ +#define SMT_LEVEL 0 + +/* extended topology sub-leaf types */ +#define INVALID_TYPE 0 +#define SMT_TYPE 1 +#define CORE_TYPE 2 +#define DIE_TYPE 5 + +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) +#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) + +unsigned int __max_die_per_package __read_mostly = 1; +EXPORT_SYMBOL(__max_die_per_package); + +#ifdef CONFIG_SMP +/* + * Check if given CPUID extended toplogy "leaf" is implemented + */ +static int check_extended_topology_leaf(int leaf) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) + return -1; + + return 0; +} +/* + * Return best CPUID Extended Toplogy Leaf supported + */ +static int detect_extended_topology_leaf(struct cpuinfo_x86 *c) +{ + if (c->cpuid_level >= 0x1f) { + if (check_extended_topology_leaf(0x1f) == 0) + return 0x1f; + } + + if (c->cpuid_level >= 0xb) { + if (check_extended_topology_leaf(0xb) == 0) + return 0xb; + } + + return -1; +} +#endif + +int detect_extended_topology_early(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx; + int leaf; + + leaf = detect_extended_topology_leaf(c); + if (leaf < 0) + return -1; + + set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + + cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + /* + * initial apic id, which also represents 32-bit extended x2apic id. + */ + c->initial_apicid = edx; + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); +#endif + return 0; +} + +/* + * Check for extended topology enumeration cpuid leaf, and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ +int detect_extended_topology(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; + unsigned int die_select_mask, die_level_siblings; + unsigned int pkg_mask_width; + bool die_level_present = false; + int leaf; + + leaf = detect_extended_topology_leaf(c); + if (leaf < 0) + return -1; + + /* + * Populate HT related information from sub-leaf level 0. + */ + cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + c->initial_apicid = edx; + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); + core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + + sub_index = 1; + while (true) { + cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); + + /* + * Check for the Core type in the implemented sub leaves. + */ + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + die_level_siblings = core_level_siblings; + die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + } + if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) { + die_level_present = true; + die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + } + + if (LEAFB_SUBTYPE(ecx) != INVALID_TYPE) + pkg_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + else + break; + + sub_index++; + } + + core_select_mask = (~(-1 << pkg_mask_width)) >> ht_mask_width; + die_select_mask = (~(-1 << die_plus_mask_width)) >> + core_plus_mask_width; + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, + ht_mask_width) & core_select_mask; + + if (die_level_present) { + c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid, + core_plus_mask_width) & die_select_mask; + } + + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, + pkg_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + + c->x86_max_cores = (core_level_siblings / smp_num_siblings); + __max_die_per_package = (die_level_siblings / core_level_siblings); +#endif + return 0; +} diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c new file mode 100644 index 000000000..42c939827 --- /dev/null +++ b/arch/x86/kernel/cpu/transmeta.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include "cpu.h" + +static void early_init_transmeta(struct cpuinfo_x86 *c) +{ + u32 xlvl; + + /* Transmeta-defined flags: level 0x80860001 */ + xlvl = cpuid_eax(0x80860000); + if ((xlvl & 0xffff0000) == 0x80860000) { + if (xlvl >= 0x80860001) + c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); + } +} + +static void init_transmeta(struct cpuinfo_x86 *c) +{ + unsigned int cap_mask, uk, max, dummy; + unsigned int cms_rev1, cms_rev2; + unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; + char cpu_info[65]; + + early_init_transmeta(c); + + cpu_detect_cache_sizes(c); + + /* Print CMS and CPU revision */ + max = cpuid_eax(0x80860000); + cpu_rev = 0; + if (max >= 0x80860001) { + cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags); + if (cpu_rev != 0x02000000) { + pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n", + (cpu_rev >> 24) & 0xff, + (cpu_rev >> 16) & 0xff, + (cpu_rev >> 8) & 0xff, + cpu_rev & 0xff, + cpu_freq); + } + } + if (max >= 0x80860002) { + cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy); + if (cpu_rev == 0x02000000) { + pr_info("CPU: Processor revision %08X, %u MHz\n", + new_cpu_rev, cpu_freq); + } + pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n", + (cms_rev1 >> 24) & 0xff, + (cms_rev1 >> 16) & 0xff, + (cms_rev1 >> 8) & 0xff, + cms_rev1 & 0xff, + cms_rev2); + } + if (max >= 0x80860006) { + cpuid(0x80860003, + (void *)&cpu_info[0], + (void *)&cpu_info[4], + (void *)&cpu_info[8], + (void *)&cpu_info[12]); + cpuid(0x80860004, + (void *)&cpu_info[16], + (void *)&cpu_info[20], + (void *)&cpu_info[24], + (void *)&cpu_info[28]); + cpuid(0x80860005, + (void *)&cpu_info[32], + (void *)&cpu_info[36], + (void *)&cpu_info[40], + (void *)&cpu_info[44]); + cpuid(0x80860006, + (void *)&cpu_info[48], + (void *)&cpu_info[52], + (void *)&cpu_info[56], + (void *)&cpu_info[60]); + cpu_info[64] = '\0'; + pr_info("CPU: %s\n", cpu_info); + } + + /* Unhide possibly hidden capability flags */ + rdmsr(0x80860004, cap_mask, uk); + wrmsr(0x80860004, ~0, uk); + c->x86_capability[CPUID_1_EDX] = cpuid_edx(0x00000001); + wrmsr(0x80860004, cap_mask, uk); + + /* All Transmeta CPUs have a constant TSC */ + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + +#ifdef CONFIG_SYSCTL + /* + * randomize_va_space slows us down enormously; + * it probably triggers retranslation of x86->native bytecode + */ + randomize_va_space = 0; +#endif +} + +static const struct cpu_dev transmeta_cpu_dev = { + .c_vendor = "Transmeta", + .c_ident = { "GenuineTMx86", "TransmetaCPU" }, + .c_early_init = early_init_transmeta, + .c_init = init_transmeta, + .c_x86_vendor = X86_VENDOR_TRANSMETA, +}; + +cpu_dev_register(transmeta_cpu_dev); diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c new file mode 100644 index 000000000..da06bbb5d --- /dev/null +++ b/arch/x86/kernel/cpu/tsx.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Transactional Synchronization Extensions (TSX) control. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Pawan Gupta + */ + +#include + +#include + +#include "cpu.h" + +#undef pr_fmt +#define pr_fmt(fmt) "tsx: " fmt + +enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; + +void tsx_disable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Force all transactions to immediately abort */ + tsx |= TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is not enumerated in CPUID. + * This is visible to userspace and will ensure they + * do not waste resources trying TSX transactions that + * will always abort. + */ + tsx |= TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +void tsx_enable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Enable the RTM feature in the cpu */ + tsx &= ~TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is enumerated in CPUID. + * This is visible to userspace and will ensure they + * can enumerate and use the TSX feature. + */ + tsx &= ~TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) +{ + if (boot_cpu_has_bug(X86_BUG_TAA)) + return TSX_CTRL_DISABLE; + + return TSX_CTRL_ENABLE; +} + +void __init tsx_init(void) +{ + char arg[5] = {}; + int ret; + + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. + */ + if (!(x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR)) + return; + + setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL); + + ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); + if (ret >= 0) { + if (!strcmp(arg, "on")) { + tsx_ctrl_state = TSX_CTRL_ENABLE; + } else if (!strcmp(arg, "off")) { + tsx_ctrl_state = TSX_CTRL_DISABLE; + } else if (!strcmp(arg, "auto")) { + tsx_ctrl_state = x86_get_tsx_auto_mode(); + } else { + tsx_ctrl_state = TSX_CTRL_DISABLE; + pr_err("invalid option, defaulting to off\n"); + } + } else { + /* tsx= not provided */ + if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) + tsx_ctrl_state = x86_get_tsx_auto_mode(); + else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) + tsx_ctrl_state = TSX_CTRL_DISABLE; + else + tsx_ctrl_state = TSX_CTRL_ENABLE; + } + + if (tsx_ctrl_state == TSX_CTRL_DISABLE) { + tsx_disable(); + + /* + * tsx_disable() will change the state of the RTM and HLE CPUID + * bits. Clear them here since they are now expected to be not + * set. + */ + setup_clear_cpu_cap(X86_FEATURE_RTM); + setup_clear_cpu_cap(X86_FEATURE_HLE); + } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { + + /* + * HW defaults TSX to be enabled at bootup. + * We may still need the TSX enable support + * during init for special cases like + * kexec after TSX is disabled. + */ + tsx_enable(); + + /* + * tsx_enable() will change the state of the RTM and HLE CPUID + * bits. Force them here since they are now expected to be set. + */ + setup_force_cpu_cap(X86_FEATURE_RTM); + setup_force_cpu_cap(X86_FEATURE_HLE); + } +} diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c new file mode 100644 index 000000000..65a58a390 --- /dev/null +++ b/arch/x86/kernel/cpu/umc.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "cpu.h" + +/* + * UMC chips appear to be only either 386 or 486, + * so no special init takes place. + */ + +static const struct cpu_dev umc_cpu_dev = { + .c_vendor = "UMC", + .c_ident = { "UMC UMC UMC" }, + .legacy_models = { + { .family = 4, .model_names = + { + [1] = "U5D", + [2] = "U5S", + } + }, + }, + .c_x86_vendor = X86_VENDOR_UMC, +}; + +cpu_dev_register(umc_cpu_dev); + diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c new file mode 100644 index 000000000..ec8064c0a --- /dev/null +++ b/arch/x86/kernel/cpu/umwait.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include +#include + +#define UMWAIT_C02_ENABLE 0 + +#define UMWAIT_CTRL_VAL(max_time, c02_disable) \ + (((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \ + ((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE)) + +/* + * Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default, + * umwait max time is 100000 in TSC-quanta and C0.2 is enabled + */ +static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); + +/* + * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by + * hardware or BIOS before kernel boot. + */ +static u32 orig_umwait_control_cached __ro_after_init; + +/* + * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in + * the sysfs write functions. + */ +static DEFINE_MUTEX(umwait_lock); + +static void umwait_update_control_msr(void * unused) +{ + lockdep_assert_irqs_disabled(); + wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0); +} + +/* + * The CPU hotplug callback sets the control MSR to the global control + * value. + * + * Disable interrupts so the read of umwait_control_cached and the WRMSR + * are protected against a concurrent sysfs write. Otherwise the sysfs + * write could update the cached value after it had been read on this CPU + * and issue the IPI before the old value had been written. The IPI would + * interrupt, write the new value and after return from IPI the previous + * value would be written by this CPU. + * + * With interrupts disabled the upcoming CPU either sees the new control + * value or the IPI is updating this CPU to the new control value after + * interrupts have been reenabled. + */ +static int umwait_cpu_online(unsigned int cpu) +{ + local_irq_disable(); + umwait_update_control_msr(NULL); + local_irq_enable(); + return 0; +} + +/* + * The CPU hotplug callback sets the control MSR to the original control + * value. + */ +static int umwait_cpu_offline(unsigned int cpu) +{ + /* + * This code is protected by the CPU hotplug already and + * orig_umwait_control_cached is never changed after it caches + * the original control MSR value in umwait_init(). So there + * is no race condition here. + */ + wrmsr(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached, 0); + + return 0; +} + +/* + * On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which + * is the only active CPU at this time. The MSR is set up on the APs via the + * CPU hotplug callback. + * + * This function is invoked on resume from suspend and hibernation. On + * resume from suspend the restore should be not required, but we neither + * trust the firmware nor does it matter if the same value is written + * again. + */ +static void umwait_syscore_resume(void) +{ + umwait_update_control_msr(NULL); +} + +static struct syscore_ops umwait_syscore_ops = { + .resume = umwait_syscore_resume, +}; + +/* sysfs interface */ + +/* + * When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled. + * Otherwise, C0.2 is enabled. + */ +static inline bool umwait_ctrl_c02_enabled(u32 ctrl) +{ + return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE); +} + +static inline u32 umwait_ctrl_max_time(u32 ctrl) +{ + return ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK; +} + +static inline void umwait_update_control(u32 maxtime, bool c02_enable) +{ + u32 ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK; + + if (!c02_enable) + ctrl |= MSR_IA32_UMWAIT_CONTROL_C02_DISABLE; + + WRITE_ONCE(umwait_control_cached, ctrl); + /* Propagate to all CPUs */ + on_each_cpu(umwait_update_control_msr, NULL, 1); +} + +static ssize_t +enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + u32 ctrl = READ_ONCE(umwait_control_cached); + + return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl)); +} + +static ssize_t enable_c02_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + bool c02_enable; + u32 ctrl; + int ret; + + ret = kstrtobool(buf, &c02_enable); + if (ret) + return ret; + + mutex_lock(&umwait_lock); + + ctrl = READ_ONCE(umwait_control_cached); + if (c02_enable != umwait_ctrl_c02_enabled(ctrl)) + umwait_update_control(ctrl, c02_enable); + + mutex_unlock(&umwait_lock); + + return count; +} +static DEVICE_ATTR_RW(enable_c02); + +static ssize_t +max_time_show(struct device *kobj, struct device_attribute *attr, char *buf) +{ + u32 ctrl = READ_ONCE(umwait_control_cached); + + return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl)); +} + +static ssize_t max_time_store(struct device *kobj, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 max_time, ctrl; + int ret; + + ret = kstrtou32(buf, 0, &max_time); + if (ret) + return ret; + + /* bits[1:0] must be zero */ + if (max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK) + return -EINVAL; + + mutex_lock(&umwait_lock); + + ctrl = READ_ONCE(umwait_control_cached); + if (max_time != umwait_ctrl_max_time(ctrl)) + umwait_update_control(max_time, umwait_ctrl_c02_enabled(ctrl)); + + mutex_unlock(&umwait_lock); + + return count; +} +static DEVICE_ATTR_RW(max_time); + +static struct attribute *umwait_attrs[] = { + &dev_attr_enable_c02.attr, + &dev_attr_max_time.attr, + NULL +}; + +static struct attribute_group umwait_attr_group = { + .attrs = umwait_attrs, + .name = "umwait_control", +}; + +static int __init umwait_init(void) +{ + struct device *dev; + int ret; + + if (!boot_cpu_has(X86_FEATURE_WAITPKG)) + return -ENODEV; + + /* + * Cache the original control MSR value before the control MSR is + * changed. This is the only place where orig_umwait_control_cached + * is modified. + */ + rdmsrl(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", + umwait_cpu_online, umwait_cpu_offline); + if (ret < 0) { + /* + * On failure, the control MSR on all CPUs has the + * original control value. + */ + return ret; + } + + register_syscore_ops(&umwait_syscore_ops); + + /* + * Add umwait control interface. Ignore failure, so at least the + * default values are set up in case the machine manages to boot. + */ + dev = cpu_subsys.dev_root; + return sysfs_create_group(&dev->kobj, &umwait_attr_group); +} +device_initcall(umwait_init); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c new file mode 100644 index 000000000..924571fe5 --- /dev/null +++ b/arch/x86/kernel/cpu/vmware.c @@ -0,0 +1,525 @@ +/* + * VMware Detection code. + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "vmware: " fmt + +#define CPUID_VMWARE_INFO_LEAF 0x40000000 +#define CPUID_VMWARE_FEATURES_LEAF 0x40000010 +#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0) +#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1) + +#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 + +#define VMWARE_CMD_GETVERSION 10 +#define VMWARE_CMD_GETHZ 45 +#define VMWARE_CMD_GETVCPU_INFO 68 +#define VMWARE_CMD_LEGACY_X2APIC 3 +#define VMWARE_CMD_VCPU_RESERVED 31 +#define VMWARE_CMD_STEALCLOCK 91 + +#define STEALCLOCK_NOT_AVAILABLE (-1) +#define STEALCLOCK_DISABLED 0 +#define STEALCLOCK_ENABLED 1 + +#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ + __asm__("inl (%%dx), %%eax" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "a"(VMWARE_HYPERVISOR_MAGIC), \ + "c"(VMWARE_CMD_##cmd), \ + "d"(VMWARE_HYPERVISOR_PORT), "b"(UINT_MAX) : \ + "memory") + +#define VMWARE_VMCALL(cmd, eax, ebx, ecx, edx) \ + __asm__("vmcall" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "a"(VMWARE_HYPERVISOR_MAGIC), \ + "c"(VMWARE_CMD_##cmd), \ + "d"(0), "b"(UINT_MAX) : \ + "memory") + +#define VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx) \ + __asm__("vmmcall" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "a"(VMWARE_HYPERVISOR_MAGIC), \ + "c"(VMWARE_CMD_##cmd), \ + "d"(0), "b"(UINT_MAX) : \ + "memory") + +#define VMWARE_CMD(cmd, eax, ebx, ecx, edx) do { \ + switch (vmware_hypercall_mode) { \ + case CPUID_VMWARE_FEATURES_ECX_VMCALL: \ + VMWARE_VMCALL(cmd, eax, ebx, ecx, edx); \ + break; \ + case CPUID_VMWARE_FEATURES_ECX_VMMCALL: \ + VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx); \ + break; \ + default: \ + VMWARE_PORT(cmd, eax, ebx, ecx, edx); \ + break; \ + } \ + } while (0) + +struct vmware_steal_time { + union { + uint64_t clock; /* stolen time counter in units of vtsc */ + struct { + /* only for little-endian */ + uint32_t clock_low; + uint32_t clock_high; + }; + }; + uint64_t reserved[7]; +}; + +static unsigned long vmware_tsc_khz __ro_after_init; +static u8 vmware_hypercall_mode __ro_after_init; + +static inline int __vmware_platform(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_CMD(GETVERSION, eax, ebx, ecx, edx); + return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; +} + +static unsigned long vmware_get_tsc_khz(void) +{ + return vmware_tsc_khz; +} + +#ifdef CONFIG_PARAVIRT +static struct cyc2ns_data vmware_cyc2ns __ro_after_init; +static bool vmw_sched_clock __initdata = true; +static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, vmw_steal_time) __aligned(64); +static bool has_steal_clock; +static bool steal_acc __initdata = true; /* steal time accounting */ + +static __init int setup_vmw_sched_clock(char *s) +{ + vmw_sched_clock = false; + return 0; +} +early_param("no-vmw-sched-clock", setup_vmw_sched_clock); + +static __init int parse_no_stealacc(char *arg) +{ + steal_acc = false; + return 0; +} +early_param("no-steal-acc", parse_no_stealacc); + +static unsigned long long notrace vmware_sched_clock(void) +{ + unsigned long long ns; + + ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); + ns -= vmware_cyc2ns.cyc2ns_offset; + return ns; +} + +static void __init vmware_cyc2ns_setup(void) +{ + struct cyc2ns_data *d = &vmware_cyc2ns; + unsigned long long tsc_now = rdtsc(); + + clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift, + vmware_tsc_khz, NSEC_PER_MSEC, 0); + d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul, + d->cyc2ns_shift); + + pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset); +} + +static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2) +{ + uint32_t result, info; + + asm volatile (VMWARE_HYPERCALL : + "=a"(result), + "=c"(info) : + "a"(VMWARE_HYPERVISOR_MAGIC), + "b"(0), + "c"(VMWARE_CMD_STEALCLOCK), + "d"(0), + "S"(arg1), + "D"(arg2) : + "memory"); + return result; +} + +static bool stealclock_enable(phys_addr_t pa) +{ + return vmware_cmd_stealclock(upper_32_bits(pa), + lower_32_bits(pa)) == STEALCLOCK_ENABLED; +} + +static int __stealclock_disable(void) +{ + return vmware_cmd_stealclock(0, 1); +} + +static void stealclock_disable(void) +{ + __stealclock_disable(); +} + +static bool vmware_is_stealclock_available(void) +{ + return __stealclock_disable() != STEALCLOCK_NOT_AVAILABLE; +} + +/** + * vmware_steal_clock() - read the per-cpu steal clock + * @cpu: the cpu number whose steal clock we want to read + * + * The function reads the steal clock if we are on a 64-bit system, otherwise + * reads it in parts, checking that the high part didn't change in the + * meantime. + * + * Return: + * The steal clock reading in ns. + */ +static uint64_t vmware_steal_clock(int cpu) +{ + struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu); + uint64_t clock; + + if (IS_ENABLED(CONFIG_64BIT)) + clock = READ_ONCE(steal->clock); + else { + uint32_t initial_high, low, high; + + do { + initial_high = READ_ONCE(steal->clock_high); + /* Do not reorder initial_high and high readings */ + virt_rmb(); + low = READ_ONCE(steal->clock_low); + /* Keep low reading in between */ + virt_rmb(); + high = READ_ONCE(steal->clock_high); + } while (initial_high != high); + + clock = ((uint64_t)high << 32) | low; + } + + return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); +} + +static void vmware_register_steal_time(void) +{ + int cpu = smp_processor_id(); + struct vmware_steal_time *st = &per_cpu(vmw_steal_time, cpu); + + if (!has_steal_clock) + return; + + if (!stealclock_enable(slow_virt_to_phys(st))) { + has_steal_clock = false; + return; + } + + pr_info("vmware-stealtime: cpu %d, pa %llx\n", + cpu, (unsigned long long) slow_virt_to_phys(st)); +} + +static void vmware_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + stealclock_disable(); +} + +static void vmware_guest_cpu_init(void) +{ + if (has_steal_clock) + vmware_register_steal_time(); +} + +static void vmware_pv_guest_cpu_reboot(void *unused) +{ + vmware_disable_steal_time(); +} + +static int vmware_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(vmware_pv_guest_cpu_reboot, NULL, 1); + return NOTIFY_DONE; +} + +static struct notifier_block vmware_pv_reboot_nb = { + .notifier_call = vmware_pv_reboot_notify, +}; + +#ifdef CONFIG_SMP +static void __init vmware_smp_prepare_boot_cpu(void) +{ + vmware_guest_cpu_init(); + native_smp_prepare_boot_cpu(); +} + +static int vmware_cpu_online(unsigned int cpu) +{ + local_irq_disable(); + vmware_guest_cpu_init(); + local_irq_enable(); + return 0; +} + +static int vmware_cpu_down_prepare(unsigned int cpu) +{ + local_irq_disable(); + vmware_disable_steal_time(); + local_irq_enable(); + return 0; +} +#endif + +static __init int activate_jump_labels(void) +{ + if (has_steal_clock) { + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); + } + + return 0; +} +arch_initcall(activate_jump_labels); + +static void __init vmware_paravirt_ops_setup(void) +{ + pv_info.name = "VMware hypervisor"; + pv_ops.cpu.io_delay = paravirt_nop; + + if (vmware_tsc_khz == 0) + return; + + vmware_cyc2ns_setup(); + + if (vmw_sched_clock) + pv_ops.time.sched_clock = vmware_sched_clock; + + if (vmware_is_stealclock_available()) { + has_steal_clock = true; + pv_ops.time.steal_clock = vmware_steal_clock; + + /* We use reboot notifier only to disable steal clock */ + register_reboot_notifier(&vmware_pv_reboot_nb); + +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = + vmware_smp_prepare_boot_cpu; + if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "x86/vmware:online", + vmware_cpu_online, + vmware_cpu_down_prepare) < 0) + pr_err("vmware_guest: Failed to install cpu hotplug callbacks\n"); +#else + vmware_guest_cpu_init(); +#endif + } +} +#else +#define vmware_paravirt_ops_setup() do {} while (0) +#endif + +/* + * VMware hypervisor takes care of exporting a reliable TSC to the guest. + * Still, due to timing difference when running on virtual cpus, the TSC can + * be marked as unstable in some cases. For example, the TSC sync check at + * bootup can fail due to a marginal offset between vcpus' TSCs (though the + * TSCs do not drift from each other). Also, the ACPI PM timer clocksource + * is not suitable as a watchdog when running on a hypervisor because the + * kernel may miss a wrap of the counter if the vcpu is descheduled for a + * long time. To skip these checks at runtime we set these capability bits, + * so that the kernel could just trust the hypervisor with providing a + * reliable virtual TSC that is suitable for timekeeping. + */ +static void __init vmware_set_capabilities(void) +{ + setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL) + setup_force_cpu_cap(X86_FEATURE_VMCALL); + else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL) + setup_force_cpu_cap(X86_FEATURE_VMW_VMMCALL); +} + +static void __init vmware_platform_setup(void) +{ + uint32_t eax, ebx, ecx, edx; + uint64_t lpj, tsc_khz; + + VMWARE_CMD(GETHZ, eax, ebx, ecx, edx); + + if (ebx != UINT_MAX) { + lpj = tsc_khz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_khz, 1000); + WARN_ON(tsc_khz >> 32); + pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", + (unsigned long) tsc_khz / 1000, + (unsigned long) tsc_khz % 1000); + + if (!preset_lpj) { + do_div(lpj, HZ); + preset_lpj = lpj; + } + + vmware_tsc_khz = tsc_khz; + x86_platform.calibrate_tsc = vmware_get_tsc_khz; + x86_platform.calibrate_cpu = vmware_get_tsc_khz; + +#ifdef CONFIG_X86_LOCAL_APIC + /* Skip lapic calibration since we know the bus frequency. */ + lapic_timer_period = ecx / HZ; + pr_info("Host bus clock speed read from hypervisor : %u Hz\n", + ecx); +#endif + } else { + pr_warn("Failed to get TSC freq from the hypervisor\n"); + } + + vmware_paravirt_ops_setup(); + +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif + + vmware_set_capabilities(); +} + +static u8 __init vmware_select_hypercall(void) +{ + int eax, ebx, ecx, edx; + + cpuid(CPUID_VMWARE_FEATURES_LEAF, &eax, &ebx, &ecx, &edx); + return (ecx & (CPUID_VMWARE_FEATURES_ECX_VMMCALL | + CPUID_VMWARE_FEATURES_ECX_VMCALL)); +} + +/* + * While checking the dmi string information, just checking the product + * serial key should be enough, as this will always have a VMware + * specific string when running under VMware hypervisor. + * If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode + * intentionally defaults to 0. + */ +static uint32_t __init vmware_platform(void) +{ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + unsigned int eax; + unsigned int hyper_vendor_id[3]; + + cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], + &hyper_vendor_id[1], &hyper_vendor_id[2]); + if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) { + if (eax >= CPUID_VMWARE_FEATURES_LEAF) + vmware_hypercall_mode = + vmware_select_hypercall(); + + pr_info("hypercall mode: 0x%02x\n", + (unsigned int) vmware_hypercall_mode); + + return CPUID_VMWARE_INFO_LEAF; + } + } else if (dmi_available && dmi_name_in_serial("VMware") && + __vmware_platform()) + return 1; + + return 0; +} + +/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */ +static bool __init vmware_legacy_x2apic_available(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx); + return (eax & (1 << VMWARE_CMD_VCPU_RESERVED)) == 0 && + (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0; +} + +#ifdef CONFIG_AMD_MEM_ENCRYPT +static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb, + struct pt_regs *regs) +{ + /* Copy VMWARE specific Hypercall parameters to the GHCB */ + ghcb_set_rip(ghcb, regs->ip); + ghcb_set_rbx(ghcb, regs->bx); + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_rsi(ghcb, regs->si); + ghcb_set_rdi(ghcb, regs->di); + ghcb_set_rbp(ghcb, regs->bp); +} + +static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + if (!(ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb) && + ghcb_rsi_is_valid(ghcb) && + ghcb_rdi_is_valid(ghcb) && + ghcb_rbp_is_valid(ghcb))) + return false; + + regs->bx = ghcb->save.rbx; + regs->cx = ghcb->save.rcx; + regs->dx = ghcb->save.rdx; + regs->si = ghcb->save.rsi; + regs->di = ghcb->save.rdi; + regs->bp = ghcb->save.rbp; + + return true; +} +#endif + +const __initconst struct hypervisor_x86 x86_hyper_vmware = { + .name = "VMware", + .detect = vmware_platform, + .type = X86_HYPER_VMWARE, + .init.init_platform = vmware_platform_setup, + .init.x2apic_available = vmware_legacy_x2apic_available, +#ifdef CONFIG_AMD_MEM_ENCRYPT + .runtime.sev_es_hcall_prepare = vmware_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = vmware_sev_es_hcall_finish, +#endif +}; diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c new file mode 100644 index 000000000..05fa4ef63 --- /dev/null +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include +#include + +#include "cpu.h" + +#define MSR_ZHAOXIN_FCR57 0x00001257 + +#define ACE_PRESENT (1 << 6) +#define ACE_ENABLED (1 << 7) +#define ACE_FCR (1 << 7) /* MSR_ZHAOXIN_FCR */ + +#define RNG_PRESENT (1 << 2) +#define RNG_ENABLED (1 << 3) +#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */ + +static void init_zhaoxin_cap(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + /* Test for Extended Feature Flags presence */ + if (cpuid_eax(0xC0000000) >= 0xC0000001) { + u32 tmp = cpuid_edx(0xC0000001); + + /* Enable ACE unit, if present and disabled */ + if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) { + rdmsr(MSR_ZHAOXIN_FCR57, lo, hi); + /* Enable ACE unit */ + lo |= ACE_FCR; + wrmsr(MSR_ZHAOXIN_FCR57, lo, hi); + pr_info("CPU: Enabled ACE h/w crypto\n"); + } + + /* Enable RNG unit, if present and disabled */ + if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) { + rdmsr(MSR_ZHAOXIN_FCR57, lo, hi); + /* Enable RNG unit */ + lo |= RNG_ENABLE; + wrmsr(MSR_ZHAOXIN_FCR57, lo, hi); + pr_info("CPU: Enabled h/w RNG\n"); + } + + /* + * Store Extended Feature Flags as word 5 of the CPU + * capability bit array + */ + c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001); + } + + if (c->x86 >= 0x6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +} + +static void early_init_zhaoxin(struct cpuinfo_x86 *c) +{ + if (c->x86 >= 0x6) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#endif + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + + if (c->cpuid_level >= 0x00000001) { + u32 eax, ebx, ecx, edx; + + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + /* + * If HTT (EDX[28]) is set EBX[16:23] contain the number of + * apicids which are reserved per package. Store the resulting + * shift value for the package management code. + */ + if (edx & (1U << 28)) + c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); + } + +} + +static void init_zhaoxin(struct cpuinfo_x86 *c) +{ + early_init_zhaoxin(c); + init_intel_cacheinfo(c); + detect_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + + if (c->cpuid_level > 9) { + unsigned int eax = cpuid_eax(10); + + /* + * Check for version and the number of counters + * Version(eax[7:0]) can't be 0; + * Counters(eax[15:8]) should be greater than 1; + */ + if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + + if (c->x86 >= 0x6) + init_zhaoxin_cap(c); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +#endif + + init_ia32_feat_ctl(c); +} + +#ifdef CONFIG_X86_32 +static unsigned int +zhaoxin_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + return size; +} +#endif + +static const struct cpu_dev zhaoxin_cpu_dev = { + .c_vendor = "zhaoxin", + .c_ident = { " Shanghai " }, + .c_early_init = early_init_zhaoxin, + .c_init = init_zhaoxin, +#ifdef CONFIG_X86_32 + .legacy_cache_size = zhaoxin_size_cache, +#endif + .c_x86_vendor = X86_VENDOR_ZHAOXIN, +}; + +cpu_dev_register(zhaoxin_cpu_dev); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c new file mode 100644 index 000000000..3492aa36b --- /dev/null +++ b/arch/x86/kernel/cpuid.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* ----------------------------------------------------------------------- * + * + * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved + * + * ----------------------------------------------------------------------- */ + +/* + * x86 CPUID access device + * + * This device is accessed by lseek() to the appropriate CPUID level + * and then read in chunks of 16 bytes. A larger size means multiple + * reads of consecutive levels. + * + * The lower 32 bits of the file position is used as the incoming %eax, + * and the upper 32 bits of the file position as the incoming %ecx, + * the latter intended for "counting" eax levels like eax=4. + * + * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on + * an SMP box will direct the access to CPU %d. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static struct class *cpuid_class; +static enum cpuhp_state cpuhp_cpuid_state; + +struct cpuid_regs_done { + struct cpuid_regs regs; + struct completion done; +}; + +static void cpuid_smp_cpuid(void *cmd_block) +{ + struct cpuid_regs_done *cmd = cmd_block; + + cpuid_count(cmd->regs.eax, cmd->regs.ecx, + &cmd->regs.eax, &cmd->regs.ebx, + &cmd->regs.ecx, &cmd->regs.edx); + + complete(&cmd->done); +} + +static ssize_t cpuid_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char __user *tmp = buf; + struct cpuid_regs_done cmd; + int cpu = iminor(file_inode(file)); + u64 pos = *ppos; + ssize_t bytes = 0; + int err = 0; + + if (count % 16) + return -EINVAL; /* Invalid chunk size */ + + init_completion(&cmd.done); + for (; count; count -= 16) { + call_single_data_t csd = { + .func = cpuid_smp_cpuid, + .info = &cmd, + }; + + cmd.regs.eax = pos; + cmd.regs.ecx = pos >> 32; + + err = smp_call_function_single_async(cpu, &csd); + if (err) + break; + wait_for_completion(&cmd.done); + if (copy_to_user(tmp, &cmd.regs, 16)) { + err = -EFAULT; + break; + } + tmp += 16; + bytes += 16; + *ppos = ++pos; + reinit_completion(&cmd.done); + } + + return bytes ? bytes : err; +} + +static int cpuid_open(struct inode *inode, struct file *file) +{ + unsigned int cpu; + struct cpuinfo_x86 *c; + + cpu = iminor(file_inode(file)); + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + + c = &cpu_data(cpu); + if (c->cpuid_level < 0) + return -EIO; /* CPUID not supported */ + + return 0; +} + +/* + * File operations we support + */ +static const struct file_operations cpuid_fops = { + .owner = THIS_MODULE, + .llseek = no_seek_end_llseek, + .read = cpuid_read, + .open = cpuid_open, +}; + +static int cpuid_device_create(unsigned int cpu) +{ + struct device *dev; + + dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, + "cpu%d", cpu); + return PTR_ERR_OR_ZERO(dev); +} + +static int cpuid_device_destroy(unsigned int cpu) +{ + device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + return 0; +} + +static char *cpuid_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); +} + +static int __init cpuid_init(void) +{ + int err; + + if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS, + "cpu/cpuid", &cpuid_fops)) { + printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", + CPUID_MAJOR); + return -EBUSY; + } + cpuid_class = class_create(THIS_MODULE, "cpuid"); + if (IS_ERR(cpuid_class)) { + err = PTR_ERR(cpuid_class); + goto out_chrdev; + } + cpuid_class->devnode = cpuid_devnode; + + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online", + cpuid_device_create, cpuid_device_destroy); + if (err < 0) + goto out_class; + + cpuhp_cpuid_state = err; + return 0; + +out_class: + class_destroy(cpuid_class); +out_chrdev: + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); + return err; +} +module_init(cpuid_init); + +static void __exit cpuid_exit(void) +{ + cpuhp_remove_state(cpuhp_cpuid_state); + class_destroy(cpuid_class); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); +} +module_exit(cpuid_exit); + +MODULE_AUTHOR("H. Peter Anvin "); +MODULE_DESCRIPTION("x86 generic CPUID driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c new file mode 100644 index 000000000..a932a07d0 --- /dev/null +++ b/arch/x86/kernel/crash.c @@ -0,0 +1,412 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Architecture specific (i386/x86_64) functions for kexec based crash dumps. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * + * Copyright (C) IBM Corporation, 2004. All rights reserved. + * Copyright (C) Red Hat Inc., 2014. All rights reserved. + * Authors: + * Vivek Goyal + * + */ + +#define pr_fmt(fmt) "kexec: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Used while preparing memory map entries for second kernel */ +struct crash_memmap_data { + struct boot_params *params; + /* Type of memory */ + unsigned int type; +}; + +/* + * This is used to VMCLEAR all VMCSs loaded on the + * processor. And when loading kvm_intel module, the + * callback function pointer will be assigned. + * + * protected by rcu. + */ +crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; +EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); + +static inline void cpu_crash_vmclear_loaded_vmcss(void) +{ + crash_vmclear_fn *do_vmclear_operation = NULL; + + rcu_read_lock(); + do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); + if (do_vmclear_operation) + do_vmclear_operation(); + rcu_read_unlock(); +} + +/* + * When the crashkernel option is specified, only use the low + * 1M for the real mode trampoline. + */ +void __init crash_reserve_low_1M(void) +{ + if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0) + return; + + memblock_reserve(0, 1<<20); + pr_info("Reserving the low 1M of memory for crashkernel\n"); +} + +#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) + +static void kdump_nmi_callback(int cpu, struct pt_regs *regs) +{ + crash_save_cpu(regs, cpu); + + /* + * VMCLEAR VMCSs loaded on all cpus if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + + /* + * Disable Intel PT to stop its logging + */ + cpu_emergency_stop_pt(); + + disable_local_APIC(); +} + +void kdump_nmi_shootdown_cpus(void) +{ + nmi_shootdown_cpus(kdump_nmi_callback); + + disable_local_APIC(); +} + +/* Override the weak function in kernel/panic.c */ +void crash_smp_send_stop(void) +{ + static int cpus_stopped; + + if (cpus_stopped) + return; + + if (smp_ops.crash_stop_other_cpus) + smp_ops.crash_stop_other_cpus(); + else + smp_send_stop(); + + cpus_stopped = 1; +} + +#else +void crash_smp_send_stop(void) +{ + /* There are no cpus to shootdown */ +} +#endif + +void native_machine_crash_shutdown(struct pt_regs *regs) +{ + /* This function is only called after the system + * has panicked or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means shooting down the other cpus in + * an SMP system. + */ + /* The kernel is broken so disable interrupts */ + local_irq_disable(); + + crash_smp_send_stop(); + + /* + * VMCLEAR VMCSs loaded on this cpu if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + + cpu_emergency_disable_virtualization(); + + /* + * Disable Intel PT to stop its logging + */ + cpu_emergency_stop_pt(); + +#ifdef CONFIG_X86_IO_APIC + /* Prevent crash_kexec() from deadlocking on ioapic_lock. */ + ioapic_zap_locks(); + clear_IO_APIC(); +#endif + lapic_shutdown(); + restore_boot_irq_mode(); +#ifdef CONFIG_HPET_TIMER + hpet_disable(); +#endif + crash_save_cpu(regs, safe_smp_processor_id()); +} + +#ifdef CONFIG_KEXEC_FILE + +static int get_nr_ram_ranges_callback(struct resource *res, void *arg) +{ + unsigned int *nr_ranges = arg; + + (*nr_ranges)++; + return 0; +} + +/* Gather all the required information to prepare elf headers for ram regions */ +static struct crash_mem *fill_up_crash_elf_data(void) +{ + unsigned int nr_ranges = 0; + struct crash_mem *cmem; + + walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); + if (!nr_ranges) + return NULL; + + /* + * Exclusion of crash region and/or crashk_low_res may cause + * another range split. So add extra two slots here. + */ + nr_ranges += 2; + cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); + if (!cmem) + return NULL; + + cmem->max_nr_ranges = nr_ranges; + cmem->nr_ranges = 0; + + return cmem; +} + +/* + * Look for any unwanted ranges between mstart, mend and remove them. This + * might lead to split and split ranges are put in cmem->ranges[] array + */ +static int elf_header_exclude_ranges(struct crash_mem *cmem) +{ + int ret = 0; + + /* Exclude the low 1M because it is always reserved */ + ret = crash_exclude_mem_range(cmem, 0, (1<<20)-1); + if (ret) + return ret; + + /* Exclude crashkernel region */ + ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (ret) + return ret; + + if (crashk_low_res.end) + ret = crash_exclude_mem_range(cmem, crashk_low_res.start, + crashk_low_res.end); + + return ret; +} + +static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) +{ + struct crash_mem *cmem = arg; + + cmem->ranges[cmem->nr_ranges].start = res->start; + cmem->ranges[cmem->nr_ranges].end = res->end; + cmem->nr_ranges++; + + return 0; +} + +/* Prepare elf headers. Return addr and size */ +static int prepare_elf_headers(struct kimage *image, void **addr, + unsigned long *sz) +{ + struct crash_mem *cmem; + int ret; + + cmem = fill_up_crash_elf_data(); + if (!cmem) + return -ENOMEM; + + ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback); + if (ret) + goto out; + + /* Exclude unwanted mem ranges */ + ret = elf_header_exclude_ranges(cmem); + if (ret) + goto out; + + /* By default prepare 64bit headers */ + ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz); + +out: + vfree(cmem); + return ret; +} + +static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = params->e820_entries; + if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE) + return 1; + + memcpy(¶ms->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry)); + params->e820_entries++; + return 0; +} + +static int memmap_entry_callback(struct resource *res, void *arg) +{ + struct crash_memmap_data *cmd = arg; + struct boot_params *params = cmd->params; + struct e820_entry ei; + + ei.addr = res->start; + ei.size = resource_size(res); + ei.type = cmd->type; + add_e820_entry(params, &ei); + + return 0; +} + +static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, + unsigned long long mstart, + unsigned long long mend) +{ + unsigned long start, end; + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude elf header region */ + start = image->arch.elf_load_addr; + end = start + image->arch.elf_headers_sz - 1; + return crash_exclude_mem_range(cmem, start, end); +} + +/* Prepare memory map for crash dump kernel */ +int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) +{ + int i, ret = 0; + unsigned long flags; + struct e820_entry ei; + struct crash_memmap_data cmd; + struct crash_mem *cmem; + + cmem = vzalloc(struct_size(cmem, ranges, 1)); + if (!cmem) + return -ENOMEM; + + memset(&cmd, 0, sizeof(struct crash_memmap_data)); + cmd.params = params; + + /* Add the low 1M */ + cmd.type = E820_TYPE_RAM; + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + walk_iomem_res_desc(IORES_DESC_NONE, flags, 0, (1<<20)-1, &cmd, + memmap_entry_callback); + + /* Add ACPI tables */ + cmd.type = E820_TYPE_ACPI; + flags = IORESOURCE_MEM | IORESOURCE_BUSY; + walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add ACPI Non-volatile Storage */ + cmd.type = E820_TYPE_NVS; + walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add e820 reserved ranges */ + cmd.type = E820_TYPE_RESERVED; + flags = IORESOURCE_MEM; + walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add crashk_low_res region */ + if (crashk_low_res.end) { + ei.addr = crashk_low_res.start; + ei.size = resource_size(&crashk_low_res); + ei.type = E820_TYPE_RAM; + add_e820_entry(params, &ei); + } + + /* Exclude some ranges from crashk_res and add rest to memmap */ + ret = memmap_exclude_ranges(image, cmem, crashk_res.start, crashk_res.end); + if (ret) + goto out; + + for (i = 0; i < cmem->nr_ranges; i++) { + ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1; + + /* If entry is less than a page, skip it */ + if (ei.size < PAGE_SIZE) + continue; + ei.addr = cmem->ranges[i].start; + ei.type = E820_TYPE_RAM; + add_e820_entry(params, &ei); + } + +out: + vfree(cmem); + return ret; +} + +int crash_load_segments(struct kimage *image) +{ + int ret; + struct kexec_buf kbuf = { .image = image, .buf_min = 0, + .buf_max = ULONG_MAX, .top_down = false }; + + /* Prepare elf headers and add a segment */ + ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz); + if (ret) + return ret; + + image->arch.elf_headers = kbuf.buffer; + image->arch.elf_headers_sz = kbuf.bufsz; + + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_add_buffer(&kbuf); + if (ret) { + vfree((void *)image->arch.elf_headers); + return ret; + } + image->arch.elf_load_addr = kbuf.mem; + pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz); + + return ret; +} +#endif /* CONFIG_KEXEC_FILE */ diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/crash_core_32.c new file mode 100644 index 000000000..8a89c109e --- /dev/null +++ b/arch/x86/kernel/crash_core_32.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +#include + +void arch_crash_save_vmcoreinfo(void) +{ +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +#ifdef CONFIG_X86_PAE + VMCOREINFO_CONFIG(X86_PAE); +#endif +} diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/crash_core_64.c new file mode 100644 index 000000000..7d255f882 --- /dev/null +++ b/arch/x86/kernel/crash_core_64.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +#include + +void arch_crash_save_vmcoreinfo(void) +{ + u64 sme_mask = sme_me_mask; + + VMCOREINFO_NUMBER(phys_base); + VMCOREINFO_SYMBOL(init_top_pgt); + vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n", + pgtable_l5_enabled()); + +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); + VMCOREINFO_NUMBER(sme_mask); +} diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c new file mode 100644 index 000000000..33ee47670 --- /dev/null +++ b/arch/x86/kernel/crash_dump_32.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory preserving reboot related code. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + */ + +#include +#include +#include +#include + +#include + +static void *kdump_buf_page; + +static inline bool is_crashed_pfn_valid(unsigned long pfn) +{ +#ifndef CONFIG_X86_PAE + /* + * non-PAE kdump kernel executed from a PAE one will crop high pte + * bits and poke unwanted space counting again from address 0, we + * don't want that. pte must fit into unsigned long. In fact the + * test checks high 12 bits for being zero (pfn will be shifted left + * by PAGE_SHIFT). + */ + return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn; +#else + return true; +#endif +} + +/** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. + * + * Calling copy_to_user() in atomic context is not desirable. Hence first + * copying the data to a pre-allocated kernel page and then copying to user + * space in non-atomic context. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + + if (!is_crashed_pfn_valid(pfn)) + return -EFAULT; + + vaddr = kmap_atomic_pfn(pfn); + + if (!userbuf) { + memcpy(buf, (vaddr + offset), csize); + kunmap_atomic(vaddr); + } else { + if (!kdump_buf_page) { + printk(KERN_WARNING "Kdump: Kdump buffer page not" + " allocated\n"); + kunmap_atomic(vaddr); + return -EFAULT; + } + copy_page(kdump_buf_page, vaddr); + kunmap_atomic(vaddr); + if (copy_to_user(buf, (kdump_buf_page + offset), csize)) + return -EFAULT; + } + + return csize; +} + +static int __init kdump_buf_page_init(void) +{ + int ret = 0; + + kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!kdump_buf_page) { + printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer" + " page\n"); + ret = -ENOMEM; + } + + return ret; +} +arch_initcall(kdump_buf_page_init); diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c new file mode 100644 index 000000000..045e82e89 --- /dev/null +++ b/arch/x86/kernel/crash_dump_64.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory preserving reboot related code. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + */ + +#include +#include +#include +#include + +static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf, + bool encrypted) +{ + void *vaddr; + + if (!csize) + return 0; + + if (encrypted) + vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE); + else + vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); + + if (!vaddr) + return -ENOMEM; + + if (userbuf) { + if (copy_to_user((void __user *)buf, vaddr + offset, csize)) { + iounmap((void __iomem *)vaddr); + return -EFAULT; + } + } else + memcpy(buf, vaddr + offset, csize); + + set_iounmap_nonlazy(); + iounmap((void __iomem *)vaddr); + return csize; +} + +/** + * copy_oldmem_page - copy one page of memory + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from the old kernel's memory. For this page, there is no pte + * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false); +} + +/** + * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the + * memory with the encryption mask set to accommodate kdump on SME-enabled + * machines. + */ +ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); +} + +ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0, sev_active()); +} diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c new file mode 100644 index 000000000..ddffd80f5 --- /dev/null +++ b/arch/x86/kernel/devicetree.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Architecture specific OF callbacks. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +__initdata u64 initial_dtb; +char __initdata cmd_line[COMMAND_LINE_SIZE]; + +int __initdata of_ioapic; + +void __init early_init_dt_scan_chosen_arch(unsigned long node) +{ + BUG(); +} + +void __init early_init_dt_add_memory_arch(u64 base, u64 size) +{ + BUG(); +} + +void __init add_dtb(u64 data) +{ + initial_dtb = data + offsetof(struct setup_data, data); +} + +/* + * CE4100 ids. Will be moved to machine_device_initcall() once we have it. + */ +static struct of_device_id __initdata ce4100_ids[] = { + { .compatible = "intel,ce4100-cp", }, + { .compatible = "isa", }, + { .compatible = "pci", }, + {}, +}; + +static int __init add_bus_probe(void) +{ + if (!of_have_populated_dt()) + return 0; + + return of_platform_bus_probe(NULL, ce4100_ids, NULL); +} +device_initcall(add_bus_probe); + +#ifdef CONFIG_PCI +struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) +{ + struct device_node *np; + + for_each_node_by_type(np, "pci") { + const void *prop; + unsigned int bus_min; + + prop = of_get_property(np, "bus-range", NULL); + if (!prop) + continue; + bus_min = be32_to_cpup(prop); + if (bus->number == bus_min) + return np; + } + return NULL; +} + +static int x86_of_pci_irq_enable(struct pci_dev *dev) +{ + u32 virq; + int ret; + u8 pin; + + ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (ret) + return ret; + if (!pin) + return 0; + + virq = of_irq_parse_and_map_pci(dev, 0, 0); + if (virq == 0) + return -EINVAL; + dev->irq = virq; + return 0; +} + +static void x86_of_pci_irq_disable(struct pci_dev *dev) +{ +} + +void x86_of_pci_init(void) +{ + pcibios_enable_irq = x86_of_pci_irq_enable; + pcibios_disable_irq = x86_of_pci_irq_disable; +} +#endif + +static void __init dtb_setup_hpet(void) +{ +#ifdef CONFIG_HPET_TIMER + struct device_node *dn; + struct resource r; + int ret; + + dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet"); + if (!dn) + return; + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + WARN_ON(1); + return; + } + hpet_address = r.start; +#endif +} + +#ifdef CONFIG_X86_LOCAL_APIC + +static void __init dtb_cpu_setup(void) +{ + struct device_node *dn; + u32 apic_id, version; + int ret; + + version = GET_APIC_VERSION(apic_read(APIC_LVR)); + for_each_of_cpu_node(dn) { + ret = of_property_read_u32(dn, "reg", &apic_id); + if (ret < 0) { + pr_warn("%pOF: missing local APIC ID\n", dn); + continue; + } + generic_processor_info(apic_id, version); + } +} + +static void __init dtb_lapic_setup(void) +{ + struct device_node *dn; + struct resource r; + unsigned long lapic_addr = APIC_DEFAULT_PHYS_BASE; + int ret; + + dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic"); + if (dn) { + ret = of_address_to_resource(dn, 0, &r); + if (WARN_ON(ret)) + return; + lapic_addr = r.start; + } + + /* Did the boot loader setup the local APIC ? */ + if (!boot_cpu_has(X86_FEATURE_APIC)) { + if (apic_force_enable(lapic_addr)) + return; + } + smp_found_config = 1; + pic_mode = 1; + register_lapic_address(lapic_addr); +} + +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +static unsigned int ioapic_id; + +struct of_ioapic_type { + u32 out_type; + u32 trigger; + u32 polarity; +}; + +static struct of_ioapic_type of_ioapic_type[] = +{ + { + .out_type = IRQ_TYPE_EDGE_RISING, + .trigger = IOAPIC_EDGE, + .polarity = 1, + }, + { + .out_type = IRQ_TYPE_LEVEL_LOW, + .trigger = IOAPIC_LEVEL, + .polarity = 0, + }, + { + .out_type = IRQ_TYPE_LEVEL_HIGH, + .trigger = IOAPIC_LEVEL, + .polarity = 1, + }, + { + .out_type = IRQ_TYPE_EDGE_FALLING, + .trigger = IOAPIC_EDGE, + .polarity = 0, + }, +}; + +static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_fwspec *fwspec = (struct irq_fwspec *)arg; + struct of_ioapic_type *it; + struct irq_alloc_info tmp; + int type_index; + + if (WARN_ON(fwspec->param_count < 2)) + return -EINVAL; + + type_index = fwspec->param[1]; + if (type_index >= ARRAY_SIZE(of_ioapic_type)) + return -EINVAL; + + it = &of_ioapic_type[type_index]; + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); + tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); + tmp.ioapic.pin = fwspec->param[0]; + + return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); +} + +static const struct irq_domain_ops ioapic_irq_domain_ops = { + .alloc = dt_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, +}; + +static void __init dtb_add_ioapic(struct device_node *dn) +{ + struct resource r; + int ret; + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_DYNAMIC, + .ops = &ioapic_irq_domain_ops, + .dev = dn, + }; + + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + printk(KERN_ERR "Can't obtain address from device node %pOF.\n", dn); + return; + } + mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg); +} + +static void __init dtb_ioapic_setup(void) +{ + struct device_node *dn; + + for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic") + dtb_add_ioapic(dn); + + if (nr_ioapics) { + of_ioapic = 1; + return; + } + printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); +} +#else +static void __init dtb_ioapic_setup(void) {} +#endif + +static void __init dtb_apic_setup(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + dtb_lapic_setup(); + dtb_cpu_setup(); +#endif + dtb_ioapic_setup(); +} + +#ifdef CONFIG_OF_EARLY_FLATTREE +static void __init x86_flattree_get_config(void) +{ + u32 size, map_len; + void *dt; + + if (!initial_dtb) + return; + + map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); + + dt = early_memremap(initial_dtb, map_len); + size = fdt_totalsize(dt); + if (map_len < size) { + early_memunmap(dt, map_len); + dt = early_memremap(initial_dtb, size); + map_len = size; + } + + early_init_dt_verify(dt); + unflatten_and_copy_device_tree(); + early_memunmap(dt, map_len); +} +#else +static inline void x86_flattree_get_config(void) { } +#endif + +void __init x86_dtb_init(void) +{ + x86_flattree_get_config(); + + if (!of_have_populated_dt()) + return; + + dtb_setup_hpet(); + dtb_apic_setup(); +} diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c new file mode 100644 index 000000000..759d392cb --- /dev/null +++ b/arch/x86/kernel/doublefault_32.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) + +#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x) + +static void set_df_gdt_entry(unsigned int cpu); + +/* + * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks + * we're running the doublefault task. Cannot return. + */ +asmlinkage noinstr void __noreturn doublefault_shim(void) +{ + unsigned long cr2; + struct pt_regs regs; + + BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE); + + cr2 = native_read_cr2(); + + /* Reset back to the normal kernel task. */ + force_reload_TR(); + set_df_gdt_entry(smp_processor_id()); + + trace_hardirqs_off(); + + /* + * Fill in pt_regs. A downside of doing this in C is that the unwinder + * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump + * won't successfully unwind to the source of the double fault. + * The main dump from exc_double_fault() is fine, though, since it + * uses these regs directly. + * + * If anyone ever cares, this could be moved to asm. + */ + regs.ss = TSS(ss); + regs.__ssh = 0; + regs.sp = TSS(sp); + regs.flags = TSS(flags); + regs.cs = TSS(cs); + /* We won't go through the entry asm, so we can leave __csh as 0. */ + regs.__csh = 0; + regs.ip = TSS(ip); + regs.orig_ax = 0; + regs.gs = TSS(gs); + regs.__gsh = 0; + regs.fs = TSS(fs); + regs.__fsh = 0; + regs.es = TSS(es); + regs.__esh = 0; + regs.ds = TSS(ds); + regs.__dsh = 0; + regs.ax = TSS(ax); + regs.bp = TSS(bp); + regs.di = TSS(di); + regs.si = TSS(si); + regs.dx = TSS(dx); + regs.cx = TSS(cx); + regs.bx = TSS(bx); + + exc_double_fault(®s, 0, cr2); + + /* + * x86_32 does not save the original CR3 anywhere on a task switch. + * This means that, even if we wanted to return, we would need to find + * some way to reconstruct CR3. We could make a credible guess based + * on cpu_tlbstate, but that would be racy and would not account for + * PTI. + * + * Instead, don't bother. We can return through + * rewind_stack_do_exit() instead. + */ + panic("cannot return from double fault\n"); +} + +DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { + .tss = { + /* + * No sp0 or ss0 -- we never run CPL != 0 with this TSS + * active. sp is filled in later. + */ + .ldt = 0, + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, + + .ip = (unsigned long) asm_exc_double_fault, + .flags = X86_EFLAGS_FIXED, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, +#ifndef CONFIG_X86_32_LAZY_GS + .gs = __KERNEL_STACK_CANARY, +#endif + + .__cr3 = __pa_nodebug(swapper_pg_dir), + }, +}; + +static void set_df_gdt_entry(unsigned int cpu) +{ + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, + &get_cpu_entry_area(cpu)->doublefault_stack.tss); + +} + +void doublefault_init_cpu_tss(void) +{ + unsigned int cpu = smp_processor_id(); + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + + /* + * The linker isn't smart enough to initialize percpu variables that + * point to other places in percpu space. + */ + this_cpu_write(doublefault_stack.tss.sp, + (unsigned long)&cea->doublefault_stack.stack + + sizeof(doublefault_stack.stack)); + + set_df_gdt_entry(cpu); +} diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c new file mode 100644 index 000000000..b9736aac2 --- /dev/null +++ b/arch/x86/kernel/dumpstack.c @@ -0,0 +1,479 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +int panic_on_unrecovered_nmi; +int panic_on_io_nmi; +static int die_counter; + +static struct pt_regs exec_summary_regs; + +bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info) +{ + unsigned long *begin = task_stack_page(task); + unsigned long *end = task_stack_page(task) + THREAD_SIZE; + + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_TASK; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + +/* Called from get_stack_info_noinstr - so must be noinstr too */ +bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info) +{ + struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); + + void *begin = ss; + void *end = ss + 1; + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_ENTRY; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + +static void printk_stack_address(unsigned long address, int reliable, + const char *log_lvl) +{ + touch_nmi_watchdog(); + printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); +} + +static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src, + unsigned int nbytes) +{ + if (!user_mode(regs)) + return copy_from_kernel_nofault(buf, (u8 *)src, nbytes); + + /* The user space code from other tasks cannot be accessed. */ + if (regs != task_pt_regs(current)) + return -EPERM; + /* + * Make sure userspace isn't trying to trick us into dumping kernel + * memory by pointing the userspace instruction pointer at it. + */ + if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX)) + return -EINVAL; + + /* + * Even if named copy_from_user_nmi() this can be invoked from + * other contexts and will not try to resolve a pagefault, which is + * the correct thing to do here as this code can be called from any + * context. + */ + return copy_from_user_nmi(buf, (void __user *)src, nbytes); +} + +/* + * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus: + * + * In case where we don't have the exact kernel image (which, if we did, we can + * simply disassemble and navigate to the RIP), the purpose of the bigger + * prologue is to have more context and to be able to correlate the code from + * the different toolchains better. + * + * In addition, it helps in recreating the register allocation of the failing + * kernel and thus make sense of the register dump. + * + * What is more, the additional complication of a variable length insn arch like + * x86 warrants having longer byte sequence before rIP so that the disassembler + * can "sync" up properly and find instruction boundaries when decoding the + * opcode bytes. + * + * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random + * guesstimate in attempt to achieve all of the above. + */ +void show_opcodes(struct pt_regs *regs, const char *loglvl) +{ +#define PROLOGUE_SIZE 42 +#define EPILOGUE_SIZE 21 +#define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE) + u8 opcodes[OPCODE_BUFSIZE]; + unsigned long prologue = regs->ip - PROLOGUE_SIZE; + + switch (copy_code(regs, opcodes, prologue, sizeof(opcodes))) { + case 0: + printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %" + __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes, + opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1); + break; + case -EPERM: + /* No access to the user space stack of other tasks. Ignore. */ + break; + default: + printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n", + loglvl, prologue); + break; + } +} + +void show_ip(struct pt_regs *regs, const char *loglvl) +{ +#ifdef CONFIG_X86_32 + printk("%sEIP: %pS\n", loglvl, (void *)regs->ip); +#else + printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip); +#endif + show_opcodes(regs, loglvl); +} + +void show_iret_regs(struct pt_regs *regs, const char *log_lvl) +{ + show_ip(regs, log_lvl); + printk("%sRSP: %04x:%016lx EFLAGS: %08lx", log_lvl, (int)regs->ss, + regs->sp, regs->flags); +} + +static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, + bool partial, const char *log_lvl) +{ + /* + * These on_stack() checks aren't strictly necessary: the unwind code + * has already validated the 'regs' pointer. The checks are done for + * ordering reasons: if the registers are on the next stack, we don't + * want to print them out yet. Otherwise they'll be shown as part of + * the wrong stack. Later, when show_trace_log_lvl() switches to the + * next stack, this function will be called again with the same regs so + * they can be printed in the right context. + */ + if (!partial && on_stack(info, regs, sizeof(*regs))) { + __show_regs(regs, SHOW_REGS_SHORT, log_lvl); + + } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, + IRET_FRAME_SIZE)) { + /* + * When an interrupt or exception occurs in entry code, the + * full pt_regs might not have been saved yet. In that case + * just print the iret frame. + */ + show_iret_regs(regs, log_lvl); + } +} + +void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, const char *log_lvl) +{ + struct unwind_state state; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; + int graph_idx = 0; + bool partial = false; + + printk("%sCall Trace:\n", log_lvl); + + unwind_start(&state, task, regs, stack); + regs = unwind_get_entry_regs(&state, &partial); + + /* + * Iterate through the stacks, starting with the current stack pointer. + * Each stack has a pointer to the next one. + * + * x86-64 can have several stacks: + * - task stack + * - interrupt stack + * - HW exception stacks (double fault, nmi, debug, mce) + * - entry stack + * + * x86-32 can have up to four stacks: + * - task stack + * - softirq stack + * - hardirq stack + * - entry stack + */ + for (stack = stack ?: get_stack_pointer(task, regs); + stack; + stack = stack_info.next_sp) { + const char *stack_name; + + stack = PTR_ALIGN(stack, sizeof(long)); + + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. + */ + stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack); + if (get_stack_info(stack, task, &stack_info, &visit_mask)) + break; + } + + stack_name = stack_type_name(stack_info.type); + if (stack_name) + printk("%s <%s>\n", log_lvl, stack_name); + + if (regs) + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); + + /* + * Scan the stack, printing any text addresses we find. At the + * same time, follow proper stack frames with the unwinder. + * + * Addresses found during the scan which are not reported by + * the unwinder are considered to be additional clues which are + * sometimes useful for debugging and are prefixed with '?'. + * This also serves as a failsafe option in case the unwinder + * goes off in the weeds. + */ + for (; stack < stack_info.end; stack++) { + unsigned long real_addr; + int reliable = 0; + unsigned long addr = READ_ONCE_NOCHECK(*stack); + unsigned long *ret_addr_p = + unwind_get_return_address_ptr(&state); + + if (!__kernel_text_address(addr)) + continue; + + /* + * Don't print regs->ip again if it was already printed + * by show_regs_if_on_stack(). + */ + if (regs && stack == ®s->ip) + goto next; + + if (stack == ret_addr_p) + reliable = 1; + + /* + * When function graph tracing is enabled for a + * function, its return address on the stack is + * replaced with the address of an ftrace handler + * (return_to_handler). In that case, before printing + * the "real" address, we want to print the handler + * address as an "unreliable" hint that function graph + * tracing was involved. + */ + real_addr = ftrace_graph_ret_addr(task, &graph_idx, + addr, stack); + if (real_addr != addr) + printk_stack_address(addr, 0, log_lvl); + printk_stack_address(real_addr, reliable, log_lvl); + + if (!reliable) + continue; + +next: + /* + * Get the next frame from the unwinder. No need to + * check for an error: if anything goes wrong, the rest + * of the addresses will just be printed as unreliable. + */ + unwind_next_frame(&state); + + /* if the frame has entry regs, print them */ + regs = unwind_get_entry_regs(&state, &partial); + if (regs) + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); + } + + if (stack_name) + printk("%s \n", log_lvl, stack_name); + } +} + +void show_stack(struct task_struct *task, unsigned long *sp, + const char *loglvl) +{ + task = task ? : current; + + /* + * Stack frames below this one aren't interesting. Don't show them + * if we're printing for %current. + */ + if (!sp && task == current) + sp = get_stack_pointer(current, NULL); + + show_trace_log_lvl(task, NULL, sp, loglvl); +} + +void show_stack_regs(struct pt_regs *regs) +{ + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); +} + +static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned long oops_begin(void) +{ + int cpu; + unsigned long flags; + + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!arch_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + arch_spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + return flags; +} +NOKPROBE_SYMBOL(oops_begin); + +void __noreturn rewind_stack_and_make_dead(int signr); + +void oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. */ + arch_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + oops_exit(); + + /* Executive summary in case the oops scrolled away */ + __show_regs(&exec_summary_regs, SHOW_REGS_ALL, KERN_DEFAULT); + + if (!signr) + return; + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + + /* + * We're not going to return, but we might be on an IST stack or + * have very little stack space left. Rewind the stack and kill + * the task. + * Before we rewind the stack, we have to tell KASAN that we're going to + * reuse the task stack and that existing poisons are invalid. + */ + kasan_unpoison_task_stack(current); + rewind_stack_and_make_dead(signr); +} +NOKPROBE_SYMBOL(oops_end); + +static void __die_header(const char *str, struct pt_regs *regs, long err) +{ + const char *pr = ""; + + /* Save the regs of the first oops for the executive summary later. */ + if (!die_counter) + exec_summary_regs = *regs; + + if (IS_ENABLED(CONFIG_PREEMPTION)) + pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; + + printk(KERN_DEFAULT + "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, + pr, + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", + IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? + (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); +} +NOKPROBE_SYMBOL(__die_header); + +static int __die_body(const char *str, struct pt_regs *regs, long err) +{ + show_regs(regs); + print_modules(); + + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) + return 1; + + return 0; +} +NOKPROBE_SYMBOL(__die_body); + +int __die(const char *str, struct pt_regs *regs, long err) +{ + __die_header(str, regs, err); + return __die_body(str, regs, err); +} +NOKPROBE_SYMBOL(__die); + +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + int sig = SIGSEGV; + + if (__die(str, regs, err)) + sig = 0; + oops_end(flags, regs, sig); +} + +void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr) +{ + unsigned long flags = oops_begin(); + int sig = SIGSEGV; + + __die_header(str, regs, err); + if (gp_addr) + kasan_non_canonical_hook(gp_addr); + if (__die_body(str, regs, err)) + sig = 0; + oops_end(flags, regs, sig); +} + +void show_regs(struct pt_regs *regs) +{ + enum show_regs_mode print_kernel_regs; + + show_regs_print_info(KERN_DEFAULT); + + print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL; + __show_regs(regs, print_kernel_regs, KERN_DEFAULT); + + /* + * When in-kernel, we also print out the stack at the time of the fault.. + */ + if (!user_mode(regs)) + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); +} diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c new file mode 100644 index 000000000..722fd712e --- /dev/null +++ b/arch/x86/kernel/dumpstack_32.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +const char *stack_type_name(enum stack_type type) +{ + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + + if (type == STACK_TYPE_ENTRY) + return "ENTRY_TRAMPOLINE"; + + if (type == STACK_TYPE_EXCEPTION) + return "#DF"; + + return NULL; +} + +static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) +{ + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); + + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) + return false; + + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; + + /* + * See irq_32.c -- the next stack pointer is stored at the beginning of + * the stack. + */ + info->next_sp = (unsigned long *)*begin; + + return true; +} + +static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) +{ + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); + + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) + return false; + + info->type = STACK_TYPE_SOFTIRQ; + info->begin = begin; + info->end = end; + + /* + * The next stack pointer is stored at the beginning of the stack. + * See irq_32.c. + */ + info->next_sp = (unsigned long *)*begin; + + return true; +} + +static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) +{ + struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id()); + struct doublefault_stack *ss = &cea->doublefault_stack; + + void *begin = ss->stack; + void *end = begin + sizeof(ss->stack); + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_EXCEPTION; + info->begin = begin; + info->end = end; + info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp); + + return true; +} + + +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + if (!stack) + goto unknown; + + task = task ? : current; + + if (in_task_stack(stack, task, info)) + goto recursion_check; + + if (task != current) + goto unknown; + + if (in_entry_stack(stack, info)) + goto recursion_check; + + if (in_hardirq_stack(stack, info)) + goto recursion_check; + + if (in_softirq_stack(stack, info)) + goto recursion_check; + + if (in_doublefault_stack(stack, info)) + goto recursion_check; + + goto unknown; + +recursion_check: + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); + goto unknown; + } + *visit_mask |= 1UL << info->type; + } + + return 0; + +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; +} diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c new file mode 100644 index 000000000..1dd851397 --- /dev/null +++ b/arch/x86/kernel/dumpstack_64.c @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static const char * const exception_stack_names[] = { + [ ESTACK_DF ] = "#DF", + [ ESTACK_NMI ] = "NMI", + [ ESTACK_DB ] = "#DB", + [ ESTACK_MCE ] = "#MC", + [ ESTACK_VC ] = "#VC", + [ ESTACK_VC2 ] = "#VC2", +}; + +const char *stack_type_name(enum stack_type type) +{ + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); + + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type == STACK_TYPE_ENTRY) { + /* + * On 64-bit, we have a generic entry stack that we + * use for all the kernel entry points, including + * SYSENTER. + */ + return "ENTRY_TRAMPOLINE"; + } + + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) + return exception_stack_names[type - STACK_TYPE_EXCEPTION]; + + return NULL; +} + +/** + * struct estack_pages - Page descriptor for exception stacks + * @offs: Offset from the start of the exception stack area + * @size: Size of the exception stack + * @type: Type to store in the stack_info struct + */ +struct estack_pages { + u32 offs; + u16 size; + u16 type; +}; + +#define EPAGERANGE(st) \ + [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ + PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ + .offs = CEA_ESTACK_OFFS(st), \ + .size = CEA_ESTACK_SIZE(st), \ + .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, } + +/* + * Array of exception stack page descriptors. If the stack is larger than + * PAGE_SIZE, all pages covering a particular stack will have the same + * info. The guard pages including the not mapped DB2 stack are zeroed + * out. + */ +static const +struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { + EPAGERANGE(DF), + EPAGERANGE(NMI), + EPAGERANGE(DB), + EPAGERANGE(MCE), + EPAGERANGE(VC), + EPAGERANGE(VC2), +}; + +static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info) +{ + unsigned long begin, end, stk = (unsigned long)stack; + const struct estack_pages *ep; + struct pt_regs *regs; + unsigned int k; + + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); + + begin = (unsigned long)__this_cpu_read(cea_exception_stacks); + /* + * Handle the case where stack trace is collected _before_ + * cea_exception_stacks had been initialized. + */ + if (!begin) + return false; + + end = begin + sizeof(struct cea_exception_stacks); + /* Bail if @stack is outside the exception stack area. */ + if (stk < begin || stk >= end) + return false; + + /* Calc page offset from start of exception stacks */ + k = (stk - begin) >> PAGE_SHIFT; + /* Lookup the page descriptor */ + ep = &estack_pages[k]; + /* Guard page? */ + if (!ep->size) + return false; + + begin += (unsigned long)ep->offs; + end = begin + (unsigned long)ep->size; + regs = (struct pt_regs *)end - 1; + + info->type = ep->type; + info->begin = (unsigned long *)begin; + info->end = (unsigned long *)end; + info->next_sp = (unsigned long *)regs->sp; + return true; +} + +static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) +{ + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); + + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; + + /* + * The next stack pointer is the first thing pushed by the entry code + * after switching to the irq stack. + */ + info->next_sp = (unsigned long *)*(end - 1); + + return true; +} + +bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, + struct stack_info *info) +{ + if (in_task_stack(stack, task, info)) + return true; + + if (task != current) + return false; + + if (in_exception_stack(stack, info)) + return true; + + if (in_irq_stack(stack, info)) + return true; + + if (in_entry_stack(stack, info)) + return true; + + return false; +} + +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + task = task ? : current; + + if (!stack) + goto unknown; + + if (!get_stack_info_noinstr(stack, task, info)) + goto unknown; + + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) { + if (task == current) + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); + goto unknown; + } + *visit_mask |= 1UL << info->type; + } + + return 0; + +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; +} diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c new file mode 100644 index 000000000..7f57110f9 --- /dev/null +++ b/arch/x86/kernel/e820.c @@ -0,0 +1,1350 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Low level x86 E820 memory map handling functions. + * + * The firmware and bootloader passes us the "E820 table", which is the primary + * physical memory layout description available about x86 systems. + * + * The kernel takes the E820 memory layout and optionally modifies it with + * quirks and other tweaks, and feeds that into the generic Linux memory + * allocation code routines via a platform independent interface (memblock, etc.). + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * We organize the E820 table into three main data structures: + * + * - 'e820_table_firmware': the original firmware version passed to us by the + * bootloader - not modified by the kernel. It is composed of two parts: + * the first 128 E820 memory entries in boot_params.e820_table and the remaining + * (if any) entries of the SETUP_E820_EXT nodes. We use this to: + * + * - inform the user about the firmware's notion of memory layout + * via /sys/firmware/memmap + * + * - the hibernation code uses it to generate a kernel-independent CRC32 + * checksum of the physical memory layout of a system. + * + * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version + * passed to us by the bootloader - the major difference between + * e820_table_firmware[] and this one is that, the latter marks the setup_data + * list created by the EFI boot stub as reserved, so that kexec can reuse the + * setup_data information in the second kernel. Besides, e820_table_kexec[] + * might also be modified by the kexec itself to fake a mptable. + * We use this to: + * + * - kexec, which is a bootloader in disguise, uses the original E820 + * layout to pass to the kexec-ed kernel. This way the original kernel + * can have a restricted E820 map while the kexec()-ed kexec-kernel + * can have access to full memory - etc. + * + * - 'e820_table': this is the main E820 table that is massaged by the + * low level x86 platform code, or modified by boot parameters, before + * passed on to higher level MM layers. + * + * Once the E820 map has been converted to the standard Linux memory layout + * information its role stops - modifying it has no effect and does not get + * re-propagated. So itsmain role is a temporary bootstrap storage of firmware + * specific memory layout data during early bootup. + */ +static struct e820_table e820_table_init __initdata; +static struct e820_table e820_table_kexec_init __initdata; +static struct e820_table e820_table_firmware_init __initdata; + +struct e820_table *e820_table __refdata = &e820_table_init; +struct e820_table *e820_table_kexec __refdata = &e820_table_kexec_init; +struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init; + +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0xaeedbabe; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif + +/* + * This function checks if any part of the range is mapped + * with type. + */ +static bool _e820__mapped_any(struct e820_table *table, + u64 start, u64 end, enum e820_type type) +{ + int i; + + for (i = 0; i < table->nr_entries; i++) { + struct e820_entry *entry = &table->entries[i]; + + if (type && entry->type != type) + continue; + if (entry->addr >= end || entry->addr + entry->size <= start) + continue; + return true; + } + return false; +} + +bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type) +{ + return _e820__mapped_any(e820_table_firmware, start, end, type); +} +EXPORT_SYMBOL_GPL(e820__mapped_raw_any); + +bool e820__mapped_any(u64 start, u64 end, enum e820_type type) +{ + return _e820__mapped_any(e820_table, start, end, type); +} +EXPORT_SYMBOL_GPL(e820__mapped_any); + +/* + * This function checks if the entire range is mapped with 'type'. + * + * Note: this function only works correctly once the E820 table is sorted and + * not-overlapping (at least for the range specified), which is the case normally. + */ +static struct e820_entry *__e820__mapped_all(u64 start, u64 end, + enum e820_type type) +{ + int i; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + + if (type && entry->type != type) + continue; + + /* Is the region (part) in overlap with the current region? */ + if (entry->addr >= end || entry->addr + entry->size <= start) + continue; + + /* + * If the region is at the beginning of we move + * 'start' to the end of the region since it's ok until there + */ + if (entry->addr <= start) + start = entry->addr + entry->size; + + /* + * If 'start' is now at or beyond 'end', we're done, full + * coverage of the desired range exists: + */ + if (start >= end) + return entry; + } + + return NULL; +} + +/* + * This function checks if the entire range is mapped with type. + */ +bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) +{ + return __e820__mapped_all(start, end, type); +} + +/* + * This function returns the type associated with the range . + */ +int e820__get_entry_type(u64 start, u64 end) +{ + struct e820_entry *entry = __e820__mapped_all(start, end, 0); + + return entry ? entry->type : -EINVAL; +} + +/* + * Add a memory region to the kernel E820 map. + */ +static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type) +{ + int x = table->nr_entries; + + if (x >= ARRAY_SIZE(table->entries)) { + pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n", + start, start + size - 1); + return; + } + + table->entries[x].addr = start; + table->entries[x].size = size; + table->entries[x].type = type; + table->nr_entries++; +} + +void __init e820__range_add(u64 start, u64 size, enum e820_type type) +{ + __e820__range_add(e820_table, start, size, type); +} + +static void __init e820_print_type(enum e820_type type) +{ + switch (type) { + case E820_TYPE_RAM: /* Fall through: */ + case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; + case E820_TYPE_RESERVED: pr_cont("reserved"); break; + case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break; + case E820_TYPE_ACPI: pr_cont("ACPI data"); break; + case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; + case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; + case E820_TYPE_PMEM: /* Fall through: */ + case E820_TYPE_PRAM: pr_cont("persistent (type %u)", type); break; + default: pr_cont("type %u", type); break; + } +} + +void __init e820__print_table(char *who) +{ + int i; + + for (i = 0; i < e820_table->nr_entries; i++) { + pr_info("%s: [mem %#018Lx-%#018Lx] ", + who, + e820_table->entries[i].addr, + e820_table->entries[i].addr + e820_table->entries[i].size - 1); + + e820_print_type(e820_table->entries[i].type); + pr_cont("\n"); + } +} + +/* + * Sanitize an E820 map. + * + * Some E820 layouts include overlapping entries. The following + * replaces the original E820 map with a new one, removing overlaps, + * and resolving conflicting memory types in favor of highest + * numbered type. + * + * The input parameter 'entries' points to an array of 'struct + * e820_entry' which on entry has elements in the range [0, *nr_entries) + * valid, and which has space for up to max_nr_entries entries. + * On return, the resulting sanitized E820 map entries will be in + * overwritten in the same location, starting at 'entries'. + * + * The integer pointed to by nr_entries must be valid on entry (the + * current number of valid entries located at 'entries'). If the + * sanitizing succeeds the *nr_entries will be updated with the new + * number of valid entries (something no more than max_nr_entries). + * + * The return value from e820__update_table() is zero if it + * successfully 'sanitized' the map entries passed in, and is -1 + * if it did nothing, which can happen if either of (1) it was + * only passed one map entry, or (2) any of the input map entries + * were invalid (start + size < start, meaning that the size was + * so big the described memory range wrapped around through zero.) + * + * Visually we're performing the following + * (1,2,3,4 = memory types)... + * + * Sample memory map (w/overlaps): + * ____22__________________ + * ______________________4_ + * ____1111________________ + * _44_____________________ + * 11111111________________ + * ____________________33__ + * ___________44___________ + * __________33333_________ + * ______________22________ + * ___________________2222_ + * _________111111111______ + * _____________________11_ + * _________________4______ + * + * Sanitized equivalent (no overlap): + * 1_______________________ + * _44_____________________ + * ___1____________________ + * ____22__________________ + * ______11________________ + * _________1______________ + * __________3_____________ + * ___________44___________ + * _____________33_________ + * _______________2________ + * ________________1_______ + * _________________4______ + * ___________________2____ + * ____________________33__ + * ______________________4_ + */ +struct change_member { + /* Pointer to the original entry: */ + struct e820_entry *entry; + /* Address for this change point: */ + unsigned long long addr; +}; + +static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata; +static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata; +static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata; +static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata; + +static int __init cpcompare(const void *a, const void *b) +{ + struct change_member * const *app = a, * const *bpp = b; + const struct change_member *ap = *app, *bp = *bpp; + + /* + * Inputs are pointers to two elements of change_point[]. If their + * addresses are not equal, their difference dominates. If the addresses + * are equal, then consider one that represents the end of its region + * to be greater than one that does not. + */ + if (ap->addr != bp->addr) + return ap->addr > bp->addr ? 1 : -1; + + return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr); +} + +static bool e820_nomerge(enum e820_type type) +{ + /* + * These types may indicate distinct platform ranges aligned to + * numa node, protection domain, performance domain, or other + * boundaries. Do not merge them. + */ + if (type == E820_TYPE_PRAM) + return true; + if (type == E820_TYPE_SOFT_RESERVED) + return true; + return false; +} + +int __init e820__update_table(struct e820_table *table) +{ + struct e820_entry *entries = table->entries; + u32 max_nr_entries = ARRAY_SIZE(table->entries); + enum e820_type current_type, last_type; + unsigned long long last_addr; + u32 new_nr_entries, overlap_entries; + u32 i, chg_idx, chg_nr; + + /* If there's only one memory region, don't bother: */ + if (table->nr_entries < 2) + return -1; + + BUG_ON(table->nr_entries > max_nr_entries); + + /* Bail out if we find any unreasonable addresses in the map: */ + for (i = 0; i < table->nr_entries; i++) { + if (entries[i].addr + entries[i].size < entries[i].addr) + return -1; + } + + /* Create pointers for initial change-point information (for sorting): */ + for (i = 0; i < 2 * table->nr_entries; i++) + change_point[i] = &change_point_list[i]; + + /* + * Record all known change-points (starting and ending addresses), + * omitting empty memory regions: + */ + chg_idx = 0; + for (i = 0; i < table->nr_entries; i++) { + if (entries[i].size != 0) { + change_point[chg_idx]->addr = entries[i].addr; + change_point[chg_idx++]->entry = &entries[i]; + change_point[chg_idx]->addr = entries[i].addr + entries[i].size; + change_point[chg_idx++]->entry = &entries[i]; + } + } + chg_nr = chg_idx; + + /* Sort change-point list by memory addresses (low -> high): */ + sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL); + + /* Create a new memory map, removing overlaps: */ + overlap_entries = 0; /* Number of entries in the overlap table */ + new_nr_entries = 0; /* Index for creating new map entries */ + last_type = 0; /* Start with undefined memory type */ + last_addr = 0; /* Start with 0 as last starting address */ + + /* Loop through change-points, determining effect on the new map: */ + for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) { + /* Keep track of all overlapping entries */ + if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) { + /* Add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++] = change_point[chg_idx]->entry; + } else { + /* Remove entry from list (order independent, so swap with last): */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == change_point[chg_idx]->entry) + overlap_list[i] = overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* + * If there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ + current_type = 0; + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i]->type > current_type) + current_type = overlap_list[i]->type; + } + + /* Continue building up new map based on this information: */ + if (current_type != last_type || e820_nomerge(current_type)) { + if (last_type != 0) { + new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr; + /* Move forward only if the new size was non-zero: */ + if (new_entries[new_nr_entries].size != 0) + /* No more space left for new entries? */ + if (++new_nr_entries >= max_nr_entries) + break; + } + if (current_type != 0) { + new_entries[new_nr_entries].addr = change_point[chg_idx]->addr; + new_entries[new_nr_entries].type = current_type; + last_addr = change_point[chg_idx]->addr; + } + last_type = current_type; + } + } + + /* Copy the new entries into the original location: */ + memcpy(entries, new_entries, new_nr_entries*sizeof(*entries)); + table->nr_entries = new_nr_entries; + + return 0; +} + +static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) +{ + struct boot_e820_entry *entry = entries; + + while (nr_entries) { + u64 start = entry->addr; + u64 size = entry->size; + u64 end = start + size - 1; + u32 type = entry->type; + + /* Ignore the entry on 64-bit overflow: */ + if (start > end && likely(size)) + return -1; + + e820__range_add(start, size, type); + + entry++; + nr_entries--; + } + return 0; +} + +/* + * Copy the BIOS E820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + */ +static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) +{ + /* Only one memory region (or negative)? Ignore it */ + if (nr_entries < 2) + return -1; + + return __append_e820_table(entries, nr_entries); +} + +static u64 __init +__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +{ + u64 end; + unsigned int i; + u64 real_updated_size = 0; + + BUG_ON(old_type == new_type); + + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + + end = start + size; + printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1); + e820_print_type(old_type); + pr_cont(" ==> "); + e820_print_type(new_type); + pr_cont("\n"); + + for (i = 0; i < table->nr_entries; i++) { + struct e820_entry *entry = &table->entries[i]; + u64 final_start, final_end; + u64 entry_end; + + if (entry->type != old_type) + continue; + + entry_end = entry->addr + entry->size; + + /* Completely covered by new range? */ + if (entry->addr >= start && entry_end <= end) { + entry->type = new_type; + real_updated_size += entry->size; + continue; + } + + /* New range is completely covered? */ + if (entry->addr < start && entry_end > end) { + __e820__range_add(table, start, size, new_type); + __e820__range_add(table, end, entry_end - end, entry->type); + entry->size = start - entry->addr; + real_updated_size += size; + continue; + } + + /* Partially covered: */ + final_start = max(start, entry->addr); + final_end = min(end, entry_end); + if (final_start >= final_end) + continue; + + __e820__range_add(table, final_start, final_end - final_start, new_type); + + real_updated_size += final_end - final_start; + + /* + * Left range could be head or tail, so need to update + * its size first: + */ + entry->size -= final_end - final_start; + if (entry->addr < final_start) + continue; + + entry->addr = final_end; + } + return real_updated_size; +} + +u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +{ + return __e820__range_update(e820_table, start, size, old_type, new_type); +} + +static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +{ + return __e820__range_update(e820_table_kexec, start, size, old_type, new_type); +} + +/* Remove a range of memory from the E820 table: */ +u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) +{ + int i; + u64 end; + u64 real_removed_size = 0; + + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + + end = start + size; + printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1); + if (check_type) + e820_print_type(old_type); + pr_cont("\n"); + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + u64 final_start, final_end; + u64 entry_end; + + if (check_type && entry->type != old_type) + continue; + + entry_end = entry->addr + entry->size; + + /* Completely covered? */ + if (entry->addr >= start && entry_end <= end) { + real_removed_size += entry->size; + memset(entry, 0, sizeof(*entry)); + continue; + } + + /* Is the new range completely covered? */ + if (entry->addr < start && entry_end > end) { + e820__range_add(end, entry_end - end, entry->type); + entry->size = start - entry->addr; + real_removed_size += size; + continue; + } + + /* Partially covered: */ + final_start = max(start, entry->addr); + final_end = min(end, entry_end); + if (final_start >= final_end) + continue; + + real_removed_size += final_end - final_start; + + /* + * Left range could be head or tail, so need to update + * the size first: + */ + entry->size -= final_end - final_start; + if (entry->addr < final_start) + continue; + + entry->addr = final_end; + } + return real_removed_size; +} + +void __init e820__update_table_print(void) +{ + if (e820__update_table(e820_table)) + return; + + pr_info("modified physical RAM map:\n"); + e820__print_table("modified"); +} + +static void __init e820__update_table_kexec(void) +{ + e820__update_table(e820_table_kexec); +} + +#define MAX_GAP_END 0x100000000ull + +/* + * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB). + */ +static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize) +{ + unsigned long long last = MAX_GAP_END; + int i = e820_table->nr_entries; + int found = 0; + + while (--i >= 0) { + unsigned long long start = e820_table->entries[i].addr; + unsigned long long end = start + e820_table->entries[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true: + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap >= *gapsize) { + *gapsize = gap; + *gapstart = end; + found = 1; + } + } + if (start < last) + last = start; + } + return found; +} + +/* + * Search for the biggest gap in the low 32 bits of the E820 + * memory space. We pass this space to the PCI subsystem, so + * that it can assign MMIO resources for hotplug or + * unconfigured devices in. + * + * Hopefully the BIOS let enough space left. + */ +__init void e820__setup_pci_gap(void) +{ + unsigned long gapstart, gapsize; + int found; + + gapsize = 0x400000; + found = e820_search_gap(&gapstart, &gapsize); + + if (!found) { +#ifdef CONFIG_X86_64 + gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; + pr_err("Cannot find an available gap in the 32-bit address range\n"); + pr_err("PCI devices with unassigned 32-bit BARs may not work!\n"); +#else + gapstart = 0x10000000; +#endif + } + + /* + * e820__reserve_resources_late() protects stolen RAM already: + */ + pci_mem_start = gapstart; + + pr_info("[mem %#010lx-%#010lx] available for PCI devices\n", + gapstart, gapstart + gapsize - 1); +} + +/* + * Called late during init, in free_initmem(). + * + * Initial e820_table and e820_table_kexec are largish __initdata arrays. + * + * Copy them to a (usually much smaller) dynamically allocated area that is + * sized precisely after the number of e820 entries. + * + * This is done after we've performed all the fixes and tweaks to the tables. + * All functions which modify them are __init functions, which won't exist + * after free_initmem(). + */ +__init void e820__reallocate_tables(void) +{ + struct e820_table *n; + int size; + + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries; + n = kmemdup(e820_table, size, GFP_KERNEL); + BUG_ON(!n); + e820_table = n; + + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries; + n = kmemdup(e820_table_kexec, size, GFP_KERNEL); + BUG_ON(!n); + e820_table_kexec = n; + + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries; + n = kmemdup(e820_table_firmware, size, GFP_KERNEL); + BUG_ON(!n); + e820_table_firmware = n; +} + +/* + * Because of the small fixed size of struct boot_params, only the first + * 128 E820 memory entries are passed to the kernel via boot_params.e820_table, + * the remaining (if any) entries are passed via the SETUP_E820_EXT node of + * struct setup_data, which is parsed here. + */ +void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) +{ + int entries; + struct boot_e820_entry *extmap; + struct setup_data *sdata; + + sdata = early_memremap(phys_addr, data_len); + entries = sdata->len / sizeof(*extmap); + extmap = (struct boot_e820_entry *)(sdata->data); + + __append_e820_table(extmap, entries); + e820__update_table(e820_table); + + memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + + early_memunmap(sdata, data_len); + pr_info("extended physical RAM map:\n"); + e820__print_table("extended"); +} + +/* + * Find the ranges of physical addresses that do not correspond to + * E820 RAM areas and register the corresponding pages as 'nosave' for + * hibernation (32-bit) or software suspend and suspend to RAM (64-bit). + * + * This function requires the E820 map to be sorted and without any + * overlapping entries. + */ +void __init e820__register_nosave_regions(unsigned long limit_pfn) +{ + int i; + unsigned long pfn = 0; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + + if (pfn < PFN_UP(entry->addr)) + register_nosave_region(pfn, PFN_UP(entry->addr)); + + pfn = PFN_DOWN(entry->addr + entry->size); + + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + register_nosave_region(PFN_UP(entry->addr), pfn); + + if (pfn >= limit_pfn) + break; + } +} + +#ifdef CONFIG_ACPI +/* + * Register ACPI NVS memory regions, so that we can save/restore them during + * hibernation and the subsequent resume: + */ +static int __init e820__register_nvs_regions(void) +{ + int i; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + + if (entry->type == E820_TYPE_NVS) + acpi_nvs_register(entry->addr, entry->size); + } + + return 0; +} +core_initcall(e820__register_nvs_regions); +#endif + +/* + * Allocate the requested number of bytes with the requsted alignment + * and return (the physical address) to the caller. Also register this + * range in the 'kexec' E820 table as a reserved range. + * + * This allows kexec to fake a new mptable, as if it came from the real + * system. + */ +u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) +{ + u64 addr; + + addr = memblock_phys_alloc(size, align); + if (addr) { + e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n"); + e820__update_table_kexec(); + } + + return addr; +} + +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_PAE +# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT)) +# else +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT)) +# endif +#else /* CONFIG_X86_32 */ +# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT +#endif + +/* + * Find the highest page frame number we have available + */ +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type) +{ + int i; + unsigned long last_pfn = 0; + unsigned long max_arch_pfn = MAX_ARCH_PFN; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + unsigned long start_pfn; + unsigned long end_pfn; + + if (entry->type != type) + continue; + + start_pfn = entry->addr >> PAGE_SHIFT; + end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT; + + if (start_pfn >= limit_pfn) + continue; + if (end_pfn > limit_pfn) { + last_pfn = limit_pfn; + break; + } + if (end_pfn > last_pfn) + last_pfn = end_pfn; + } + + if (last_pfn > max_arch_pfn) + last_pfn = max_arch_pfn; + + pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n", + last_pfn, max_arch_pfn); + return last_pfn; +} + +unsigned long __init e820__end_of_ram_pfn(void) +{ + return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM); +} + +unsigned long __init e820__end_of_low_ram_pfn(void) +{ + return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM); +} + +static void __init early_panic(char *msg) +{ + early_printk(msg); + panic(msg); +} + +static int userdef __initdata; + +/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */ +static int __init parse_memopt(char *p) +{ + u64 mem_size; + + if (!p) + return -EINVAL; + + if (!strcmp(p, "nopentium")) { +#ifdef CONFIG_X86_32 + setup_clear_cpu_cap(X86_FEATURE_PSE); + return 0; +#else + pr_warn("mem=nopentium ignored! (only supported on x86_32)\n"); + return -EINVAL; +#endif + } + + userdef = 1; + mem_size = memparse(p, &p); + + /* Don't remove all memory when getting "mem={invalid}" parameter: */ + if (mem_size == 0) + return -EINVAL; + + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); + +#ifdef CONFIG_MEMORY_HOTPLUG + max_mem_size = mem_size; +#endif + + return 0; +} +early_param("mem", parse_memopt); + +static int __init parse_memmap_one(char *p) +{ + char *oldp; + u64 start_at, mem_size; + + if (!p) + return -EINVAL; + + if (!strncmp(p, "exactmap", 8)) { + e820_table->nr_entries = 0; + userdef = 1; + return 0; + } + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + + userdef = 1; + if (*p == '@') { + start_at = memparse(p+1, &p); + e820__range_add(start_at, mem_size, E820_TYPE_RAM); + } else if (*p == '#') { + start_at = memparse(p+1, &p); + e820__range_add(start_at, mem_size, E820_TYPE_ACPI); + } else if (*p == '$') { + start_at = memparse(p+1, &p); + e820__range_add(start_at, mem_size, E820_TYPE_RESERVED); + } else if (*p == '!') { + start_at = memparse(p+1, &p); + e820__range_add(start_at, mem_size, E820_TYPE_PRAM); + } else if (*p == '%') { + enum e820_type from = 0, to = 0; + + start_at = memparse(p + 1, &p); + if (*p == '-') + from = simple_strtoull(p + 1, &p, 0); + if (*p == '+') + to = simple_strtoull(p + 1, &p, 0); + if (*p != '\0') + return -EINVAL; + if (from && to) + e820__range_update(start_at, mem_size, from, to); + else if (to) + e820__range_add(start_at, mem_size, to); + else if (from) + e820__range_remove(start_at, mem_size, from, 1); + else + e820__range_remove(start_at, mem_size, 0, 0); + } else { + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); + } + + return *p == '\0' ? 0 : -EINVAL; +} + +static int __init parse_memmap_opt(char *str) +{ + while (str) { + char *k = strchr(str, ','); + + if (k) + *k++ = 0; + + parse_memmap_one(str); + str = k; + } + + return 0; +} +early_param("memmap", parse_memmap_opt); + +/* + * Reserve all entries from the bootloader's extensible data nodes list, + * because if present we are going to use it later on to fetch e820 + * entries from it: + */ +void __init e820__reserve_setup_data(void) +{ + struct setup_indirect *indirect; + struct setup_data *data; + u64 pa_data, pa_next; + u32 len; + + pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return; + + while (pa_data) { + data = early_memremap(pa_data, sizeof(*data)); + if (!data) { + pr_warn("e820: failed to memremap setup_data entry\n"); + return; + } + + len = sizeof(*data); + pa_next = data->next; + + e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + + /* + * SETUP_EFI is supplied by kexec and does not need to be + * reserved. + */ + if (data->type != SETUP_EFI) + e820__range_update_kexec(pa_data, + sizeof(*data) + data->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + + if (data->type == SETUP_INDIRECT) { + len += data->len; + early_memunmap(data, sizeof(*data)); + data = early_memremap(pa_data, len); + if (!data) { + pr_warn("e820: failed to memremap indirect setup_data\n"); + return; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + e820__range_update(indirect->addr, indirect->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(indirect->addr, indirect->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + } + } + + pa_data = pa_next; + early_memunmap(data, len); + } + + e820__update_table(e820_table); + e820__update_table(e820_table_kexec); + + pr_info("extended physical RAM map:\n"); + e820__print_table("reserve setup_data"); +} + +/* + * Called after parse_early_param(), after early parameters (such as mem=) + * have been processed, in which case we already have an E820 table filled in + * via the parameter callback function(s), but it's not sorted and printed yet: + */ +void __init e820__finish_early_params(void) +{ + if (userdef) { + if (e820__update_table(e820_table) < 0) + early_panic("Invalid user supplied memory map"); + + pr_info("user-defined physical RAM map:\n"); + e820__print_table("user"); + } +} + +static const char *__init e820_type_to_string(struct e820_entry *entry) +{ + switch (entry->type) { + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: return "System RAM"; + case E820_TYPE_ACPI: return "ACPI Tables"; + case E820_TYPE_NVS: return "ACPI Non-volatile Storage"; + case E820_TYPE_UNUSABLE: return "Unusable memory"; + case E820_TYPE_PRAM: return "Persistent Memory (legacy)"; + case E820_TYPE_PMEM: return "Persistent Memory"; + case E820_TYPE_RESERVED: return "Reserved"; + case E820_TYPE_SOFT_RESERVED: return "Soft Reserved"; + default: return "Unknown E820 type"; + } +} + +static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) +{ + switch (entry->type) { + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM; + case E820_TYPE_ACPI: /* Fall-through: */ + case E820_TYPE_NVS: /* Fall-through: */ + case E820_TYPE_UNUSABLE: /* Fall-through: */ + case E820_TYPE_PRAM: /* Fall-through: */ + case E820_TYPE_PMEM: /* Fall-through: */ + case E820_TYPE_RESERVED: /* Fall-through: */ + case E820_TYPE_SOFT_RESERVED: /* Fall-through: */ + default: return IORESOURCE_MEM; + } +} + +static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) +{ + switch (entry->type) { + case E820_TYPE_ACPI: return IORES_DESC_ACPI_TABLES; + case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE; + case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; + case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; + case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; + case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED; + case E820_TYPE_RESERVED_KERN: /* Fall-through: */ + case E820_TYPE_RAM: /* Fall-through: */ + case E820_TYPE_UNUSABLE: /* Fall-through: */ + default: return IORES_DESC_NONE; + } +} + +static bool __init do_mark_busy(enum e820_type type, struct resource *res) +{ + /* this is the legacy bios/dos rom-shadow + mmio region */ + if (res->start < (1ULL<<20)) + return true; + + /* + * Treat persistent memory and other special memory ranges like + * device memory, i.e. reserve it for exclusive use of a driver + */ + switch (type) { + case E820_TYPE_RESERVED: + case E820_TYPE_SOFT_RESERVED: + case E820_TYPE_PRAM: + case E820_TYPE_PMEM: + return false; + case E820_TYPE_RESERVED_KERN: + case E820_TYPE_RAM: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: + default: + return true; + } +} + +/* + * Mark E820 reserved areas as busy for the resource manager: + */ + +static struct resource __initdata *e820_res; + +void __init e820__reserve_resources(void) +{ + int i; + struct resource *res; + u64 end; + + res = memblock_alloc(sizeof(*res) * e820_table->nr_entries, + SMP_CACHE_BYTES); + if (!res) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(*res) * e820_table->nr_entries); + e820_res = res; + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = e820_table->entries + i; + + end = entry->addr + entry->size - 1; + if (end != (resource_size_t)end) { + res++; + continue; + } + res->start = entry->addr; + res->end = end; + res->name = e820_type_to_string(entry); + res->flags = e820_type_to_iomem_type(entry); + res->desc = e820_type_to_iores_desc(entry); + + /* + * Don't register the region that could be conflicted with + * PCI device BAR resources and insert them later in + * pcibios_resource_survey(): + */ + if (do_mark_busy(entry->type, res)) { + res->flags |= IORESOURCE_BUSY; + insert_resource(&iomem_resource, res); + } + res++; + } + + /* Expose the bootloader-provided memory layout to the sysfs. */ + for (i = 0; i < e820_table_firmware->nr_entries; i++) { + struct e820_entry *entry = e820_table_firmware->entries + i; + + firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry)); + } +} + +/* + * How much should we pad the end of RAM, depending on where it is? + */ +static unsigned long __init ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 64MB for anything above that */ + return 64*1024*1024; +} + +#define MAX_RESOURCE_SIZE ((resource_size_t)-1) + +void __init e820__reserve_resources_late(void) +{ + int i; + struct resource *res; + + res = e820_res; + for (i = 0; i < e820_table->nr_entries; i++) { + if (!res->parent && res->end) + insert_resource_expand_to_fit(&iomem_resource, res); + res++; + } + + /* + * Try to bump up RAM regions to reasonable boundaries, to + * avoid stolen RAM: + */ + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + u64 start, end; + + if (entry->type != E820_TYPE_RAM) + continue; + + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)) - 1; + if (end > MAX_RESOURCE_SIZE) + end = MAX_RESOURCE_SIZE; + if (start >= end) + continue; + + printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end); + reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); + } +} + +/* + * Pass the firmware (bootloader) E820 map to the kernel and process it: + */ +char *__init e820__memory_setup_default(void) +{ + char *who = "BIOS-e820"; + + /* + * Try to copy the BIOS-supplied E820-map. + * + * Otherwise fake a memory map; one section from 0k->640k, + * the next section from 1mb->appropriate_mem_k + */ + if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) { + u64 mem_size; + + /* Compare results from other methods and take the one that gives more RAM: */ + if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) { + mem_size = boot_params.screen_info.ext_mem_k; + who = "BIOS-88"; + } else { + mem_size = boot_params.alt_mem_k; + who = "BIOS-e801"; + } + + e820_table->nr_entries = 0; + e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM); + e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM); + } + + /* We just appended a lot of ranges, sanitize the table: */ + e820__update_table(e820_table); + + return who; +} + +/* + * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader + * E820 map - with an optional platform quirk available for virtual platforms + * to override this method of boot environment processing: + */ +void __init e820__memory_setup(void) +{ + char *who; + + /* This is a firmware interface ABI - make sure we don't break it: */ + BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20); + + who = x86_init.resources.memory_setup(); + + memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + + pr_info("BIOS-provided physical RAM map:\n"); + e820__print_table(who); +} + +void __init e820__memblock_setup(void) +{ + int i; + u64 end; + + /* + * The bootstrap memblock region count maximum is 128 entries + * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries + * than that - so allow memblock resizing. + * + * This is safe, because this call happens pretty late during x86 setup, + * so we know about reserved memory regions already. (This is important + * so that memblock resizing does no stomp over reserved areas.) + */ + memblock_allow_resize(); + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + + end = entry->addr + entry->size; + if (end != (resource_size_t)end) + continue; + + if (entry->type == E820_TYPE_SOFT_RESERVED) + memblock_reserve(entry->addr, entry->size); + + if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + continue; + + memblock_add(entry->addr, entry->size); + } + + /* Throw away partial pages: */ + memblock_trim_memory(PAGE_SIZE); + + memblock_dump_all(); +} diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c new file mode 100644 index 000000000..8e27cbefa --- /dev/null +++ b/arch/x86/kernel/early-quirks.c @@ -0,0 +1,806 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Various workarounds for chipset bugs. + This code runs very early and can't use the regular PCI subsystem + The entries are keyed to PCI bridges which usually identify chipsets + uniquely. + This is only for whole classes of chipsets with specific problems which + need early invasive action (e.g. before the timers are initialized). + Most PCI device specific workarounds can be done later and should be + in standard PCI quirks + Mainboard specific bugs should be handled by DMI entries. + CPU specific bugs in setup.c */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void __init fix_hypertransport_config(int num, int slot, int func) +{ + u32 htcfg; + /* + * we found a hypertransport bus + * make sure that we are broadcasting + * interrupts to all cpus on the ht bus + * if we're using extended apic ids + */ + htcfg = read_pci_config(num, slot, func, 0x68); + if (htcfg & (1 << 18)) { + printk(KERN_INFO "Detected use of extended apic ids " + "on hypertransport bus\n"); + if ((htcfg & (1 << 17)) == 0) { + printk(KERN_INFO "Enabling hypertransport extended " + "apic interrupt broadcast\n"); + printk(KERN_INFO "Note this is a bios bug, " + "please contact your hw vendor\n"); + htcfg |= (1 << 17); + write_pci_config(num, slot, func, 0x68, htcfg); + } + } + + +} + +static void __init via_bugs(int num, int slot, int func) +{ +#ifdef CONFIG_GART_IOMMU + if ((max_pfn > MAX_DMA32_PFN || force_iommu) && + !gart_iommu_aperture_allowed) { + printk(KERN_INFO + "Looks like a VIA chipset. Disabling IOMMU." + " Override with iommu=allowed\n"); + gart_iommu_aperture_disabled = 1; + } +#endif +} + +#ifdef CONFIG_ACPI +#ifdef CONFIG_X86_IO_APIC + +static int __init nvidia_hpet_check(struct acpi_table_header *header) +{ + return 0; +} +#endif /* CONFIG_X86_IO_APIC */ +#endif /* CONFIG_ACPI */ + +static void __init nvidia_bugs(int num, int slot, int func) +{ +#ifdef CONFIG_ACPI +#ifdef CONFIG_X86_IO_APIC + /* + * Only applies to Nvidia root ports (bus 0) and not to + * Nvidia graphics cards with PCI ports on secondary buses. + */ + if (num) + return; + + /* + * All timer overrides on Nvidia are + * wrong unless HPET is enabled. + * Unfortunately that's not true on many Asus boards. + * We don't know yet how to detect this automatically, but + * at least allow a command line override. + */ + if (acpi_use_timer_override) + return; + + if (acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) { + acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +#endif +#endif + /* RED-PEN skip them on mptables too? */ + +} + +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) +static u32 __init ati_ixp4x0_rev(int num, int slot, int func) +{ + u32 d; + u8 b; + + b = read_pci_config_byte(num, slot, func, 0xac); + b &= ~(1<<5); + write_pci_config_byte(num, slot, func, 0xac, b); + + d = read_pci_config(num, slot, func, 0x70); + d |= 1<<8; + write_pci_config(num, slot, func, 0x70, d); + + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + return d; +} + +static void __init ati_bugs(int num, int slot, int func) +{ + u32 d; + u8 b; + + if (acpi_use_timer_override) + return; + + d = ati_ixp4x0_rev(num, slot, func); + if (d < 0x82) + acpi_skip_timer_override = 1; + else { + /* check for IRQ0 interrupt swap */ + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x2)) + acpi_skip_timer_override = 1; + } + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB4X0 revision 0x%x\n", d); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +} + +static u32 __init ati_sbx00_rev(int num, int slot, int func) +{ + u32 d; + + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + + return d; +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ + u32 d, rev; + + rev = ati_sbx00_rev(num, slot, func); + if (rev >= 0x40) + acpi_fix_pin2_polarity = 1; + + /* + * SB600: revisions 0x11, 0x12, 0x13, 0x14, ... + * SB700: revisions 0x39, 0x3a, ... + * SB800: revisions 0x40, 0x41, ... + */ + if (rev >= 0x39) + return; + + if (acpi_use_timer_override) + return; + + /* check for IRQ0 interrupt swap */ + d = read_pci_config(num, slot, func, 0x64); + if (!(d & (1<<14))) + acpi_skip_timer_override = 1; + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB600 revision 0x%x\n", rev); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +} +#else +static void __init ati_bugs(int num, int slot, int func) +{ +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ +} +#endif + +static void __init intel_remapping_check(int num, int slot, int func) +{ + u8 revision; + u16 device; + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID); + + /* + * Revision <= 13 of all triggering devices id in this quirk + * have a problem draining interrupts when irq remapping is + * enabled, and should be flagged as broken. Additionally + * revision 0x22 of device id 0x3405 has this problem. + */ + if (revision <= 0x13) + set_irq_remapping_broken(); + else if (device == 0x3405 && revision == 0x22) + set_irq_remapping_broken(); +} + +/* + * Systems with Intel graphics controllers set aside memory exclusively + * for gfx driver use. This memory is not marked in the E820 as reserved + * or as RAM, and so is subject to overlap from E820 manipulation later + * in the boot process. On some systems, MMIO space is allocated on top, + * despite the efforts of the "RAM buffer" approach, which simply rounds + * memory boundaries up to 64M to try to catch space that may decode + * as RAM and so is not suitable for MMIO. + */ + +#define KB(x) ((x) * 1024UL) +#define MB(x) (KB (KB (x))) + +static resource_size_t __init i830_tseg_size(void) +{ + u8 esmramc = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); + + if (!(esmramc & TSEG_ENABLE)) + return 0; + + if (esmramc & I830_TSEG_SIZE_1M) + return MB(1); + else + return KB(512); +} + +static resource_size_t __init i845_tseg_size(void) +{ + u8 esmramc = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); + u8 tseg_size = esmramc & I845_TSEG_SIZE_MASK; + + if (!(esmramc & TSEG_ENABLE)) + return 0; + + switch (tseg_size) { + case I845_TSEG_SIZE_512K: return KB(512); + case I845_TSEG_SIZE_1M: return MB(1); + default: + WARN(1, "Unknown ESMRAMC value: %x!\n", esmramc); + } + return 0; +} + +static resource_size_t __init i85x_tseg_size(void) +{ + u8 esmramc = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); + + if (!(esmramc & TSEG_ENABLE)) + return 0; + + return MB(1); +} + +static resource_size_t __init i830_mem_size(void) +{ + return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32); +} + +static resource_size_t __init i85x_mem_size(void) +{ + return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32); +} + +/* + * On 830/845/85x the stolen memory base isn't available in any + * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size. + */ +static resource_size_t __init i830_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + return i830_mem_size() - i830_tseg_size() - stolen_size; +} + +static resource_size_t __init i845_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + return i830_mem_size() - i845_tseg_size() - stolen_size; +} + +static resource_size_t __init i85x_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + return i85x_mem_size() - i85x_tseg_size() - stolen_size; +} + +static resource_size_t __init i865_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + u16 toud = 0; + + toud = read_pci_config_16(0, 0, 0, I865_TOUD); + + return toud * KB(64) + i845_tseg_size(); +} + +static resource_size_t __init gen3_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + u32 bsm; + + /* Almost universally we can find the Graphics Base of Stolen Memory + * at register BSM (0x5c) in the igfx configuration space. On a few + * (desktop) machines this is also mirrored in the bridge device at + * different locations, or in the MCHBAR. + */ + bsm = read_pci_config(num, slot, func, INTEL_BSM); + + return bsm & INTEL_BSM_MASK; +} + +static resource_size_t __init gen11_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + u64 bsm; + + bsm = read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW0); + bsm &= INTEL_BSM_MASK; + bsm |= (u64)read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW1) << 32; + + return bsm; +} + +static resource_size_t __init i830_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + u16 gms; + + gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); + gms = gmch_ctrl & I830_GMCH_GMS_MASK; + + switch (gms) { + case I830_GMCH_GMS_STOLEN_512: return KB(512); + case I830_GMCH_GMS_STOLEN_1024: return MB(1); + case I830_GMCH_GMS_STOLEN_8192: return MB(8); + /* local memory isn't part of the normal address space */ + case I830_GMCH_GMS_LOCAL: return 0; + default: + WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl); + } + + return 0; +} + +static resource_size_t __init gen3_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + u16 gms; + + gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); + gms = gmch_ctrl & I855_GMCH_GMS_MASK; + + switch (gms) { + case I855_GMCH_GMS_STOLEN_1M: return MB(1); + case I855_GMCH_GMS_STOLEN_4M: return MB(4); + case I855_GMCH_GMS_STOLEN_8M: return MB(8); + case I855_GMCH_GMS_STOLEN_16M: return MB(16); + case I855_GMCH_GMS_STOLEN_32M: return MB(32); + case I915_GMCH_GMS_STOLEN_48M: return MB(48); + case I915_GMCH_GMS_STOLEN_64M: return MB(64); + case G33_GMCH_GMS_STOLEN_128M: return MB(128); + case G33_GMCH_GMS_STOLEN_256M: return MB(256); + case INTEL_GMCH_GMS_STOLEN_96M: return MB(96); + case INTEL_GMCH_GMS_STOLEN_160M:return MB(160); + case INTEL_GMCH_GMS_STOLEN_224M:return MB(224); + case INTEL_GMCH_GMS_STOLEN_352M:return MB(352); + default: + WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl); + } + + return 0; +} + +static resource_size_t __init gen6_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + u16 gms; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK; + + return gms * MB(32); +} + +static resource_size_t __init gen8_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + u16 gms; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK; + + return gms * MB(32); +} + +static resource_size_t __init chv_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + u16 gms; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK; + + /* + * 0x0 to 0x10: 32MB increments starting at 0MB + * 0x11 to 0x16: 4MB increments starting at 8MB + * 0x17 to 0x1d: 4MB increments start at 36MB + */ + if (gms < 0x11) + return gms * MB(32); + else if (gms < 0x17) + return (gms - 0x11) * MB(4) + MB(8); + else + return (gms - 0x17) * MB(4) + MB(36); +} + +static resource_size_t __init gen9_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + u16 gms; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK; + + /* 0x0 to 0xef: 32MB increments starting at 0MB */ + /* 0xf0 to 0xfe: 4MB increments starting at 4MB */ + if (gms < 0xf0) + return gms * MB(32); + else + return (gms - 0xf0) * MB(4) + MB(4); +} + +struct intel_early_ops { + resource_size_t (*stolen_size)(int num, int slot, int func); + resource_size_t (*stolen_base)(int num, int slot, int func, + resource_size_t size); +}; + +static const struct intel_early_ops i830_early_ops __initconst = { + .stolen_base = i830_stolen_base, + .stolen_size = i830_stolen_size, +}; + +static const struct intel_early_ops i845_early_ops __initconst = { + .stolen_base = i845_stolen_base, + .stolen_size = i830_stolen_size, +}; + +static const struct intel_early_ops i85x_early_ops __initconst = { + .stolen_base = i85x_stolen_base, + .stolen_size = gen3_stolen_size, +}; + +static const struct intel_early_ops i865_early_ops __initconst = { + .stolen_base = i865_stolen_base, + .stolen_size = gen3_stolen_size, +}; + +static const struct intel_early_ops gen3_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen3_stolen_size, +}; + +static const struct intel_early_ops gen6_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen6_stolen_size, +}; + +static const struct intel_early_ops gen8_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen8_stolen_size, +}; + +static const struct intel_early_ops gen9_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen9_stolen_size, +}; + +static const struct intel_early_ops chv_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = chv_stolen_size, +}; + +static const struct intel_early_ops gen11_early_ops __initconst = { + .stolen_base = gen11_stolen_base, + .stolen_size = gen9_stolen_size, +}; + +/* Intel integrated GPUs for which we need to reserve "stolen memory" */ +static const struct pci_device_id intel_early_ids[] __initconst = { + INTEL_I830_IDS(&i830_early_ops), + INTEL_I845G_IDS(&i845_early_ops), + INTEL_I85X_IDS(&i85x_early_ops), + INTEL_I865G_IDS(&i865_early_ops), + INTEL_I915G_IDS(&gen3_early_ops), + INTEL_I915GM_IDS(&gen3_early_ops), + INTEL_I945G_IDS(&gen3_early_ops), + INTEL_I945GM_IDS(&gen3_early_ops), + INTEL_VLV_IDS(&gen6_early_ops), + INTEL_PINEVIEW_G_IDS(&gen3_early_ops), + INTEL_PINEVIEW_M_IDS(&gen3_early_ops), + INTEL_I965G_IDS(&gen3_early_ops), + INTEL_G33_IDS(&gen3_early_ops), + INTEL_I965GM_IDS(&gen3_early_ops), + INTEL_GM45_IDS(&gen3_early_ops), + INTEL_G45_IDS(&gen3_early_ops), + INTEL_IRONLAKE_D_IDS(&gen3_early_ops), + INTEL_IRONLAKE_M_IDS(&gen3_early_ops), + INTEL_SNB_D_IDS(&gen6_early_ops), + INTEL_SNB_M_IDS(&gen6_early_ops), + INTEL_IVB_M_IDS(&gen6_early_ops), + INTEL_IVB_D_IDS(&gen6_early_ops), + INTEL_HSW_IDS(&gen6_early_ops), + INTEL_BDW_IDS(&gen8_early_ops), + INTEL_CHV_IDS(&chv_early_ops), + INTEL_SKL_IDS(&gen9_early_ops), + INTEL_BXT_IDS(&gen9_early_ops), + INTEL_KBL_IDS(&gen9_early_ops), + INTEL_CFL_IDS(&gen9_early_ops), + INTEL_GLK_IDS(&gen9_early_ops), + INTEL_CNL_IDS(&gen9_early_ops), + INTEL_ICL_11_IDS(&gen11_early_ops), + INTEL_EHL_IDS(&gen11_early_ops), + INTEL_TGL_12_IDS(&gen11_early_ops), + INTEL_RKL_IDS(&gen11_early_ops), +}; + +struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); +EXPORT_SYMBOL(intel_graphics_stolen_res); + +static void __init +intel_graphics_stolen(int num, int slot, int func, + const struct intel_early_ops *early_ops) +{ + resource_size_t base, size; + resource_size_t end; + + size = early_ops->stolen_size(num, slot, func); + base = early_ops->stolen_base(num, slot, func, size); + + if (!size || !base) + return; + + end = base + size - 1; + + intel_graphics_stolen_res.start = base; + intel_graphics_stolen_res.end = end; + + printk(KERN_INFO "Reserving Intel graphics memory at %pR\n", + &intel_graphics_stolen_res); + + /* Mark this space as reserved */ + e820__range_add(base, size, E820_TYPE_RESERVED); + e820__update_table(e820_table); +} + +static void __init intel_graphics_quirks(int num, int slot, int func) +{ + const struct intel_early_ops *early_ops; + u16 device; + int i; + + /* + * Reserve "stolen memory" for an integrated GPU. If we've already + * found one, there's nothing to do for other (discrete) GPUs. + */ + if (resource_size(&intel_graphics_stolen_res)) + return; + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + + for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) { + kernel_ulong_t driver_data = intel_early_ids[i].driver_data; + + if (intel_early_ids[i].device != device) + continue; + + early_ops = (typeof(early_ops))driver_data; + + intel_graphics_stolen(num, slot, func, early_ops); + + return; + } +} + +static void __init force_disable_hpet(int num, int slot, int func) +{ +#ifdef CONFIG_HPET_TIMER + boot_hpet_disable = true; + pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n"); +#endif +} + +#define BCM4331_MMIO_SIZE 16384 +#define BCM4331_PM_CAP 0x40 +#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg) +#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg) + +static void __init apple_airport_reset(int bus, int slot, int func) +{ + void __iomem *mmio; + u16 pmcsr; + u64 addr; + int i; + + if (!x86_apple_machine) + return; + + /* Card may have been put into PCI_D3hot by grub quirk */ + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr); + mdelay(10); + + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + pr_err("pci 0000:%02x:%02x.%d: Cannot power up Apple AirPort card\n", + bus, slot, func); + return; + } + } + + addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32; + addr &= PCI_BASE_ADDRESS_MEM_MASK; + + mmio = early_ioremap(addr, BCM4331_MMIO_SIZE); + if (!mmio) { + pr_err("pci 0000:%02x:%02x.%d: Cannot iomap Apple AirPort card\n", + bus, slot, func); + return; + } + + pr_info("Resetting Apple AirPort card (left enabled by EFI)\n"); + + for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++) + udelay(10); + + bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET); + bcma_aread32(BCMA_RESET_CTL); + udelay(1); + + bcma_awrite32(BCMA_RESET_CTL, 0); + bcma_aread32(BCMA_RESET_CTL); + udelay(10); + + early_iounmap(mmio, BCM4331_MMIO_SIZE); +} + +#define QFLAG_APPLY_ONCE 0x1 +#define QFLAG_APPLIED 0x2 +#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) +struct chipset { + u32 vendor; + u32 device; + u32 class; + u32 class_mask; + u32 flags; + void (*f)(int num, int slot, int func); +}; + +static struct chipset early_qrk[] __initdata = { + { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, + { PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, + { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, + { PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST, + PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, 0x3405, PCI_CLASS_BRIDGE_HOST, + PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, + PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, + 0, intel_graphics_quirks }, + /* + * HPET on the current version of the Baytrail platform has accuracy + * problems: it will halt in deep idle state - so we disable it. + * + * More details can be found in section 18.10.1.3 of the datasheet: + * + * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf + */ + { PCI_VENDOR_ID_INTEL, 0x0f00, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_BROADCOM, 0x4331, + PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, + {} +}; + +static void __init early_pci_scan_bus(int bus); + +/** + * check_dev_quirk - apply early quirks to a given PCI device + * @num: bus number + * @slot: slot number + * @func: PCI function + * + * Check the vendor & device ID against the early quirks table. + * + * If the device is single function, let early_pci_scan_bus() know so we don't + * poke at this device again. + */ +static int __init check_dev_quirk(int num, int slot, int func) +{ + u16 class; + u16 vendor; + u16 device; + u8 type; + u8 sec; + int i; + + class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); + + if (class == 0xffff) + return -1; /* no class, treat as single function */ + + vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + + for (i = 0; early_qrk[i].f != NULL; i++) { + if (((early_qrk[i].vendor == PCI_ANY_ID) || + (early_qrk[i].vendor == vendor)) && + ((early_qrk[i].device == PCI_ANY_ID) || + (early_qrk[i].device == device)) && + (!((early_qrk[i].class ^ class) & + early_qrk[i].class_mask))) { + if ((early_qrk[i].flags & + QFLAG_DONE) != QFLAG_DONE) + early_qrk[i].f(num, slot, func); + early_qrk[i].flags |= QFLAG_APPLIED; + } + } + + type = read_pci_config_byte(num, slot, func, + PCI_HEADER_TYPE); + + if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) { + sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS); + if (sec > num) + early_pci_scan_bus(sec); + } + + if (!(type & 0x80)) + return -1; + + return 0; +} + +static void __init early_pci_scan_bus(int bus) +{ + int slot, func; + + /* Poor man's PCI discovery */ + for (slot = 0; slot < 32; slot++) + for (func = 0; func < 8; func++) { + /* Only probe function 0 on single fn devices */ + if (check_dev_quirk(bus, slot, func)) + break; + } +} + +void __init early_quirks(void) +{ + if (!early_pci_allowed()) + return; + + early_pci_scan_bus(0); +} diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c new file mode 100644 index 000000000..d3c531d3b --- /dev/null +++ b/arch/x86/kernel/early_printk.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ +#define VGABASE (__ISA_IO_base + 0xb8000) + +static int max_ypos = 25, max_xpos = 80; +static int current_ypos = 25, current_xpos; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= max_ypos) { + /* scroll 1 line up */ + for (k = 1, j = 0; k < max_ypos; k++, j++) { + for (i = 0; i < max_xpos; i++) { + writew(readw(VGABASE+2*(max_xpos*k+i)), + VGABASE + 2*(max_xpos*j + i)); + } + } + for (i = 0; i < max_xpos; i++) + writew(0x720, VGABASE + 2*(max_xpos*j + i)); + current_ypos = max_ypos-1; + } +#ifdef CONFIG_KGDB_KDB + if (c == '\b') { + if (current_xpos > 0) + current_xpos--; + } else if (c == '\r') { + current_xpos = 0; + } else +#endif + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(max_xpos*current_ypos + + current_xpos++)); + if (current_xpos >= max_xpos) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ + +static unsigned long early_serial_base = 0x3f8; /* ttyS0 */ + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +static unsigned int io_serial_in(unsigned long addr, int offset) +{ + return inb(addr + offset); +} + +static void io_serial_out(unsigned long addr, int offset, int value) +{ + outb(value, addr + offset); +} + +static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in; +static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out; + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + + while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + serial_out(early_serial_base, TXR, ch); + return timeout ? 0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + if (*s == '\n') + early_serial_putc('\r'); + early_serial_putc(*s); + s++; + } +} + +static __init void early_serial_hw_init(unsigned divisor) +{ + unsigned char c; + + serial_out(early_serial_base, LCR, 0x3); /* 8n1 */ + serial_out(early_serial_base, IER, 0); /* no interrupt */ + serial_out(early_serial_base, FCR, 0); /* no fifo */ + serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */ + + c = serial_in(early_serial_base, LCR); + serial_out(early_serial_base, LCR, c | DLAB); + serial_out(early_serial_base, DLL, divisor & 0xff); + serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff); + serial_out(early_serial_base, LCR, c & ~DLAB); +} + +#define DEFAULT_BAUD 9600 + +static __init void early_serial_init(char *s) +{ + unsigned divisor; + unsigned long baud = DEFAULT_BAUD; + char *e; + + if (*s == ',') + ++s; + + if (*s) { + unsigned port; + if (!strncmp(s, "0x", 2)) { + early_serial_base = simple_strtoul(s, &e, 16); + } else { + static const int __initconst bases[] = { 0x3f8, 0x2f8 }; + + if (!strncmp(s, "ttyS", 4)) + s += 4; + port = simple_strtoul(s, &e, 10); + if (port > 1 || s == e) + port = 0; + early_serial_base = bases[port]; + } + s += strcspn(s, ","); + if (*s == ',') + s++; + } + + if (*s) { + baud = simple_strtoull(s, &e, 0); + + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + /* Convert from baud to divisor value */ + divisor = 115200 / baud; + + /* These will always be IO based ports */ + serial_in = io_serial_in; + serial_out = io_serial_out; + + /* Set up the HW */ + early_serial_hw_init(divisor); +} + +#ifdef CONFIG_PCI +static void mem32_serial_out(unsigned long addr, int offset, int value) +{ + u32 __iomem *vaddr = (u32 __iomem *)addr; + /* shift implied by pointer type */ + writel(value, vaddr + offset); +} + +static unsigned int mem32_serial_in(unsigned long addr, int offset) +{ + u32 __iomem *vaddr = (u32 __iomem *)addr; + /* shift implied by pointer type */ + return readl(vaddr + offset); +} + +/* + * early_pci_serial_init() + * + * This function is invoked when the early_printk param starts with "pciserial" + * The rest of the param should be "[force],B:D.F,baud", where B, D & F describe + * the location of a PCI device that must be a UART device. "force" is optional + * and overrides the use of an UART device with a wrong PCI class code. + */ +static __init void early_pci_serial_init(char *s) +{ + unsigned divisor; + unsigned long baud = DEFAULT_BAUD; + u8 bus, slot, func; + u32 classcode, bar0; + u16 cmdreg; + char *e; + int force = 0; + + if (*s == ',') + ++s; + + if (*s == 0) + return; + + /* Force the use of an UART device with wrong class code */ + if (!strncmp(s, "force,", 6)) { + force = 1; + s += 6; + } + + /* + * Part the param to get the BDF values + */ + bus = (u8)simple_strtoul(s, &e, 16); + s = e; + if (*s != ':') + return; + ++s; + slot = (u8)simple_strtoul(s, &e, 16); + s = e; + if (*s != '.') + return; + ++s; + func = (u8)simple_strtoul(s, &e, 16); + s = e; + + /* A baud might be following */ + if (*s == ',') + s++; + + /* + * Find the device from the BDF + */ + cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND); + classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); + bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + + /* + * Verify it is a UART type device + */ + if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) && + (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) || + (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ { + if (!force) + return; + } + + /* + * Determine if it is IO or memory mapped + */ + if (bar0 & 0x01) { + /* it is IO mapped */ + serial_in = io_serial_in; + serial_out = io_serial_out; + early_serial_base = bar0&0xfffffffc; + write_pci_config(bus, slot, func, PCI_COMMAND, + cmdreg|PCI_COMMAND_IO); + } else { + /* It is memory mapped - assume 32-bit alignment */ + serial_in = mem32_serial_in; + serial_out = mem32_serial_out; + /* WARNING! assuming the address is always in the first 4G */ + early_serial_base = + (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10); + write_pci_config(bus, slot, func, PCI_COMMAND, + cmdreg|PCI_COMMAND_MEMORY); + } + + /* + * Initialize the hardware + */ + if (*s) { + if (strcmp(s, "nocfg") == 0) + /* Sometimes, we want to leave the UART alone + * and assume the BIOS has set it up correctly. + * "nocfg" tells us this is the case, and we + * should do no more setup. + */ + return; + if (kstrtoul(s, 0, &baud) < 0 || baud == 0) + baud = DEFAULT_BAUD; + } + + /* Convert from baud to divisor value */ + divisor = 115200 / baud; + + /* Set up the HW */ + early_serial_hw_init(divisor); +} +#endif + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +static void early_console_register(struct console *con, int keep_early) +{ + if (con->index != -1) { + printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", + con->name); + return; + } + early_console = con; + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} + +static int __init setup_early_printk(char *buf) +{ + int keep; + + if (!buf) + return 0; + + if (early_console) + return 0; + + keep = (strstr(buf, "keep") != NULL); + + while (*buf != '\0') { + if (!strncmp(buf, "serial", 6)) { + buf += 6; + early_serial_init(buf); + early_console_register(&early_serial_console, keep); + if (!strncmp(buf, ",ttyS", 5)) + buf += 5; + } + if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf + 4); + early_console_register(&early_serial_console, keep); + } +#ifdef CONFIG_PCI + if (!strncmp(buf, "pciserial", 9)) { + early_pci_serial_init(buf + 9); + early_console_register(&early_serial_console, keep); + buf += 9; /* Keep from match the above "serial" */ + } +#endif + if (!strncmp(buf, "vga", 3) && + boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; + early_console_register(&early_vga_console, keep); + } +#ifdef CONFIG_EARLY_PRINTK_DBGP + if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) + early_console_register(&early_dbgp_console, keep); +#endif +#ifdef CONFIG_HVC_XEN + if (!strncmp(buf, "xen", 3)) + early_console_register(&xenboot_console, keep); +#endif +#ifdef CONFIG_EARLY_PRINTK_USB_XDBC + if (!strncmp(buf, "xdbc", 4)) + early_xdbc_parse_parameter(buf + 4); +#endif + + buf++; + } + return 0; +} + +early_param("earlyprintk", setup_early_printk); diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c new file mode 100644 index 000000000..38e7d597b --- /dev/null +++ b/arch/x86/kernel/ebda.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include +#include + +/* + * This function reserves all conventional PC system BIOS related + * firmware memory areas (some of which are data, some of which + * are code), that must not be used by the kernel as available + * RAM. + * + * The BIOS places the EBDA/XBDA at the top of conventional + * memory, and usually decreases the reported amount of + * conventional memory (int 0x12) too. + * + * This means that as a first approximation on most systems we can + * guess the reserved BIOS area by looking at the low BIOS RAM size + * value and assume that everything above that value (up to 1MB) is + * reserved. + * + * But life in firmware country is not that simple: + * + * - This code also contains a quirk for Dell systems that neglect + * to reserve the EBDA area in the 'RAM size' value ... + * + * - The same quirk also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). (Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in.) + * + * - Plus paravirt systems don't have a reliable value in the + * 'BIOS RAM size' pointer we can rely on, so we must quirk + * them too. + * + * Due to those various problems this function is deliberately + * very conservative and tries to err on the side of reserving + * too much, to not risk reserving too little. + * + * Losing a small amount of memory in the bottom megabyte is + * rarely a problem, as long as we have enough memory to install + * the SMP bootup trampoline which *must* be in this area. + * + * Using memory that is in use by the BIOS or by some DMA device + * the BIOS didn't shut down *is* a big problem to the kernel, + * obviously. + */ + +#define BIOS_RAM_SIZE_KB_PTR 0x413 + +#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ +#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ + +void __init reserve_bios_regions(void) +{ + unsigned int bios_start, ebda_start; + + /* + * NOTE: In a paravirtual environment the BIOS reserved + * area is absent. We'll just have to assume that the + * paravirt case can handle memory setup correctly, + * without our help. + */ + if (!x86_platform.legacy.reserve_bios_regions) + return; + + /* + * BIOS RAM size is encoded in kilobytes, convert it + * to bytes to get a first guess at where the BIOS + * firmware area starts: + */ + bios_start = *(unsigned short *)__va(BIOS_RAM_SIZE_KB_PTR); + bios_start <<= 10; + + /* + * If bios_start is less than 128K, assume it is bogus + * and bump it up to 640K. Similarly, if bios_start is above 640K, + * don't trust it. + */ + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) + bios_start = BIOS_START_MAX; + + /* Get the start address of the EBDA page: */ + ebda_start = get_bios_ebda(); + + /* + * If the EBDA start address is sane and is below the BIOS region, + * then also reserve everything from the EBDA start address up to + * the BIOS region. + */ + if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; + + /* Reserve all memory between bios_start and the 1MB mark: */ + memblock_reserve(bios_start, 0x100000 - bios_start); +} diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c new file mode 100644 index 000000000..e963344b0 --- /dev/null +++ b/arch/x86/kernel/eisa.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * EISA specific code + */ +#include +#include +#include + +#include + +static __init int eisa_bus_probe(void) +{ + void __iomem *p; + + if (xen_pv_domain() && !xen_initial_domain()) + return 0; + + p = ioremap(0x0FFFD9, 4); + if (p && readl(p) == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24)) + EISA_bus = 1; + iounmap(p); + return 0; +} +subsys_initcall(eisa_bus_probe); diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c new file mode 100644 index 000000000..4fe7af58c --- /dev/null +++ b/arch/x86/kernel/espfix_64.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* ----------------------------------------------------------------------- * + * + * Copyright 2014 Intel Corporation; author: H. Peter Anvin + * + * ----------------------------------------------------------------------- */ + +/* + * The IRET instruction, when returning to a 16-bit segment, only + * restores the bottom 16 bits of the user space stack pointer. This + * causes some 16-bit software to break, but it also leaks kernel state + * to user space. + * + * This works around this by creating percpu "ministacks", each of which + * is mapped 2^16 times 64K apart. When we detect that the return SS is + * on the LDT, we copy the IRET frame to the ministack and use the + * relevant alias to return to userspace. The ministacks are mapped + * readonly, so if the IRET fault we promote #GP to #DF which is an IST + * vector and thus has its own stack; we then do the fixup in the #DF + * handler. + * + * This file sets up the ministacks and the related page tables. The + * actual ministack invocation is in entry_64.S. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round + * it up to a cache line to avoid unnecessary sharing. + */ +#define ESPFIX_STACK_SIZE (8*8UL) +#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) + +/* There is address space for how many espfix pages? */ +#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16)) + +#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) +#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS +# error "Need more virtual address space for the ESPFIX hack" +#endif + +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) + +/* This contains the *bottom* address of the espfix stack */ +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); + +/* Initialization mutex - should this be a spinlock? */ +static DEFINE_MUTEX(espfix_init_mutex); + +/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */ +#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE) +static void *espfix_pages[ESPFIX_MAX_PAGES]; + +static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD] + __aligned(PAGE_SIZE); + +static unsigned int page_random, slot_random; + +/* + * This returns the bottom address of the espfix stack for a specific CPU. + * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case + * we have to account for some amount of padding at the end of each page. + */ +static inline unsigned long espfix_base_addr(unsigned int cpu) +{ + unsigned long page, slot; + unsigned long addr; + + page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random; + slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE; + addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE); + addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16); + addr += ESPFIX_BASE_ADDR; + return addr; +} + +#define PTE_STRIDE (65536/PAGE_SIZE) +#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE) +#define ESPFIX_PMD_CLONES PTRS_PER_PMD +#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES)) + +#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX) + +static void init_espfix_random(void) +{ + unsigned long rand; + + /* + * This is run before the entropy pools are initialized, + * but this is hopefully better than nothing. + */ + if (!arch_get_random_long(&rand)) { + /* The constant is an arbitrary large prime */ + rand = rdtsc(); + rand *= 0xc345c6b72fd16123UL; + } + + slot_random = rand % ESPFIX_STACKS_PER_PAGE; + page_random = (rand / ESPFIX_STACKS_PER_PAGE) + & (ESPFIX_PAGE_SPACE - 1); +} + +void __init init_espfix_bsp(void) +{ + pgd_t *pgd; + p4d_t *p4d; + + /* Install the espfix pud into the kernel page directory */ + pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); + p4d_populate(&init_mm, p4d, espfix_pud_page); + + /* Randomize the locations */ + init_espfix_random(); + + /* The rest is the same as for any other processor */ + init_espfix_ap(0); +} + +void init_espfix_ap(int cpu) +{ + unsigned int page; + unsigned long addr; + pud_t pud, *pud_p; + pmd_t pmd, *pmd_p; + pte_t pte, *pte_p; + int n, node; + void *stack_page; + pteval_t ptemask; + + /* We only have to do this once... */ + if (likely(per_cpu(espfix_stack, cpu))) + return; /* Already initialized */ + + addr = espfix_base_addr(cpu); + page = cpu/ESPFIX_STACKS_PER_PAGE; + + /* Did another CPU already set this up? */ + stack_page = READ_ONCE(espfix_pages[page]); + if (likely(stack_page)) + goto done; + + mutex_lock(&espfix_init_mutex); + + /* Did we race on the lock? */ + stack_page = READ_ONCE(espfix_pages[page]); + if (stack_page) + goto unlock_done; + + node = cpu_to_node(cpu); + ptemask = __supported_pte_mask; + + pud_p = &espfix_pud_page[pud_index(addr)]; + pud = *pud_p; + if (!pud_present(pud)) { + struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); + + pmd_p = (pmd_t *)page_address(page); + pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PUD_CLONES; n++) + set_pud(&pud_p[n], pud); + } + + pmd_p = pmd_offset(&pud, addr); + pmd = *pmd_p; + if (!pmd_present(pmd)) { + struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); + + pte_p = (pte_t *)page_address(page); + pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PMD_CLONES; n++) + set_pmd(&pmd_p[n], pmd); + } + + pte_p = pte_offset_kernel(&pmd, addr); + stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); + /* + * __PAGE_KERNEL_* includes _PAGE_GLOBAL, which we want since + * this is mapped to userspace. + */ + pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); + for (n = 0; n < ESPFIX_PTE_CLONES; n++) + set_pte(&pte_p[n*PTE_STRIDE], pte); + + /* Job is done for this CPU and any CPU which shares this page */ + WRITE_ONCE(espfix_pages[page], stack_page); + +unlock_done: + mutex_unlock(&espfix_init_mutex); +done: + per_cpu(espfix_stack, cpu) = addr; + per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page + + (addr & ~PAGE_MASK); +} diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile new file mode 100644 index 000000000..78c562145 --- /dev/null +++ b/arch/x86/kernel/fpu/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Build rules for the FPU support code: +# + +obj-y += init.o bugs.o core.o regset.o signal.o xstate.o diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c new file mode 100644 index 000000000..2954fab15 --- /dev/null +++ b/arch/x86/kernel/fpu/bugs.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86 FPU bug checks: + */ +#include + +/* + * Boot time CPU/FPU FDIV bug detection code: + */ + +static double __initdata x = 4195835.0; +static double __initdata y = 3145727.0; + +/* + * This used to check for exceptions.. + * However, it turns out that to support that, + * the XMM trap handlers basically had to + * be buggy. So let's have a correct XMM trap + * handler, and forget about printing out + * some status at boot. + * + * We should really only care about bugs here + * anyway. Not features. + */ +void __init fpu__init_check_bugs(void) +{ + s32 fdiv_bug; + + /* kernel_fpu_begin/end() relies on patched alternative instructions. */ + if (!boot_cpu_has(X86_FEATURE_FPU)) + return; + + kernel_fpu_begin(); + + /* + * trap_init() enabled FXSR and company _before_ testing for FP + * problems here. + * + * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug + */ + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&fdiv_bug) + : "m" (*&x), "m" (*&y)); + + kernel_fpu_end(); + + if (fdiv_bug) { + set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); + pr_warn("Hmm, FPU with FDIV bug\n"); + } +} diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c new file mode 100644 index 000000000..835b94809 --- /dev/null +++ b/arch/x86/kernel/fpu/core.c @@ -0,0 +1,490 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + */ +#include +#include +#include +#include +#include +#include + +#include +#include + +#define CREATE_TRACE_POINTS +#include + +/* + * Represents the initial FPU state. It's mostly (but not completely) zeroes, + * depending on the FPU hardware format: + */ +union fpregs_state init_fpstate __read_mostly; + +/* Track in-kernel FPU usage */ +static DEFINE_PER_CPU(bool, in_kernel_fpu); + +/* + * Track which context is using the FPU on the CPU: + */ +DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + */ +bool irq_fpu_usable(void) +{ + if (WARN_ON_ONCE(in_nmi())) + return false; + + /* In kernel FPU usage already active? */ + if (this_cpu_read(in_kernel_fpu)) + return false; + + /* + * When not in NMI or hard interrupt context, FPU can be used in: + * + * - Task context except from within fpregs_lock()'ed critical + * regions. + * + * - Soft interrupt processing context which cannot happen + * while in a fpregs_lock()'ed critical region. + */ + if (!in_irq()) + return true; + + /* + * In hard interrupt context it's safe when soft interrupts + * are enabled, which means the interrupt did not hit in + * a fpregs_lock()'ed critical region. + */ + return !softirq_count(); +} +EXPORT_SYMBOL(irq_fpu_usable); + +/* + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact and we can + * keep registers active. + * + * The legacy FNSAVE instruction cleared all FPU state + * unconditionally, so registers are essentially destroyed. + * Modern FPU state can be kept in registers, if there are + * no pending FP exceptions. + */ +int copy_fpregs_to_fpstate(struct fpu *fpu) +{ + if (likely(use_xsave())) { + copy_xregs_to_kernel(&fpu->state.xsave); + + /* + * AVX512 state is tracked here because its use is + * known to slow the max clock speed of the core. + */ + if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) + fpu->avx512_timestamp = jiffies; + return 1; + } + + if (likely(use_fxsr())) { + copy_fxregs_to_kernel(fpu); + return 1; + } + + /* + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to mark them inactive: + */ + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); + + return 0; +} +EXPORT_SYMBOL(copy_fpregs_to_fpstate); + +void kernel_fpu_begin_mask(unsigned int kfpu_mask) +{ + preempt_disable(); + + WARN_ON_FPU(!irq_fpu_usable()); + WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); + + this_cpu_write(in_kernel_fpu, true); + + if (!(current->flags & PF_KTHREAD) && + !test_thread_flag(TIF_NEED_FPU_LOAD)) { + set_thread_flag(TIF_NEED_FPU_LOAD); + /* + * Ignore return value -- we don't care if reg state + * is clobbered. + */ + copy_fpregs_to_fpstate(¤t->thread.fpu); + } + __cpu_invalidate_fpregs_state(); + + /* Put sane initial values into the control registers. */ + if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) + ldmxcsr(MXCSR_DEFAULT); + + if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) + asm volatile ("fninit"); +} +EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); + +void kernel_fpu_end(void) +{ + WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); + + this_cpu_write(in_kernel_fpu, false); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_end); + +/* + * Save the FPU state (mark it for reload if necessary): + * + * This only ever gets called for the current task. + */ +void fpu__save(struct fpu *fpu) +{ + WARN_ON_FPU(fpu != ¤t->thread.fpu); + + fpregs_lock(); + trace_x86_fpu_before_save(fpu); + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + if (!copy_fpregs_to_fpstate(fpu)) { + copy_kernel_to_fpregs(&fpu->state); + } + } + + trace_x86_fpu_after_save(fpu); + fpregs_unlock(); +} + +/* + * Legacy x87 fpstate state init: + */ +static inline void fpstate_init_fstate(struct fregs_state *fp) +{ + fp->cwd = 0xffff037fu; + fp->swd = 0xffff0000u; + fp->twd = 0xffffffffu; + fp->fos = 0xffff0000u; +} + +void fpstate_init(union fpregs_state *state) +{ + if (!static_cpu_has(X86_FEATURE_FPU)) { + fpstate_init_soft(&state->soft); + return; + } + + memset(state, 0, fpu_kernel_xstate_size); + + if (static_cpu_has(X86_FEATURE_XSAVES)) + fpstate_init_xstate(&state->xsave); + if (static_cpu_has(X86_FEATURE_FXSR)) + fpstate_init_fxstate(&state->fxsave); + else + fpstate_init_fstate(&state->fsave); +} +EXPORT_SYMBOL_GPL(fpstate_init); + +int fpu__copy(struct task_struct *dst, struct task_struct *src) +{ + struct fpu *dst_fpu = &dst->thread.fpu; + struct fpu *src_fpu = &src->thread.fpu; + + dst_fpu->last_cpu = -1; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return 0; + + WARN_ON_FPU(src_fpu != ¤t->thread.fpu); + + /* + * Don't let 'init optimized' areas of the XSAVE area + * leak into the child task: + */ + memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); + + /* + * If the FPU registers are not current just memcpy() the state. + * Otherwise save current FPU registers directly into the child's FPU + * context, without any memory-to-memory copying. + * + * ( The function 'fails' in the FNSAVE case, which destroys + * register contents so we have to load them back. ) + */ + fpregs_lock(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); + + else if (!copy_fpregs_to_fpstate(dst_fpu)) + copy_kernel_to_fpregs(&dst_fpu->state); + + fpregs_unlock(); + + set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); + + trace_x86_fpu_copy_src(src_fpu); + trace_x86_fpu_copy_dst(dst_fpu); + + return 0; +} + +/* + * Activate the current task's in-memory FPU context, + * if it has not been used before: + */ +static void fpu__initialize(struct fpu *fpu) +{ + WARN_ON_FPU(fpu != ¤t->thread.fpu); + + set_thread_flag(TIF_NEED_FPU_LOAD); + fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); +} + +/* + * This function must be called before we read a task's fpstate. + * + * There's two cases where this gets called: + * + * - for the current task (when coredumping), in which case we have + * to save the latest FPU registers into the fpstate, + * + * - or it's called for stopped tasks (ptrace), in which case the + * registers were already saved by the context-switch code when + * the task scheduled out. + * + * If the task has used the FPU before then save it. + */ +void fpu__prepare_read(struct fpu *fpu) +{ + if (fpu == ¤t->thread.fpu) + fpu__save(fpu); +} + +/* + * This function must be called before we write a task's fpstate. + * + * Invalidate any cached FPU registers. + * + * After this function call, after registers in the fpstate are + * modified and the child task has woken up, the child task will + * restore the modified FPU state from the modified context. If we + * didn't clear its cached status here then the cached in-registers + * state pending on its former CPU could be restored, corrupting + * the modifications. + */ +void fpu__prepare_write(struct fpu *fpu) +{ + /* + * Only stopped child tasks can be used to modify the FPU + * state in the fpstate buffer: + */ + WARN_ON_FPU(fpu == ¤t->thread.fpu); + + /* Invalidate any cached state: */ + __fpu_invalidate_fpregs_state(fpu); +} + +/* + * Drops current FPU state: deactivates the fpregs and + * the fpstate. NOTE: it still leaves previous contents + * in the fpregs in the eager-FPU case. + * + * This function can be used in cases where we know that + * a state-restore is coming: either an explicit one, + * or a reschedule. + */ +void fpu__drop(struct fpu *fpu) +{ + preempt_disable(); + + if (fpu == ¤t->thread.fpu) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + fpregs_deactivate(fpu); + } + + trace_x86_fpu_dropped(fpu); + + preempt_enable(); +} + +/* + * Clear FPU registers by setting them up from the init fpstate. + * Caller must do fpregs_[un]lock() around it. + */ +static inline void copy_init_fpstate_to_fpregs(u64 features_mask) +{ + if (use_xsave()) + copy_kernel_to_xregs(&init_fpstate.xsave, features_mask); + else if (static_cpu_has(X86_FEATURE_FXSR)) + copy_kernel_to_fxregs(&init_fpstate.fxsave); + else + copy_kernel_to_fregs(&init_fpstate.fsave); + + if (boot_cpu_has(X86_FEATURE_OSPKE)) + copy_init_pkru_to_fpregs(); +} + +/* + * Clear the FPU state back to init state. + * + * Called by sys_execve(), by the signal handler code and by various + * error paths. + */ +static void fpu__clear(struct fpu *fpu, bool user_only) +{ + WARN_ON_FPU(fpu != ¤t->thread.fpu); + + if (!static_cpu_has(X86_FEATURE_FPU)) { + fpu__drop(fpu); + fpu__initialize(fpu); + return; + } + + fpregs_lock(); + + if (user_only) { + if (!fpregs_state_valid(fpu, smp_processor_id()) && + xfeatures_mask_supervisor()) + copy_kernel_to_xregs(&fpu->state.xsave, + xfeatures_mask_supervisor()); + copy_init_fpstate_to_fpregs(xfeatures_mask_user()); + } else { + copy_init_fpstate_to_fpregs(xfeatures_mask_all); + } + + fpregs_mark_activate(); + fpregs_unlock(); +} + +void fpu__clear_user_states(struct fpu *fpu) +{ + fpu__clear(fpu, true); +} + +void fpu__clear_all(struct fpu *fpu) +{ + fpu__clear(fpu, false); +} + +/* + * Load FPU context before returning to userspace. + */ +void switch_fpu_return(void) +{ + if (!static_cpu_has(X86_FEATURE_FPU)) + return; + + __fpregs_load_activate(); +} +EXPORT_SYMBOL_GPL(switch_fpu_return); + +#ifdef CONFIG_X86_DEBUG_FPU +/* + * If current FPU state according to its tracking (loaded FPU context on this + * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is + * loaded on return to userland. + */ +void fpregs_assert_state_consistent(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + return; + + WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); +} +EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); +#endif + +void fpregs_mark_activate(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + fpregs_activate(fpu); + fpu->last_cpu = smp_processor_id(); + clear_thread_flag(TIF_NEED_FPU_LOAD); +} +EXPORT_SYMBOL_GPL(fpregs_mark_activate); + +/* + * x87 math exception handling: + */ + +int fpu__exception_code(struct fpu *fpu, int trap_nr) +{ + int err; + + if (trap_nr == X86_TRAP_MF) { + unsigned short cwd, swd; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception. + */ + if (boot_cpu_has(X86_FEATURE_FXSR)) { + cwd = fpu->state.fxsave.cwd; + swd = fpu->state.fxsave.swd; + } else { + cwd = (unsigned short)fpu->state.fsave.cwd; + swd = (unsigned short)fpu->state.fsave.swd; + } + + err = swd & ~cwd; + } else { + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. + */ + unsigned short mxcsr = MXCSR_DEFAULT; + + if (boot_cpu_has(X86_FEATURE_XMM)) + mxcsr = fpu->state.fxsave.mxcsr; + + err = ~(mxcsr >> 7) & mxcsr; + } + + if (err & 0x001) { /* Invalid op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + return FPE_FLTINV; + } else if (err & 0x004) { /* Divide by Zero */ + return FPE_FLTDIV; + } else if (err & 0x008) { /* Overflow */ + return FPE_FLTOVF; + } else if (err & 0x012) { /* Denormal, Underflow */ + return FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ + return FPE_FLTRES; + } + + /* + * If we're using IRQ 13, or supposedly even some trap + * X86_TRAP_MF implementations, it's possible + * we get a spurious trap, which is not an error. + */ + return 0; +} diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c new file mode 100644 index 000000000..ed73f9b40 --- /dev/null +++ b/arch/x86/kernel/fpu/init.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * x86 FPU boot time init code: + */ +#include +#include +#include + +#include +#include +#include + +/* + * Initialize the registers found in all CPUs, CR0 and CR4: + */ +static void fpu__init_cpu_generic(void) +{ + unsigned long cr0; + unsigned long cr4_mask = 0; + + if (boot_cpu_has(X86_FEATURE_FXSR)) + cr4_mask |= X86_CR4_OSFXSR; + if (boot_cpu_has(X86_FEATURE_XMM)) + cr4_mask |= X86_CR4_OSXMMEXCPT; + if (cr4_mask) + cr4_set_bits(cr4_mask); + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ + if (!boot_cpu_has(X86_FEATURE_FPU)) + cr0 |= X86_CR0_EM; + write_cr0(cr0); + + /* Flush out any pending x87 state: */ +#ifdef CONFIG_MATH_EMULATION + if (!boot_cpu_has(X86_FEATURE_FPU)) + fpstate_init_soft(¤t->thread.fpu.state.soft); + else +#endif + asm volatile ("fninit"); +} + +/* + * Enable all supported FPU features. Called when a CPU is brought online: + */ +void fpu__init_cpu(void) +{ + fpu__init_cpu_generic(); + fpu__init_cpu_xstate(); +} + +static bool __init fpu__probe_without_cpuid(void) +{ + unsigned long cr0; + u16 fsw, fcw; + + fsw = fcw = 0xffff; + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS | X86_CR0_EM); + write_cr0(cr0); + + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw)); + + pr_info("x86/fpu: Probing for FPU: FSW=0x%04hx FCW=0x%04hx\n", fsw, fcw); + + return fsw == 0 && (fcw & 0x103f) == 0x003f; +} + +static void __init fpu__init_system_early_generic(void) +{ + if (!boot_cpu_has(X86_FEATURE_CPUID) && + !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { + if (fpu__probe_without_cpuid()) + setup_force_cpu_cap(X86_FEATURE_FPU); + else + setup_clear_cpu_cap(X86_FEATURE_FPU); + } + +#ifndef CONFIG_MATH_EMULATION + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_FPU)) { + pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n"); + for (;;) + asm volatile("hlt"); + } +#endif +} + +/* + * Boot time FPU feature detection code: + */ +unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +EXPORT_SYMBOL_GPL(mxcsr_feature_mask); + +static void __init fpu__init_system_mxcsr(void) +{ + unsigned int mask = 0; + + if (boot_cpu_has(X86_FEATURE_FXSR)) { + /* Static because GCC does not get 16-byte stack alignment right: */ + static struct fxregs_state fxregs __initdata; + + asm volatile("fxsave %0" : "+m" (fxregs)); + + mask = fxregs.mxcsr_mask; + + /* + * If zero then use the default features mask, + * which has all features set, except the + * denormals-are-zero feature bit: + */ + if (mask == 0) + mask = 0x0000ffbf; + } + mxcsr_feature_mask &= mask; +} + +/* + * Once per bootup FPU initialization sequences that will run on most x86 CPUs: + */ +static void __init fpu__init_system_generic(void) +{ + /* + * Set up the legacy init FPU context. (xstate init might overwrite this + * with a more modern format, if the CPU supports it.) + */ + fpstate_init(&init_fpstate); + + fpu__init_system_mxcsr(); +} + +/* + * Size of the FPU context state. All tasks in the system use the + * same context size, regardless of what portion they use. + * This is inherent to the XSAVE architecture which puts all state + * components into a single, continuous memory block: + */ +unsigned int fpu_kernel_xstate_size; +EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); + +/* + * Enforce that 'MEMBER' is the last field of 'TYPE'. + * + * Align the computed size with alignment of the TYPE, + * because that's how C aligns structs. + */ +#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ + BUILD_BUG_ON(sizeof(TYPE) != \ + ALIGN(offsetofend(TYPE, MEMBER), _Alignof(TYPE))) + +/* + * We append the 'struct fpu' to the task_struct: + */ +static void __init fpu__init_task_struct_size(void) +{ + int task_size = sizeof(struct task_struct); + + /* + * Subtract off the static size of the register state. + * It potentially has a bunch of padding. + */ + task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state); + + /* + * Add back the dynamically-calculated register state + * size. + */ + task_size += fpu_kernel_xstate_size; + + /* + * We dynamically size 'struct fpu', so we require that + * it be at the end of 'thread_struct' and that + * 'thread_struct' be at the end of 'task_struct'. If + * you hit a compile error here, check the structure to + * see if something got added to the end. + */ + CHECK_MEMBER_AT_END_OF(struct fpu, state); + CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); + CHECK_MEMBER_AT_END_OF(struct task_struct, thread); + + arch_task_struct_size = task_size; +} + +/* + * Set up the user and kernel xstate sizes based on the legacy FPU context size. + * + * We set this up first, and later it will be overwritten by + * fpu__init_system_xstate() if the CPU knows about xstates. + */ +static void __init fpu__init_system_xstate_size_legacy(void) +{ + static int on_boot_cpu __initdata = 1; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + + /* + * Note that xstate sizes might be overwritten later during + * fpu__init_system_xstate(). + */ + + if (!boot_cpu_has(X86_FEATURE_FPU)) { + fpu_kernel_xstate_size = sizeof(struct swregs_state); + } else { + if (boot_cpu_has(X86_FEATURE_FXSR)) + fpu_kernel_xstate_size = + sizeof(struct fxregs_state); + else + fpu_kernel_xstate_size = + sizeof(struct fregs_state); + } + + fpu_user_xstate_size = fpu_kernel_xstate_size; +} + +/* + * Find supported xfeatures based on cpu features and command-line input. + * This must be called after fpu__init_parse_early_param() is called and + * xfeatures_mask is enumerated. + */ +u64 __init fpu__get_supported_xfeatures_mask(void) +{ + return XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; +} + +/* Legacy code to initialize eager fpu mode. */ +static void __init fpu__init_system_ctx_switch(void) +{ + static bool on_boot_cpu __initdata = 1; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; +} + +/* + * Called on the boot CPU once per system bootup, to set up the initial + * FPU state that is later cloned into all processes: + */ +void __init fpu__init_system(void) +{ + fpu__init_system_early_generic(); + + /* + * The FPU has to be operational for some of the + * later FPU init activities: + */ + fpu__init_cpu(); + + fpu__init_system_generic(); + fpu__init_system_xstate_size_legacy(); + fpu__init_system_xstate(); + fpu__init_task_struct_size(); + + fpu__init_system_ctx_switch(); +} diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c new file mode 100644 index 000000000..6bb874441 --- /dev/null +++ b/arch/x86/kernel/fpu/regset.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FPU register's regset abstraction, for ptrace, core dumps, etc. + */ +#include +#include +#include +#include +#include + +/* + * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, + * as the "regset->n" for the xstate regset will be updated based on the feature + * capabilities supported by the xsave. + */ +int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return regset->n; +} + +int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + if (boot_cpu_has(X86_FEATURE_FXSR)) + return regset->n; + else + return 0; +} + +int xfpregs_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + struct fpu *fpu = &target->thread.fpu; + + if (!boot_cpu_has(X86_FEATURE_FXSR)) + return -ENODEV; + + fpu__prepare_read(fpu); + fpstate_sanitize_xstate(fpu); + + return membuf_write(&to, &fpu->state.fxsave, sizeof(struct fxregs_state)); +} + +int xfpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + int ret; + + if (!boot_cpu_has(X86_FEATURE_FXSR)) + return -ENODEV; + + fpu__prepare_write(fpu); + fpstate_sanitize_xstate(fpu); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &fpu->state.fxsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; + + /* + * update the header bits in the xsave header, indicating the + * presence of FP and SSE state. + */ + if (boot_cpu_has(X86_FEATURE_XSAVE)) + fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + + return ret; +} + +int xstateregs_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + struct fpu *fpu = &target->thread.fpu; + struct xregs_state *xsave; + + if (!boot_cpu_has(X86_FEATURE_XSAVE)) + return -ENODEV; + + xsave = &fpu->state.xsave; + + fpu__prepare_read(fpu); + + if (using_compacted_format()) { + copy_xstate_to_kernel(to, xsave); + return 0; + } else { + fpstate_sanitize_xstate(fpu); + /* + * Copy the 48 bytes defined by the software into the xsave + * area in the thread struct, so that we can copy the whole + * area to user using one user_regset_copyout(). + */ + memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); + + /* + * Copy the xstate memory layout. + */ + return membuf_write(&to, xsave, fpu_user_xstate_size); + } +} + +int xstateregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct xregs_state *xsave; + int ret; + + if (!boot_cpu_has(X86_FEATURE_XSAVE)) + return -ENODEV; + + /* + * A whole standard-format XSAVE buffer is needed: + */ + if (pos != 0 || count != fpu_user_xstate_size) + return -EFAULT; + + xsave = &fpu->state.xsave; + + fpu__prepare_write(fpu); + + if (using_compacted_format()) { + if (kbuf) + ret = copy_kernel_to_xstate(xsave, kbuf); + else + ret = copy_user_to_xstate(xsave, ubuf); + } else { + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + if (!ret) + ret = validate_user_xstate_header(&xsave->header); + } + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + + /* + * In case of failure, mark all states as init: + */ + if (ret) + fpstate_init(&fpu->state); + + return ret; +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + + return tmp; +} + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) +#define FP_EXP_TAG_VALID 0 +#define FP_EXP_TAG_ZERO 1 +#define FP_EXP_TAG_SPECIAL 2 +#define FP_EXP_TAG_EMPTY 3 + +static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave) +{ + struct _fpxreg *st; + u32 tos = (fxsave->swd >> 11) & 7; + u32 twd = (unsigned long) fxsave->twd; + u32 tag; + u32 ret = 0xffff0000u; + int i; + + for (i = 0; i < 8; i++, twd >>= 1) { + if (twd & 0x1) { + st = FPREG_ADDR(fxsave, (i - tos) & 7); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = FP_EXP_TAG_SPECIAL; + break; + case 0x0000: + if (!st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3]) + tag = FP_EXP_TAG_ZERO; + else + tag = FP_EXP_TAG_SPECIAL; + break; + default: + if (st->significand[3] & 0x8000) + tag = FP_EXP_TAG_VALID; + else + tag = FP_EXP_TAG_SPECIAL; + break; + } + } else { + tag = FP_EXP_TAG_EMPTY; + } + ret |= tag << (2 * i); + } + return ret; +} + +/* + * FXSR floating point environment conversions. + */ + +void +convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +{ + struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; + struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + env->cwd = fxsave->cwd | 0xffff0000u; + env->swd = fxsave->swd | 0xffff0000u; + env->twd = twd_fxsr_to_i387(fxsave); + +#ifdef CONFIG_X86_64 + env->fip = fxsave->rip; + env->foo = fxsave->rdp; + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. + */ + env->fcs = task_pt_regs(tsk)->cs; + if (tsk == current) { + savesegment(ds, env->fos); + } else { + env->fos = tsk->thread.ds; + } + env->fos |= 0xffff0000; +#else + env->fip = fxsave->fip; + env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); + env->foo = fxsave->foo; + env->fos = fxsave->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(to[0])); +} + +void convert_to_fxsr(struct fxregs_state *fxsave, + const struct user_i387_ia32_struct *env) + +{ + struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + fxsave->cwd = env->cwd; + fxsave->swd = env->swd; + fxsave->twd = twd_i387_to_fxsr(env->twd); + fxsave->fop = (u16) ((u32) env->fcs >> 16); +#ifdef CONFIG_X86_64 + fxsave->rip = env->fip; + fxsave->rdp = env->foo; + /* cs and ds ignored */ +#else + fxsave->fip = env->fip; + fxsave->fcs = (env->fcs & 0xffff); + fxsave->foo = env->foo; + fxsave->fos = env->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(from[0])); +} + +int fpregs_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + struct fpu *fpu = &target->thread.fpu; + struct user_i387_ia32_struct env; + + fpu__prepare_read(fpu); + + if (!boot_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_get(target, regset, to); + + if (!boot_cpu_has(X86_FEATURE_FXSR)) { + return membuf_write(&to, &fpu->state.fsave, + sizeof(struct fregs_state)); + } + + fpstate_sanitize_xstate(fpu); + + if (to.left == sizeof(env)) { + convert_from_fxsr(to.p, target); + return 0; + } + + convert_from_fxsr(&env, target); + return membuf_write(&to, &env, sizeof(env)); +} + +int fpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct user_i387_ia32_struct env; + int ret; + + fpu__prepare_write(fpu); + fpstate_sanitize_xstate(fpu); + + if (!boot_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); + + if (!boot_cpu_has(X86_FEATURE_FXSR)) + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &fpu->state.fsave, 0, + -1); + + if (pos > 0 || count < sizeof(env)) + convert_from_fxsr(&env, target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (!ret) + convert_to_fxsr(&target->thread.fpu.state.fxsave, &env); + + /* + * update the header bit in the xsave header, indicating the + * presence of FP. + */ + if (boot_cpu_has(X86_FEATURE_XSAVE)) + fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; + return ret; +} + +#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c new file mode 100644 index 000000000..b7b92cdf3 --- /dev/null +++ b/arch/x86/kernel/fpu/signal.c @@ -0,0 +1,540 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FPU signal frame handling routines. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; + +/* + * Check for the presence of extended state information in the + * user fpstate pointer in the sigcontext. + */ +static inline int check_for_xstate(struct fxregs_state __user *buf, + void __user *fpstate, + struct _fpx_sw_bytes *fx_sw) +{ + int min_xstate_size = sizeof(struct fxregs_state) + + sizeof(struct xstate_header); + unsigned int magic2; + + if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) + return -1; + + /* Check for the first magic field and other error scenarios. */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || + fx_sw->xstate_size < min_xstate_size || + fx_sw->xstate_size > fpu_user_xstate_size || + fx_sw->xstate_size > fx_sw->extended_size) + return -1; + + /* + * Check for the presence of second magic word at the end of memory + * layout. This detects the case where the user just copied the legacy + * fpstate layout with out copying the extended state information + * in the memory layout. + */ + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) + || magic2 != FP_XSTATE_MAGIC2) + return -1; + + return 0; +} + +/* + * Signal frame handlers. + */ +static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +{ + if (use_fxsr()) { + struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct user_i387_ia32_struct env; + struct _fpstate_32 __user *fp = buf; + + fpregs_lock(); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + copy_fxregs_to_kernel(&tsk->thread.fpu); + fpregs_unlock(); + + convert_from_fxsr(&env, tsk); + + if (__copy_to_user(buf, &env, sizeof(env)) || + __put_user(xsave->i387.swd, &fp->status) || + __put_user(X86_FXSR_MAGIC, &fp->magic)) + return -1; + } else { + struct fregs_state __user *fp = buf; + u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) + return -1; + } + + return 0; +} + +static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +{ + struct xregs_state __user *x = buf; + struct _fpx_sw_bytes *sw_bytes; + u32 xfeatures; + int err; + + /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ + sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; + err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + + if (!use_xsave()) + return err; + + err |= __put_user(FP_XSTATE_MAGIC2, + (__u32 __user *)(buf + fpu_user_xstate_size)); + + /* + * Read the xfeatures which we copied (directly from the cpu or + * from the state in task struct) to the user buffers. + */ + err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures); + + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. This will + * enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xfeatures in the xsave header. + * + * xsave aware apps can change the xfeatures in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + xfeatures |= XFEATURE_MASK_FPSSE; + + err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); + + return err; +} + +static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) +{ + int err; + + if (use_xsave()) + err = copy_xregs_to_user(buf); + else if (use_fxsr()) + err = copy_fxregs_to_user((struct fxregs_state __user *) buf); + else + err = copy_fregs_to_user((struct fregs_state __user *) buf); + + if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) + err = -EFAULT; + return err; +} + +/* + * Save the fpu, extended register state to the user signal frame. + * + * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save + * state is copied. + * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. + * + * buf == buf_fx for 64-bit frames and 32-bit fsave frame. + * buf != buf_fx for 32-bit frames with fxstate. + * + * Try to save it directly to the user frame with disabled page fault handler. + * If this fails then do the slow path where the FPU state is first saved to + * task's fpu->state and then copy it to the user frame pointed to by the + * aligned pointer 'buf_fx'. + * + * If this is a 32-bit frame with fxstate, put a fsave header before + * the aligned state at 'buf_fx'. + * + * For [f]xsave state, update the SW reserved fields in the [f]xsave frame + * indicating the absence/presence of the extended state to the user. + */ +int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +{ + struct task_struct *tsk = current; + int ia32_fxstate = (buf != buf_fx); + int ret; + + ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); + + if (!static_cpu_has(X86_FEATURE_FPU)) { + struct user_i387_ia32_struct fp; + fpregs_soft_get(current, NULL, (struct membuf){.p = &fp, + .left = sizeof(fp)}); + return copy_to_user(buf, &fp, sizeof(fp)) ? -EFAULT : 0; + } + + if (!access_ok(buf, size)) + return -EACCES; +retry: + /* + * Load the FPU registers if they are not valid for the current task. + * With a valid FPU state we can attempt to save the state directly to + * userland's stack frame which will likely succeed. If it does not, + * resolve the fault in the user memory and try again. + */ + fpregs_lock(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + __fpregs_load_activate(); + + pagefault_disable(); + ret = copy_fpregs_to_sigframe(buf_fx); + pagefault_enable(); + fpregs_unlock(); + + if (ret) { + if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size)) + goto retry; + return -EFAULT; + } + + /* Save the fsave header for the 32-bit frames. */ + if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) + return -1; + + if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) + return -1; + + return 0; +} + +static inline void +sanitize_restored_user_xstate(union fpregs_state *state, + struct user_i387_ia32_struct *ia32_env, + u64 user_xfeatures, int fx_only) +{ + struct xregs_state *xsave = &state->xsave; + struct xstate_header *header = &xsave->header; + + if (use_xsave()) { + /* + * Clear all feature bits which are not set in + * user_xfeatures and clear all extended features + * for fx_only mode. + */ + u64 mask = fx_only ? XFEATURE_MASK_FPSSE : user_xfeatures; + + /* + * Supervisor state has to be preserved. The sigframe + * restore can only modify user features, i.e. @mask + * cannot contain them. + */ + header->xfeatures &= mask | xfeatures_mask_supervisor(); + } + + if (use_fxsr()) { + /* + * mscsr reserved bits must be masked to zero for security + * reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + + if (ia32_env) + convert_to_fxsr(&state->fxsave, ia32_env); + } +} + +/* + * Restore the extended state if present. Otherwise, restore the FP/SSE state. + */ +static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) +{ + u64 init_bv; + int r; + + if (use_xsave()) { + if (fx_only) { + init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; + + r = copy_user_to_fxregs(buf); + if (!r) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return r; + } else { + init_bv = xfeatures_mask_user() & ~xbv; + + r = copy_user_to_xregs(buf, xbv); + if (!r && unlikely(init_bv)) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return r; + } + } else if (use_fxsr()) { + return copy_user_to_fxregs(buf); + } else + return copy_user_to_fregs(buf); +} + +static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) +{ + struct user_i387_ia32_struct *envp = NULL; + int state_size = fpu_kernel_xstate_size; + int ia32_fxstate = (buf != buf_fx); + struct task_struct *tsk = current; + struct fpu *fpu = &tsk->thread.fpu; + struct user_i387_ia32_struct env; + u64 user_xfeatures = 0; + int fx_only = 0; + int ret = 0; + + ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); + + if (!buf) { + fpu__clear_user_states(fpu); + return 0; + } + + if (!access_ok(buf, size)) { + ret = -EACCES; + goto out; + } + + if (!static_cpu_has(X86_FEATURE_FPU)) { + ret = fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); + goto out; + } + + if (use_xsave()) { + struct _fpx_sw_bytes fx_sw_user; + if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { + /* + * Couldn't find the extended state information in the + * memory layout. Restore just the FP/SSE and init all + * the other extended state. + */ + state_size = sizeof(struct fxregs_state); + fx_only = 1; + trace_x86_fpu_xstate_check_failed(fpu); + } else { + state_size = fx_sw_user.xstate_size; + user_xfeatures = fx_sw_user.xfeatures; + } + } + + if ((unsigned long)buf_fx % 64) + fx_only = 1; + + if (!ia32_fxstate) { + /* + * Attempt to restore the FPU registers directly from user + * memory. For that to succeed, the user access cannot cause + * page faults. If it does, fall back to the slow path below, + * going through the kernel buffer with the enabled pagefault + * handler. + */ + fpregs_lock(); + pagefault_disable(); + ret = copy_user_to_fpregs_zeroing(buf_fx, user_xfeatures, fx_only); + pagefault_enable(); + if (!ret) { + + /* + * Restore supervisor states: previous context switch + * etc has done XSAVES and saved the supervisor states + * in the kernel buffer from which they can be restored + * now. + * + * We cannot do a single XRSTORS here - which would + * be nice - because the rest of the FPU registers are + * being restored from a user buffer directly. The + * single XRSTORS happens below, when the user buffer + * has been copied to the kernel one. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD) && + xfeatures_mask_supervisor()) + copy_kernel_to_xregs(&fpu->state.xsave, + xfeatures_mask_supervisor()); + fpregs_mark_activate(); + fpregs_unlock(); + return 0; + } + + /* + * The above did an FPU restore operation, restricted to + * the user portion of the registers, and failed, but the + * microcode might have modified the FPU registers + * nevertheless. + * + * If the FPU registers do not belong to current, then + * invalidate the FPU register state otherwise the task might + * preempt current and return to user space with corrupted + * FPU registers. + * + * In case current owns the FPU registers then no further + * action is required. The fixup below will handle it + * correctly. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + __cpu_invalidate_fpregs_state(); + + fpregs_unlock(); + } else { + /* + * For 32-bit frames with fxstate, copy the fxstate so it can + * be reconstructed later. + */ + ret = __copy_from_user(&env, buf, sizeof(env)); + if (ret) + goto out; + envp = &env; + } + + /* + * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is + * not modified on context switch and that the xstate is considered + * to be loaded again on return to userland (overriding last_cpu avoids + * the optimisation). + */ + fpregs_lock(); + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + + /* + * Supervisor states are not modified by user space input. Save + * current supervisor states first and invalidate the FPU regs. + */ + if (xfeatures_mask_supervisor()) + copy_supervisor_to_kernel(&fpu->state.xsave); + set_thread_flag(TIF_NEED_FPU_LOAD); + } + __fpu_invalidate_fpregs_state(fpu); + fpregs_unlock(); + + if (use_xsave() && !fx_only) { + u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; + + ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); + if (ret) + goto out; + + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, + fx_only); + + fpregs_lock(); + if (unlikely(init_bv)) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + + /* + * Restore previously saved supervisor xstates along with + * copied-in user xstates. + */ + ret = copy_kernel_to_xregs_err(&fpu->state.xsave, + user_xfeatures | xfeatures_mask_supervisor()); + + } else if (use_fxsr()) { + ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); + if (ret) { + ret = -EFAULT; + goto out; + } + + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, + fx_only); + + fpregs_lock(); + if (use_xsave()) { + u64 init_bv; + + init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + } + + ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave); + } else { + ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); + if (ret) + goto out; + + fpregs_lock(); + ret = copy_kernel_to_fregs_err(&fpu->state.fsave); + } + if (!ret) + fpregs_mark_activate(); + else + fpregs_deactivate(fpu); + fpregs_unlock(); + +out: + if (ret) + fpu__clear_user_states(fpu); + return ret; +} + +static inline int xstate_sigframe_size(void) +{ + return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : + fpu_user_xstate_size; +} + +/* + * Restore FPU state from a sigframe: + */ +int fpu__restore_sig(void __user *buf, int ia32_frame) +{ + void __user *buf_fx = buf; + int size = xstate_sigframe_size(); + + if (ia32_frame && use_fxsr()) { + buf_fx = buf + sizeof(struct fregs_state); + size += sizeof(struct fregs_state); + } + + return __fpu__restore_sig(buf, buf_fx, size); +} + +unsigned long +fpu__alloc_mathframe(unsigned long sp, int ia32_frame, + unsigned long *buf_fx, unsigned long *size) +{ + unsigned long frame_size = xstate_sigframe_size(); + + *buf_fx = sp = round_down(sp - frame_size, 64); + if (ia32_frame && use_fxsr()) { + frame_size += sizeof(struct fregs_state); + sp -= sizeof(struct fregs_state); + } + + *size = frame_size; + + return sp; +} +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed by the fpstate pointer in the sigcontext. + * This will be saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +void fpu__init_prepare_fx_sw_frame(void) +{ + int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; + + fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; + fx_sw_reserved.extended_size = size; + fx_sw_reserved.xfeatures = xfeatures_mask_user(); + fx_sw_reserved.xstate_size = fpu_user_xstate_size; + + if (IS_ENABLED(CONFIG_IA32_EMULATION) || + IS_ENABLED(CONFIG_X86_32)) { + int fsave_header_size = sizeof(struct fregs_state); + + fx_sw_reserved_ia32 = fx_sw_reserved; + fx_sw_reserved_ia32.extended_size = size + fsave_header_size; + } +} + diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c new file mode 100644 index 000000000..b897feb51 --- /dev/null +++ b/arch/x86/kernel/fpu/xstate.c @@ -0,0 +1,1464 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * xsave/xrstor support. + * + * Author: Suresh Siddha + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +/* + * Although we spell it out in here, the Processor Trace + * xfeature is completely unused. We use other mechanisms + * to save/restore PT state in Linux. + */ +static const char *xfeature_names[] = +{ + "x87 floating point registers" , + "SSE registers" , + "AVX registers" , + "MPX bounds registers" , + "MPX CSR" , + "AVX-512 opmask" , + "AVX-512 Hi256" , + "AVX-512 ZMM_Hi256" , + "Processor Trace (unused)" , + "Protection Keys User registers", + "PASID state", + "unknown xstate feature" , +}; + +static short xsave_cpuid_features[] __initdata = { + X86_FEATURE_FPU, + X86_FEATURE_XMM, + X86_FEATURE_AVX, + X86_FEATURE_MPX, + X86_FEATURE_MPX, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_INTEL_PT, + X86_FEATURE_PKU, + X86_FEATURE_ENQCMD, +}; + +/* + * This represents the full set of bits that should ever be set in a kernel + * XSAVE buffer, both supervisor and user xstates. + */ +u64 xfeatures_mask_all __read_mostly; + +static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; + +/* + * The XSAVE area of kernel can be in standard or compacted format; + * it is always in standard format for user mode. This is the user + * mode standard format size used for signal and ptrace frames. + */ +unsigned int fpu_user_xstate_size; + +/* + * Return whether the system supports a given xfeature. + * + * Also return the name of the (most advanced) feature that the caller requested: + */ +int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) +{ + u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all; + + if (unlikely(feature_name)) { + long xfeature_idx, max_idx; + u64 xfeatures_print; + /* + * So we use FLS here to be able to print the most advanced + * feature that was requested but is missing. So if a driver + * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the + * missing AVX feature - this is the most informative message + * to users: + */ + if (xfeatures_missing) + xfeatures_print = xfeatures_missing; + else + xfeatures_print = xfeatures_needed; + + xfeature_idx = fls64(xfeatures_print)-1; + max_idx = ARRAY_SIZE(xfeature_names)-1; + xfeature_idx = min(xfeature_idx, max_idx); + + *feature_name = xfeature_names[xfeature_idx]; + } + + if (xfeatures_missing) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(cpu_has_xfeatures); + +static bool xfeature_is_supervisor(int xfeature_nr) +{ + /* + * Extended State Enumeration Sub-leaves (EAX = 0DH, ECX = n, n > 1) + * returns ECX[0] set to (1) for a supervisor state, and cleared (0) + * for a user state. + */ + u32 eax, ebx, ecx, edx; + + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return ecx & 1; +} + +/* + * When executing XSAVEOPT (or other optimized XSAVE instructions), if + * a processor implementation detects that an FPU state component is still + * (or is again) in its initialized state, it may clear the corresponding + * bit in the header.xfeatures field, and can skip the writeout of registers + * to the corresponding memory layout. + * + * This means that when the bit is zero, the state component might still contain + * some previous - non-initialized register state. + * + * Before writing xstate information to user-space we sanitize those components, + * to always ensure that the memory layout of a feature will be in the init state + * if the corresponding header bit is zero. This is to ensure that user-space doesn't + * see some stale state in the memory layout during signal handling, debugging etc. + */ +void fpstate_sanitize_xstate(struct fpu *fpu) +{ + struct fxregs_state *fx = &fpu->state.fxsave; + int feature_bit; + u64 xfeatures; + + if (!use_xsaveopt()) + return; + + xfeatures = fpu->state.xsave.header.xfeatures; + + /* + * None of the feature bits are in init state. So nothing else + * to do for us, as the memory layout is up to date. + */ + if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all) + return; + + /* + * FP is in init state + */ + if (!(xfeatures & XFEATURE_MASK_FP)) { + fx->cwd = 0x37f; + fx->swd = 0; + fx->twd = 0; + fx->fop = 0; + fx->rip = 0; + fx->rdp = 0; + memset(&fx->st_space[0], 0, 128); + } + + /* + * SSE is in init state + */ + if (!(xfeatures & XFEATURE_MASK_SSE)) + memset(&fx->xmm_space[0], 0, 256); + + /* + * First two features are FPU and SSE, which above we handled + * in a special way already: + */ + feature_bit = 0x2; + xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2; + + /* + * Update all the remaining memory layouts according to their + * standard xstate layout, if their header bit is in the init + * state: + */ + while (xfeatures) { + if (xfeatures & 0x1) { + int offset = xstate_comp_offsets[feature_bit]; + int size = xstate_sizes[feature_bit]; + + memcpy((void *)fx + offset, + (void *)&init_fpstate.xsave + offset, + size); + } + + xfeatures >>= 1; + feature_bit++; + } +} + +/* + * Enable the extended processor state save/restore feature. + * Called once per CPU onlining. + */ +void fpu__init_cpu_xstate(void) +{ + u64 unsup_bits; + + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all) + return; + /* + * Unsupported supervisor xstates should not be found in + * the xfeatures mask. + */ + unsup_bits = xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; + WARN_ONCE(unsup_bits, "x86/fpu: Found unsupported supervisor xstates: 0x%llx\n", + unsup_bits); + + xfeatures_mask_all &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; + + cr4_set_bits(X86_CR4_OSXSAVE); + + /* + * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features + * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user + * states can be set here. + */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + + /* + * MSR_IA32_XSS sets supervisor states managed by XSAVES. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_dynamic()); + } +} + +static bool xfeature_enabled(enum xfeature xfeature) +{ + return xfeatures_mask_all & BIT_ULL(xfeature); +} + +/* + * Record the offsets and sizes of various xstates contained + * in the XSAVE state memory layout. + */ +static void __init setup_xstate_features(void) +{ + u32 eax, ebx, ecx, edx, i; + /* start at the beginnning of the "extended state" */ + unsigned int last_good_offset = offsetof(struct xregs_state, + extended_state_area); + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. + */ + xstate_offsets[XFEATURE_FP] = 0; + xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, + xmm_space); + + xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; + xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, + xmm_space); + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + + xstate_sizes[i] = eax; + + /* + * If an xfeature is supervisor state, the offset in EBX is + * invalid, leave it to -1. + */ + if (xfeature_is_supervisor(i)) + continue; + + xstate_offsets[i] = ebx; + + /* + * In our xstate size checks, we assume that the highest-numbered + * xstate feature has the highest offset in the buffer. Ensure + * it does. + */ + WARN_ONCE(last_good_offset > xstate_offsets[i], + "x86/fpu: misordered xstate at %d\n", last_good_offset); + + last_good_offset = xstate_offsets[i]; + } +} + +static void __init print_xstate_feature(u64 xstate_mask) +{ + const char *feature_name; + + if (cpu_has_xfeatures(xstate_mask, &feature_name)) + pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name); +} + +/* + * Print out all the supported xstate features: + */ +static void __init print_xstate_features(void) +{ + print_xstate_feature(XFEATURE_MASK_FP); + print_xstate_feature(XFEATURE_MASK_SSE); + print_xstate_feature(XFEATURE_MASK_YMM); + print_xstate_feature(XFEATURE_MASK_BNDREGS); + print_xstate_feature(XFEATURE_MASK_BNDCSR); + print_xstate_feature(XFEATURE_MASK_OPMASK); + print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); + print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); + print_xstate_feature(XFEATURE_MASK_PKRU); + print_xstate_feature(XFEATURE_MASK_PASID); +} + +/* + * This check is important because it is easy to get XSTATE_* + * confused with XSTATE_BIT_*. + */ +#define CHECK_XFEATURE(nr) do { \ + WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ + WARN_ON(nr >= XFEATURE_MAX); \ +} while (0) + +/* + * We could cache this like xstate_size[], but we only use + * it here, so it would be a waste of space. + */ +static int xfeature_is_aligned(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + + if (!xfeature_enabled(xfeature_nr)) { + WARN_ONCE(1, "Checking alignment of disabled xfeature %d\n", + xfeature_nr); + return 0; + } + + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + /* + * The value returned by ECX[1] indicates the alignment + * of state component 'i' when the compacted format + * of the extended region of an XSAVE area is used: + */ + return !!(ecx & 2); +} + +/* + * This function sets up offsets and sizes of all extended states in + * xsave area. This supports both standard format and compacted format + * of the xsave area. + */ +static void __init setup_xstate_comp_offsets(void) +{ + unsigned int next_offset; + int i; + + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. + */ + xstate_comp_offsets[XFEATURE_FP] = 0; + xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, + xmm_space); + + if (!boot_cpu_has(X86_FEATURE_XSAVES)) { + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (xfeature_enabled(i)) + xstate_comp_offsets[i] = xstate_offsets[i]; + } + return; + } + + next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + + if (xfeature_is_aligned(i)) + next_offset = ALIGN(next_offset, 64); + + xstate_comp_offsets[i] = next_offset; + next_offset += xstate_sizes[i]; + } +} + +/* + * Setup offsets of a supervisor-state-only XSAVES buffer: + * + * The offsets stored in xstate_comp_offsets[] only work for one specific + * value of the Requested Feature BitMap (RFBM). In cases where a different + * RFBM value is used, a different set of offsets is required. This set of + * offsets is for when RFBM=xfeatures_mask_supervisor(). + */ +static void __init setup_supervisor_only_offsets(void) +{ + unsigned int next_offset; + int i; + + next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i) || !xfeature_is_supervisor(i)) + continue; + + if (xfeature_is_aligned(i)) + next_offset = ALIGN(next_offset, 64); + + xstate_supervisor_only_offsets[i] = next_offset; + next_offset += xstate_sizes[i]; + } +} + +/* + * Print out xstate component offsets and sizes + */ +static void __init print_xstate_offset_size(void) +{ + int i; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", + i, xstate_comp_offsets[i], i, xstate_sizes[i]); + } +} + +/* + * All supported features have either init state all zeros or are + * handled in setup_init_fpu() individually. This is an explicit + * feature list and does not use XFEATURE_MASK*SUPPORTED to catch + * newly added supported features at build time and make people + * actually look at the init state for the new feature. + */ +#define XFEATURES_INIT_FPSTATE_HANDLED \ + (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_PASID) + +/* + * setup the xstate image representing the init state + */ +static void __init setup_init_fpu_buf(void) +{ + static int on_boot_cpu __initdata = 1; + + BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED) != + XFEATURES_INIT_FPSTATE_HANDLED); + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + + if (!boot_cpu_has(X86_FEATURE_XSAVE)) + return; + + setup_xstate_features(); + print_xstate_features(); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | + xfeatures_mask_all; + + /* + * Init all the features state with header.xfeatures being 0x0 + */ + copy_kernel_to_xregs_booting(&init_fpstate.xsave); + + /* + * All components are now in init state. Read the state back so + * that init_fpstate contains all non-zero init state. This only + * works with XSAVE, but not with XSAVEOPT and XSAVES because + * those use the init optimization which skips writing data for + * components in init state. + * + * XSAVE could be used, but that would require to reshuffle the + * data when XSAVES is available because XSAVES uses xstate + * compaction. But doing so is a pointless exercise because most + * components have an all zeros init state except for the legacy + * ones (FP and SSE). Those can be saved with FXSAVE into the + * legacy area. Adding new features requires to ensure that init + * state is all zeroes or if not to add the necessary handling + * here. + */ + fxsave(&init_fpstate.fxsave); +} + +static int xfeature_uncompacted_offset(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + /* + * Only XSAVES supports supervisor states and it uses compacted + * format. Checking a supervisor state's uncompacted offset is + * an error. + */ + if (XFEATURE_MASK_SUPERVISOR_ALL & BIT_ULL(xfeature_nr)) { + WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); + return -1; + } + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return ebx; +} + +int xfeature_size(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return eax; +} + +/* + * 'XSAVES' implies two different things: + * 1. saving of supervisor/system state + * 2. using the compacted format + * + * Use this function when dealing with the compacted format so + * that it is obvious which aspect of 'XSAVES' is being handled + * by the calling code. + */ +int using_compacted_format(void) +{ + return boot_cpu_has(X86_FEATURE_XSAVES); +} + +/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ +int validate_user_xstate_header(const struct xstate_header *hdr) +{ + /* No unknown or supervisor features may be set */ + if (hdr->xfeatures & ~xfeatures_mask_user()) + return -EINVAL; + + /* Userspace must use the uncompacted format */ + if (hdr->xcomp_bv) + return -EINVAL; + + /* + * If 'reserved' is shrunken to add a new field, make sure to validate + * that new field here! + */ + BUILD_BUG_ON(sizeof(hdr->reserved) != 48); + + /* No reserved bits may be set */ + if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) + return -EINVAL; + + return 0; +} + +static void __xstate_dump_leaves(void) +{ + int i; + u32 eax, ebx, ecx, edx; + static int should_dump = 1; + + if (!should_dump) + return; + should_dump = 0; + /* + * Dump out a few leaves past the ones that we support + * just in case there are some goodies up there + */ + for (i = 0; i < XFEATURE_MAX + 10; i++) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", + XSTATE_CPUID, i, eax, ebx, ecx, edx); + } +} + +#define XSTATE_WARN_ON(x) do { \ + if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \ + __xstate_dump_leaves(); \ + } \ +} while (0) + +#define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \ + if ((nr == nr_macro) && \ + WARN_ONCE(sz != sizeof(__struct), \ + "%s: struct is %zu bytes, cpu state %d bytes\n", \ + __stringify(nr_macro), sizeof(__struct), sz)) { \ + __xstate_dump_leaves(); \ + } \ +} while (0) + +/* + * We have a C struct for each 'xstate'. We need to ensure + * that our software representation matches what the CPU + * tells us about the state's size. + */ +static void check_xstate_against_struct(int nr) +{ + /* + * Ask the CPU for the size of the state. + */ + int sz = xfeature_size(nr); + /* + * Match each CPU state with the corresponding software + * structure. + */ + XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct); + XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state); + XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state); + XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); + XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); + XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); + XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); + XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); + + /* + * Make *SURE* to add any feature numbers in below if + * there are "holes" in the xsave state component + * numbers. + */ + if ((nr < XFEATURE_YMM) || + (nr >= XFEATURE_MAX) || + (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || + ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) { + WARN_ONCE(1, "no structure for xstate: %d\n", nr); + XSTATE_WARN_ON(1); + } +} + +/* + * This essentially double-checks what the cpu told us about + * how large the XSAVE buffer needs to be. We are recalculating + * it to be safe. + * + * Dynamic XSAVE features allocate their own buffers and are not + * covered by these checks. Only the size of the buffer for task->fpu + * is checked here. + */ +static void do_extra_xstate_size_checks(void) +{ + int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + int i; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + + check_xstate_against_struct(i); + /* + * Supervisor state components can be managed only by + * XSAVES, which is compacted-format only. + */ + if (!using_compacted_format()) + XSTATE_WARN_ON(xfeature_is_supervisor(i)); + + /* Align from the end of the previous feature */ + if (xfeature_is_aligned(i)) + paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64); + /* + * The offset of a given state in the non-compacted + * format is given to us in a CPUID leaf. We check + * them for being ordered (increasing offsets) in + * setup_xstate_features(). + */ + if (!using_compacted_format()) + paranoid_xstate_size = xfeature_uncompacted_offset(i); + /* + * The compacted-format offset always depends on where + * the previous state ended. + */ + paranoid_xstate_size += xfeature_size(i); + } + XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); +} + + +/* + * Get total size of enabled xstates in XCR0 | IA32_XSS. + * + * Note the SDM's wording here. "sub-function 0" only enumerates + * the size of the *user* states. If we use it to size a buffer + * that we use 'XSAVES' on, we could potentially overflow the + * buffer because 'XSAVES' saves system states too. + */ +static unsigned int __init get_xsaves_size(void) +{ + unsigned int eax, ebx, ecx, edx; + /* + * - CPUID function 0DH, sub-function 1: + * EBX enumerates the size (in bytes) required by + * the XSAVES instruction for an XSAVE area + * containing all the state components + * corresponding to bits currently set in + * XCR0 | IA32_XSS. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + return ebx; +} + +/* + * Get the total size of the enabled xstates without the dynamic supervisor + * features. + */ +static unsigned int __init get_xsaves_size_no_dynamic(void) +{ + u64 mask = xfeatures_mask_dynamic(); + unsigned int size; + + if (!mask) + return get_xsaves_size(); + + /* Disable dynamic features. */ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + + /* + * Ask the hardware what size is required of the buffer. + * This is the size required for the task->fpu buffer. + */ + size = get_xsaves_size(); + + /* Re-enable dynamic features so XSAVES will work on them again. */ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); + + return size; +} + +static unsigned int __init get_xsave_size(void) +{ + unsigned int eax, ebx, ecx, edx; + /* + * - CPUID function 0DH, sub-function 0: + * EBX enumerates the size (in bytes) required by + * the XSAVE instruction for an XSAVE area + * containing all the *user* state components + * corresponding to bits currently set in XCR0. + */ + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + return ebx; +} + +/* + * Will the runtime-enumerated 'xstate_size' fit in the init + * task's statically-allocated buffer? + */ +static bool is_supported_xstate_size(unsigned int test_xstate_size) +{ + if (test_xstate_size <= sizeof(union fpregs_state)) + return true; + + pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", + sizeof(union fpregs_state), test_xstate_size); + return false; +} + +static int __init init_xstate_size(void) +{ + /* Recompute the context size for enabled features: */ + unsigned int possible_xstate_size; + unsigned int xsave_size; + + xsave_size = get_xsave_size(); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + possible_xstate_size = get_xsaves_size_no_dynamic(); + else + possible_xstate_size = xsave_size; + + /* Ensure we have the space to store all enabled: */ + if (!is_supported_xstate_size(possible_xstate_size)) + return -EINVAL; + + /* + * The size is OK, we are definitely going to use xsave, + * make it known to the world that we need more space. + */ + fpu_kernel_xstate_size = possible_xstate_size; + do_extra_xstate_size_checks(); + + /* + * User space is always in standard format. + */ + fpu_user_xstate_size = xsave_size; + return 0; +} + +/* + * We enabled the XSAVE hardware, but something went wrong and + * we can not use it. Disable it. + */ +static void fpu__init_disable_system_xstate(void) +{ + xfeatures_mask_all = 0; + cr4_clear_bits(X86_CR4_OSXSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVE); +} + +/* + * Enable and initialize the xsave feature. + * Called once per system bootup. + */ +void __init fpu__init_system_xstate(void) +{ + unsigned int eax, ebx, ecx, edx; + static int on_boot_cpu __initdata = 1; + int err; + int i; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + + if (!boot_cpu_has(X86_FEATURE_FPU)) { + pr_info("x86/fpu: No FPU detected\n"); + return; + } + + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { + pr_info("x86/fpu: x87 FPU will use %s\n", + boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); + return; + } + + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN_ON_FPU(1); + return; + } + + /* + * Find user xstates supported by the processor. + */ + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + xfeatures_mask_all = eax + ((u64)edx << 32); + + /* + * Find supervisor xstates supported by the processor. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + xfeatures_mask_all |= ecx + ((u64)edx << 32); + + if ((xfeatures_mask_user() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + /* + * This indicates that something really unexpected happened + * with the enumeration. Disable XSAVE and try to continue + * booting without it. This is too early to BUG(). + */ + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", + xfeatures_mask_all); + goto out_disable; + } + + /* + * Clear XSAVE features that are disabled in the normal CPUID. + */ + for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { + if (!boot_cpu_has(xsave_cpuid_features[i])) + xfeatures_mask_all &= ~BIT_ULL(i); + } + + xfeatures_mask_all &= fpu__get_supported_xfeatures_mask(); + + /* Enable xstate instructions to be able to continue with initialization: */ + fpu__init_cpu_xstate(); + err = init_xstate_size(); + if (err) + goto out_disable; + + /* + * Update info used for ptrace frames; use standard-format size and no + * supervisor xstates: + */ + update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user()); + + fpu__init_prepare_fx_sw_frame(); + setup_init_fpu_buf(); + setup_xstate_comp_offsets(); + setup_supervisor_only_offsets(); + + /* + * CPU capabilities initialization runs before FPU init. So + * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely + * functional, set the feature bit so depending code works. + */ + setup_force_cpu_cap(X86_FEATURE_OSXSAVE); + + print_xstate_offset_size(); + + pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", + xfeatures_mask_all, + fpu_kernel_xstate_size, + boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); + return; + +out_disable: + /* something went wrong, try to boot without any XSAVE support */ + fpu__init_disable_system_xstate(); +} + +/* + * Restore minimal FPU state after suspend: + */ +void fpu__resume_cpu(void) +{ + /* + * Restore XCR0 on xsave capable CPUs: + */ + if (boot_cpu_has(X86_FEATURE_XSAVE)) + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + + /* + * Restore IA32_XSS. The same CPUID bit enumerates support + * of XSAVES and MSR_IA32_XSS. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_dynamic()); + } +} + +/* + * Given an xstate feature nr, calculate where in the xsave + * buffer the state is. Callers should ensure that the buffer + * is valid. + */ +static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) +{ + if (!xfeature_enabled(xfeature_nr)) { + WARN_ON_FPU(1); + return NULL; + } + + return (void *)xsave + xstate_comp_offsets[xfeature_nr]; +} +/* + * Given the xsave area and a state inside, this function returns the + * address of the state. + * + * This is the API that is called to get xstate address in either + * standard format or compacted format of xsave area. + * + * Note that if there is no data for the field in the xsave buffer + * this will return NULL. + * + * Inputs: + * xstate: the thread's storage area for all FPU data + * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, + * XFEATURE_SSE, etc...) + * Output: + * address of the state in the xsave area, or NULL if the + * field is not present in the xsave buffer. + */ +void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) +{ + /* + * Do we even *have* xsave state? + */ + if (!boot_cpu_has(X86_FEATURE_XSAVE)) + return NULL; + + /* + * We should not ever be requesting features that we + * have not enabled. + */ + WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)), + "get of unsupported state"); + /* + * This assumes the last 'xsave*' instruction to + * have requested that 'xfeature_nr' be saved. + * If it did not, we might be seeing and old value + * of the field in the buffer. + * + * This can happen because the last 'xsave' did not + * request that this feature be saved (unlikely) + * or because the "init optimization" caused it + * to not be saved. + */ + if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) + return NULL; + + return __raw_xsave_addr(xsave, xfeature_nr); +} +EXPORT_SYMBOL_GPL(get_xsave_addr); + +/* + * This wraps up the common operations that need to occur when retrieving + * data from xsave state. It first ensures that the current task was + * using the FPU and retrieves the data in to a buffer. It then calculates + * the offset of the requested field in the buffer. + * + * This function is safe to call whether the FPU is in use or not. + * + * Note that this only works on the current task. + * + * Inputs: + * @xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, + * XFEATURE_SSE, etc...) + * Output: + * address of the state in the xsave area or NULL if the state + * is not present or is in its 'init state'. + */ +const void *get_xsave_field_ptr(int xfeature_nr) +{ + struct fpu *fpu = ¤t->thread.fpu; + + /* + * fpu__save() takes the CPU's xstate registers + * and saves them off to the 'fpu memory buffer. + */ + fpu__save(fpu); + + return get_xsave_addr(&fpu->state.xsave, xfeature_nr); +} + +#ifdef CONFIG_ARCH_HAS_PKEYS + +/* + * This will go out and modify PKRU register to set the access + * rights for @pkey to @init_val. + */ +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val) +{ + u32 old_pkru; + int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); + u32 new_pkru_bits = 0; + + /* + * This check implies XSAVE support. OSPKE only gets + * set if we enable XSAVE and we enable PKU in XCR0. + */ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return -EINVAL; + + /* + * This code should only be called with valid 'pkey' + * values originating from in-kernel users. Complain + * if a bad value is observed. + */ + WARN_ON_ONCE(pkey >= arch_max_pkey()); + + /* Set the bits we need in PKRU: */ + if (init_val & PKEY_DISABLE_ACCESS) + new_pkru_bits |= PKRU_AD_BIT; + if (init_val & PKEY_DISABLE_WRITE) + new_pkru_bits |= PKRU_WD_BIT; + + /* Shift the bits in to the correct place in PKRU for pkey: */ + new_pkru_bits <<= pkey_shift; + + /* Get old PKRU and mask off any old bits in place: */ + old_pkru = read_pkru(); + old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); + + /* Write old part along with new part: */ + write_pkru(old_pkru | new_pkru_bits); + + return 0; +} +#endif /* ! CONFIG_ARCH_HAS_PKEYS */ + +/* + * Weird legacy quirk: SSE and YMM states store information in the + * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP + * area is marked as unused in the xfeatures header, we need to copy + * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use. + */ +static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) +{ + if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) + return false; + + if (xfeatures & XFEATURE_MASK_FP) + return false; + + return true; +} + +static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, + void *init_xstate, unsigned int size) +{ + membuf_write(to, from_xstate ? xstate : init_xstate, size); +} + +/* + * Convert from kernel XSAVES compacted format to standard format and copy + * to a kernel-space ptrace buffer. + * + * It supports partial copy but pos always starts from zero. This is called + * from xstateregs_get() and there we check the CPU has XSAVES. + */ +void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) +{ + const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); + struct xregs_state *xinit = &init_fpstate.xsave; + struct xstate_header header; + unsigned int zerofrom; + int i; + + /* + * The destination is a ptrace buffer; we put in only user xstates: + */ + memset(&header, 0, sizeof(header)); + header.xfeatures = xsave->header.xfeatures; + header.xfeatures &= xfeatures_mask_user(); + + /* Copy FP state up to MXCSR */ + copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, + &xinit->i387, off_mxcsr); + + /* Copy MXCSR when SSE or YMM are set in the feature mask */ + copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), + &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, + MXCSR_AND_FLAGS_SIZE); + + /* Copy the remaining FP state */ + copy_feature(header.xfeatures & XFEATURE_MASK_FP, + &to, &xsave->i387.st_space, &xinit->i387.st_space, + sizeof(xsave->i387.st_space)); + + /* Copy the SSE state - shared with YMM, but independently managed */ + copy_feature(header.xfeatures & XFEATURE_MASK_SSE, + &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, + sizeof(xsave->i387.xmm_space)); + + /* Zero the padding area */ + membuf_zero(&to, sizeof(xsave->i387.padding)); + + /* Copy xsave->i387.sw_reserved */ + membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); + + /* Copy the user space relevant state of @xsave->header */ + membuf_write(&to, &header, sizeof(header)); + + zerofrom = offsetof(struct xregs_state, extended_state_area); + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + /* + * The ptrace buffer is in non-compacted XSAVE format. + * In non-compacted format disabled features still occupy + * state space, but there is no state to copy from in the + * compacted init_fpstate. The gap tracking will zero this + * later. + */ + if (!(xfeatures_mask_user() & BIT_ULL(i))) + continue; + + /* + * If there was a feature or alignment gap, zero the space + * in the destination buffer. + */ + if (zerofrom < xstate_offsets[i]) + membuf_zero(&to, xstate_offsets[i] - zerofrom); + + copy_feature(header.xfeatures & BIT_ULL(i), &to, + __raw_xsave_addr(xsave, i), + __raw_xsave_addr(xinit, i), + xstate_sizes[i]); + + /* + * Keep track of the last copied state in the non-compacted + * target buffer for gap zeroing. + */ + zerofrom = xstate_offsets[i] + xstate_sizes[i]; + } + + if (to.left) + membuf_zero(&to, to.left); +} + +/* + * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format + * and copy to the target thread. This is called from xstateregs_set(). + */ +int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) +{ + unsigned int offset, size; + int i; + struct xstate_header hdr; + + offset = offsetof(struct xregs_state, header); + size = sizeof(hdr); + + memcpy(&hdr, kbuf + offset, size); + + if (validate_user_xstate_header(&hdr)) + return -EINVAL; + + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = ((u64)1 << i); + + if (hdr.xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + memcpy(dst, kbuf + offset, size); + } + } + + if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + memcpy(&xsave->i387.mxcsr, kbuf + offset, size); + } + + /* + * The state that came in from userspace was user-state only. + * Mask all the user states out of 'xfeatures': + */ + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; + + /* + * Add back in the features that came in from userspace: + */ + xsave->header.xfeatures |= hdr.xfeatures; + + return 0; +} + +/* + * Convert from a ptrace or sigreturn standard-format user-space buffer to + * kernel XSAVES format and copy to the target thread. This is called from + * xstateregs_set(), as well as potentially from the sigreturn() and + * rt_sigreturn() system calls. + */ +int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) +{ + unsigned int offset, size; + int i; + struct xstate_header hdr; + + offset = offsetof(struct xregs_state, header); + size = sizeof(hdr); + + if (__copy_from_user(&hdr, ubuf + offset, size)) + return -EFAULT; + + if (validate_user_xstate_header(&hdr)) + return -EINVAL; + + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = ((u64)1 << i); + + if (hdr.xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + if (__copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; + } + } + + if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) + return -EFAULT; + } + + /* + * The state that came in from userspace was user-state only. + * Mask all the user states out of 'xfeatures': + */ + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; + + /* + * Add back in the features that came in from userspace: + */ + xsave->header.xfeatures |= hdr.xfeatures; + + return 0; +} + +/* + * Save only supervisor states to the kernel buffer. This blows away all + * old states, and is intended to be used only in __fpu__restore_sig(), where + * user states are restored from the user buffer. + */ +void copy_supervisor_to_kernel(struct xregs_state *xstate) +{ + struct xstate_header *header; + u64 max_bit, min_bit; + u32 lmask, hmask; + int err, i; + + if (WARN_ON(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (!xfeatures_mask_supervisor()) + return; + + max_bit = __fls(xfeatures_mask_supervisor()); + min_bit = __ffs(xfeatures_mask_supervisor()); + + lmask = xfeatures_mask_supervisor(); + hmask = xfeatures_mask_supervisor() >> 32; + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + if (WARN_ON_FPU(err)) + return; + + /* + * At this point, the buffer has only supervisor states and must be + * converted back to normal kernel format. + */ + header = &xstate->header; + header->xcomp_bv |= xfeatures_mask_all; + + /* + * This only moves states up in the buffer. Start with + * the last state and move backwards so that states are + * not overwritten until after they are moved. Note: + * memmove() allows overlapping src/dst buffers. + */ + for (i = max_bit; i >= min_bit; i--) { + u8 *xbuf = (u8 *)xstate; + + if (!((header->xfeatures >> i) & 1)) + continue; + + /* Move xfeature 'i' into its normal location */ + memmove(xbuf + xstate_comp_offsets[i], + xbuf + xstate_supervisor_only_offsets[i], + xstate_sizes[i]); + } +} + +/** + * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to + * an xsave area + * @xstate: A pointer to an xsave area + * @mask: Represent the dynamic supervisor features saved into the xsave area + * + * Only the dynamic supervisor states sets in the mask are saved into the xsave + * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic + * supervisor feature). Besides the dynamic supervisor states, the legacy + * region and XSAVE header are also saved into the xsave area. The supervisor + * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved. + * + * The xsave area must be 64-bytes aligned. + */ +void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) +{ + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u32 lmask, hmask; + int err; + + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (WARN_ON_FPU(!dynamic_mask)) + return; + + lmask = dynamic_mask; + hmask = dynamic_mask >> 32; + + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); + + /* Should never fault when copying to a kernel buffer */ + WARN_ON_FPU(err); +} + +/** + * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from + * an xsave area + * @xstate: A pointer to an xsave area + * @mask: Represent the dynamic supervisor features restored from the xsave area + * + * Only the dynamic supervisor states sets in the mask are restored from the + * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of + * dynamic supervisor feature). Besides the dynamic supervisor states, the + * legacy region and XSAVE header are also restored from the xsave area. The + * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored. + * + * The xsave area must be 64-bytes aligned. + */ +void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask) +{ + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u32 lmask, hmask; + int err; + + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (WARN_ON_FPU(!dynamic_mask)) + return; + + lmask = dynamic_mask; + hmask = dynamic_mask >> 32; + + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + + /* Should never fault when copying from a kernel buffer */ + WARN_ON_FPU(err); +} + +#ifdef CONFIG_PROC_PID_ARCH_STATUS +/* + * Report the amount of time elapsed in millisecond since last AVX512 + * use in the task. + */ +static void avx512_status(struct seq_file *m, struct task_struct *task) +{ + unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp); + long delta; + + if (!timestamp) { + /* + * Report -1 if no AVX512 usage + */ + delta = -1; + } else { + delta = (long)(jiffies - timestamp); + /* + * Cap to LONG_MAX if time difference > LONG_MAX + */ + if (delta < 0) + delta = LONG_MAX; + delta = jiffies_to_msecs(delta); + } + + seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta); + seq_putc(m, '\n'); +} + +/* + * Report architecture specific information + */ +int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + /* + * Report AVX512 state if the processor and build option supported. + */ + if (cpu_feature_enabled(X86_FEATURE_AVX512F)) + avx512_status(m, task); + + return 0; +} +#endif /* CONFIG_PROC_PID_ARCH_STATUS */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c new file mode 100644 index 000000000..6d546f442 --- /dev/null +++ b/arch/x86/kernel/ftrace.c @@ -0,0 +1,689 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Dynamic function tracing support. + * + * Copyright (C) 2007-2008 Steven Rostedt + * + * Thanks goes to Ingo Molnar, for suggesting the idea. + * Mathieu Desnoyers, for suggesting postponing the modifications. + * Arjan van de Ven, for keeping me straight, and explaining to me + * the dangers of modifying code on the run. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_DYNAMIC_FTRACE + +static int ftrace_poke_late = 0; + +int ftrace_arch_code_modify_prepare(void) + __acquires(&text_mutex) +{ + /* + * Need to grab text_mutex to prevent a race from module loading + * and live kernel patching from changing the text permissions while + * ftrace has it set to "read/write". + */ + mutex_lock(&text_mutex); + ftrace_poke_late = 1; + return 0; +} + +int ftrace_arch_code_modify_post_process(void) + __releases(&text_mutex) +{ + /* + * ftrace_make_{call,nop}() may be called during + * module load, and we need to finish the text_poke_queue() + * that they do, here. + */ + text_poke_finish(); + ftrace_poke_late = 0; + mutex_unlock(&text_mutex); + return 0; +} + +static const char *ftrace_nop_replace(void) +{ + return ideal_nops[NOP_ATOMIC5]; +} + +static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr); +} + +static int ftrace_verify_code(unsigned long ip, const char *old_code) +{ + char cur_code[MCOUNT_INSN_SIZE]; + + /* + * Note: + * We are paranoid about modifying text, as if a bug was to happen, it + * could cause us to read or write to someplace that could cause harm. + * Carefully read and modify the code with probe_kernel_*(), and make + * sure what we read is what we expected it to be before modifying it. + */ + /* read the text we want to modify */ + if (copy_from_kernel_nofault(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) { + WARN_ON(1); + return -EFAULT; + } + + /* Make sure it is what we expect it to be */ + if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) { + ftrace_expected = old_code; + WARN_ON(1); + return -EINVAL; + } + + return 0; +} + +/* + * Marked __ref because it calls text_poke_early() which is .init.text. That is + * ok because that call will happen early, during boot, when .init sections are + * still present. + */ +static int __ref +ftrace_modify_code_direct(unsigned long ip, const char *old_code, + const char *new_code) +{ + int ret = ftrace_verify_code(ip, old_code); + if (ret) + return ret; + + /* replace the text with the new text */ + if (ftrace_poke_late) + text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); + else + text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); + return 0; +} + +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + const char *new, *old; + + old = ftrace_call_replace(ip, addr); + new = ftrace_nop_replace(); + + /* + * On boot up, and when modules are loaded, the MCOUNT_ADDR + * is converted to a nop, and will never become MCOUNT_ADDR + * again. This code is either running before SMP (on boot up) + * or before the code will ever be executed (module load). + * We do not want to use the breakpoint version in this case, + * just modify the code directly. + */ + if (addr == MCOUNT_ADDR) + return ftrace_modify_code_direct(ip, old, new); + + /* + * x86 overrides ftrace_replace_code -- this function will never be used + * in this case. + */ + WARN_ONCE(1, "invalid use of ftrace_make_nop"); + return -EINVAL; +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + const char *new, *old; + + old = ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + + /* Should only be called when module is loaded */ + return ftrace_modify_code_direct(rec->ip, old, new); +} + +/* + * Should never be called: + * As it is only called by __ftrace_replace_code() which is called by + * ftrace_replace_code() that x86 overrides, and by ftrace_update_code() + * which is called to turn mcount into nops or nops into function calls + * but not to convert a function from not using regs to one that uses + * regs, which ftrace_modify_call() is for. + */ +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + WARN_ON(1); + return -EINVAL; +} + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip; + const char *new; + + ip = (unsigned long)(&ftrace_call); + new = ftrace_call_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + + ip = (unsigned long)(&ftrace_regs_call); + new = ftrace_call_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + + return 0; +} + +void ftrace_replace_code(int enable) +{ + struct ftrace_rec_iter *iter; + struct dyn_ftrace *rec; + const char *new, *old; + int ret; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + switch (ftrace_test_record(rec, enable)) { + case FTRACE_UPDATE_IGNORE: + default: + continue; + + case FTRACE_UPDATE_MAKE_CALL: + old = ftrace_nop_replace(); + break; + + case FTRACE_UPDATE_MODIFY_CALL: + case FTRACE_UPDATE_MAKE_NOP: + old = ftrace_call_replace(rec->ip, ftrace_get_addr_curr(rec)); + break; + } + + ret = ftrace_verify_code(rec->ip, old); + if (ret) { + ftrace_expected = old; + ftrace_bug(ret, rec); + ftrace_expected = NULL; + return; + } + } + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + switch (ftrace_test_record(rec, enable)) { + case FTRACE_UPDATE_IGNORE: + default: + continue; + + case FTRACE_UPDATE_MAKE_CALL: + case FTRACE_UPDATE_MODIFY_CALL: + new = ftrace_call_replace(rec->ip, ftrace_get_addr_new(rec)); + break; + + case FTRACE_UPDATE_MAKE_NOP: + new = ftrace_nop_replace(); + break; + } + + text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); + ftrace_update_record(rec, enable); + } + text_poke_finish(); +} + +void arch_ftrace_update_code(int command) +{ + ftrace_modify_all_code(command); +} + +int __init ftrace_dyn_arch_init(void) +{ + return 0; +} + +/* Currently only x86_64 supports dynamic trampolines */ +#ifdef CONFIG_X86_64 + +#ifdef CONFIG_MODULES +#include +/* Module allocation simplifies allocating memory for code */ +static inline void *alloc_tramp(unsigned long size) +{ + return module_alloc(size); +} +static inline void tramp_free(void *tramp) +{ + module_memfree(tramp); +} +#else +/* Trampolines can only be created if modules are supported */ +static inline void *alloc_tramp(unsigned long size) +{ + return NULL; +} +static inline void tramp_free(void *tramp) { } +#endif + +/* Defined as markers to the end of the ftrace default trampolines */ +extern void ftrace_regs_caller_end(void); +extern void ftrace_regs_caller_ret(void); +extern void ftrace_caller_end(void); +extern void ftrace_caller_op_ptr(void); +extern void ftrace_regs_caller_op_ptr(void); +extern void ftrace_regs_caller_jmp(void); + +/* movq function_trace_op(%rip), %rdx */ +/* 0x48 0x8b 0x15 */ +#define OP_REF_SIZE 7 + +/* + * The ftrace_ops is passed to the function callback. Since the + * trampoline only services a single ftrace_ops, we can pass in + * that ops directly. + * + * The ftrace_op_code_union is used to create a pointer to the + * ftrace_ops that will be passed to the callback function. + */ +union ftrace_op_code_union { + char code[OP_REF_SIZE]; + struct { + char op[3]; + int offset; + } __attribute__((packed)); +}; + +#define RET_SIZE (IS_ENABLED(CONFIG_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_SLS)) + +static unsigned long +create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) +{ + unsigned long start_offset; + unsigned long end_offset; + unsigned long op_offset; + unsigned long call_offset; + unsigned long jmp_offset; + unsigned long offset; + unsigned long npages; + unsigned long size; + unsigned long *ptr; + void *trampoline; + void *ip; + /* 48 8b 15 is movq (%rip), %rdx */ + unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; + unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; + union ftrace_op_code_union op_ptr; + int ret; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + start_offset = (unsigned long)ftrace_regs_caller; + end_offset = (unsigned long)ftrace_regs_caller_end; + op_offset = (unsigned long)ftrace_regs_caller_op_ptr; + call_offset = (unsigned long)ftrace_regs_call; + jmp_offset = (unsigned long)ftrace_regs_caller_jmp; + } else { + start_offset = (unsigned long)ftrace_caller; + end_offset = (unsigned long)ftrace_caller_end; + op_offset = (unsigned long)ftrace_caller_op_ptr; + call_offset = (unsigned long)ftrace_call; + jmp_offset = 0; + } + + size = end_offset - start_offset; + + /* + * Allocate enough size to store the ftrace_caller code, + * the iret , as well as the address of the ftrace_ops this + * trampoline is used for. + */ + trampoline = alloc_tramp(size + RET_SIZE + sizeof(void *)); + if (!trampoline) + return 0; + + *tramp_size = size + RET_SIZE + sizeof(void *); + npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); + + /* Copy ftrace_caller onto the trampoline memory */ + ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); + if (WARN_ON(ret < 0)) + goto fail; + + ip = trampoline + size; + + /* The trampoline ends with ret(q) */ + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + memcpy(ip, text_gen_insn(JMP32_INSN_OPCODE, ip, &__x86_return_thunk), JMP32_INSN_SIZE); + else + memcpy(ip, retq, sizeof(retq)); + + /* No need to test direct calls on created trampolines */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + /* NOP the jnz 1f; but make sure it's a 2 byte jnz */ + ip = trampoline + (jmp_offset - start_offset); + if (WARN_ON(*(char *)ip != 0x75)) + goto fail; + ret = copy_from_kernel_nofault(ip, ideal_nops[2], 2); + if (ret < 0) + goto fail; + } + + /* + * The address of the ftrace_ops that is used for this trampoline + * is stored at the end of the trampoline. This will be used to + * load the third parameter for the callback. Basically, that + * location at the end of the trampoline takes the place of + * the global function_trace_op variable. + */ + + ptr = (unsigned long *)(trampoline + size + RET_SIZE); + *ptr = (unsigned long)ops; + + op_offset -= start_offset; + memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); + + /* Are we pointing to the reference? */ + if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) + goto fail; + + /* Load the contents of ptr into the callback parameter */ + offset = (unsigned long)ptr; + offset -= (unsigned long)trampoline + op_offset + OP_REF_SIZE; + + op_ptr.offset = offset; + + /* put in the new offset to the ftrace_ops */ + memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + + /* put in the call to the function */ + mutex_lock(&text_mutex); + call_offset -= start_offset; + memcpy(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, + trampoline + call_offset, + ftrace_ops_get_func(ops)), CALL_INSN_SIZE); + mutex_unlock(&text_mutex); + + /* ALLOC_TRAMP flags lets us know we created it */ + ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; + + set_vm_flush_reset_perms(trampoline); + + if (likely(system_state != SYSTEM_BOOTING)) + set_memory_ro((unsigned long)trampoline, npages); + set_memory_x((unsigned long)trampoline, npages); + return (unsigned long)trampoline; +fail: + tramp_free(trampoline); + return 0; +} + +void set_ftrace_ops_ro(void) +{ + struct ftrace_ops *ops; + unsigned long start_offset; + unsigned long end_offset; + unsigned long npages; + unsigned long size; + + do_for_each_ftrace_op(ops, ftrace_ops_list) { + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + continue; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + start_offset = (unsigned long)ftrace_regs_caller; + end_offset = (unsigned long)ftrace_regs_caller_end; + } else { + start_offset = (unsigned long)ftrace_caller; + end_offset = (unsigned long)ftrace_caller_end; + } + size = end_offset - start_offset; + size = size + RET_SIZE + sizeof(void *); + npages = DIV_ROUND_UP(size, PAGE_SIZE); + set_memory_ro((unsigned long)ops->trampoline, npages); + } while_for_each_ftrace_op(ops); +} + +static unsigned long calc_trampoline_call_offset(bool save_regs) +{ + unsigned long start_offset; + unsigned long call_offset; + + if (save_regs) { + start_offset = (unsigned long)ftrace_regs_caller; + call_offset = (unsigned long)ftrace_regs_call; + } else { + start_offset = (unsigned long)ftrace_caller; + call_offset = (unsigned long)ftrace_call; + } + + return call_offset - start_offset; +} + +void arch_ftrace_update_trampoline(struct ftrace_ops *ops) +{ + ftrace_func_t func; + unsigned long offset; + unsigned long ip; + unsigned int size; + const char *new; + + if (!ops->trampoline) { + ops->trampoline = create_trampoline(ops, &size); + if (!ops->trampoline) + return; + ops->trampoline_size = size; + return; + } + + /* + * The ftrace_ops caller may set up its own trampoline. + * In such a case, this code must not modify it. + */ + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); + ip = ops->trampoline + offset; + func = ftrace_ops_get_func(ops); + + mutex_lock(&text_mutex); + /* Do a safe modify in case the trampoline is executing */ + new = ftrace_call_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + mutex_unlock(&text_mutex); +} + +/* Return the address of the function the trampoline calls */ +static void *addr_from_call(void *ptr) +{ + union text_poke_insn call; + int ret; + + ret = copy_from_kernel_nofault(&call, ptr, CALL_INSN_SIZE); + if (WARN_ON_ONCE(ret < 0)) + return NULL; + + /* Make sure this is a call */ + if (WARN_ON_ONCE(call.opcode != CALL_INSN_OPCODE)) { + pr_warn("Expected E8, got %x\n", call.opcode); + return NULL; + } + + return ptr + CALL_INSN_SIZE + call.disp; +} + +void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, + unsigned long frame_pointer); + +/* + * If the ops->trampoline was not allocated, then it probably + * has a static trampoline func, or is the ftrace caller itself. + */ +static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + unsigned long offset; + bool save_regs = rec->flags & FTRACE_FL_REGS_EN; + void *ptr; + + if (ops && ops->trampoline) { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* + * We only know about function graph tracer setting as static + * trampoline. + */ + if (ops->trampoline == FTRACE_GRAPH_ADDR) + return (void *)prepare_ftrace_return; +#endif + return NULL; + } + + offset = calc_trampoline_call_offset(save_regs); + + if (save_regs) + ptr = (void *)FTRACE_REGS_ADDR + offset; + else + ptr = (void *)FTRACE_ADDR + offset; + + return addr_from_call(ptr); +} + +void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + unsigned long offset; + + /* If we didn't allocate this trampoline, consider it static */ + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return static_tramp_func(ops, rec); + + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); + return addr_from_call((void *)ops->trampoline + offset); +} + +void arch_ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + + tramp_free((void *)ops->trampoline); + ops->trampoline = 0; +} + +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_graph_call(void); + +static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) +{ + return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr); +} + +static int ftrace_mod_jmp(unsigned long ip, void *func) +{ + const char *new; + + new = ftrace_jmp_replace(ip, (unsigned long)func); + text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + return 0; +} + +int ftrace_enable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + + return ftrace_mod_jmp(ip, &ftrace_graph_caller); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + + return ftrace_mod_jmp(ip, &ftrace_stub); +} + +#endif /* !CONFIG_DYNAMIC_FTRACE */ + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, + unsigned long frame_pointer) +{ + unsigned long return_hooker = (unsigned long)&return_to_handler; + unsigned long old; + int faulted; + + /* + * When resuming from suspend-to-ram, this function can be indirectly + * called from early CPU startup code while the CPU is in real mode, + * which would fail miserably. Make sure the stack pointer is a + * virtual address. + * + * This check isn't as accurate as virt_addr_valid(), but it should be + * good enough for this purpose, and it's fast. + */ + if (unlikely((long)__builtin_frame_address(0) >= 0)) + return; + + if (unlikely(ftrace_graph_is_dead())) + return; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection. + */ + asm volatile( + "1: " _ASM_MOV " (%[parent]), %[old]\n" + "2: " _ASM_MOV " %[return_hooker], (%[parent])\n" + " movl $0, %[faulted]\n" + "3:\n" + + ".section .fixup, \"ax\"\n" + "4: movl $1, %[faulted]\n" + " jmp 3b\n" + ".previous\n" + + _ASM_EXTABLE(1b, 4b) + _ASM_EXTABLE(2b, 4b) + + : [old] "=&r" (old), [faulted] "=r" (faulted) + : [parent] "r" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (unlikely(faulted)) { + ftrace_graph_stop(); + WARN_ON(1); + return; + } + + if (function_graph_enter(old, self_addr, frame_pointer, parent)) + *parent = old; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S new file mode 100644 index 000000000..a0ed0e4a2 --- /dev/null +++ b/arch/x86/kernel/ftrace_32.S @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2017 Steven Rostedt, VMware Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_FRAME_POINTER +# define MCOUNT_FRAME 1 /* using frame = true */ +#else +# define MCOUNT_FRAME 0 /* using frame = false */ +#endif + +SYM_FUNC_START(__fentry__) + RET +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) + +SYM_CODE_START(ftrace_caller) + +#ifdef CONFIG_FRAME_POINTER + /* + * Frame pointers are of ip followed by bp. + * Since fentry is an immediate jump, we are left with + * parent-ip, function-ip. We need to add a frame with + * parent-ip followed by ebp. + */ + pushl 4(%esp) /* parent ip */ + pushl %ebp + movl %esp, %ebp + pushl 2*4(%esp) /* function ip */ + + /* For mcount, the function ip is directly above */ + pushl %ebp + movl %esp, %ebp +#endif + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + +#ifdef CONFIG_FRAME_POINTER + /* Load parent ebp into edx */ + movl 4*4(%esp), %edx +#else + /* There's no frame pointer, load the appropriate stack addr instead */ + lea 4*4(%esp), %edx +#endif + + movl (MCOUNT_FRAME+4)*4(%esp), %eax /* load the rip */ + /* Get the parent ip */ + movl 4(%edx), %edx /* edx has ebp */ + + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + addl $4, %esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax +#ifdef CONFIG_FRAME_POINTER + popl %ebp + addl $4,%esp /* skip function ip */ + popl %ebp /* this is the orig bp */ + addl $4, %esp /* skip parent ip */ +#endif +.Lftrace_ret: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +/* This is weak to keep gas from relaxing the jumps */ +SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) + RET +SYM_CODE_END(ftrace_caller) + +SYM_CODE_START(ftrace_regs_caller) + /* + * We're here from an mcount/fentry CALL, and the stack frame looks like: + * + * + * RET-IP + * + * The purpose of this function is to call out in an emulated INT3 + * environment with a stack frame like: + * + * + * gap / RET-IP + * gap + * gap + * gap + * pt_regs + * + * We do _NOT_ restore: ss, flags, cs, gs, fs, es, ds + */ + subl $3*4, %esp # RET-IP + 3 gaps + pushl %ss # ss + pushl %esp # points at ss + addl $5*4, (%esp) # make it point at + pushfl # flags + pushl $__KERNEL_CS # cs + pushl 7*4(%esp) # ip <- RET-IP + pushl $0 # orig_eax + + pushl %gs + pushl %fs + pushl %es + pushl %ds + + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + ENCODE_FRAME_POINTER + + movl PT_EIP(%esp), %eax # 1st argument: IP + subl $MCOUNT_INSN_SIZE, %eax + movl 21*4(%esp), %edx # 2nd argument: parent ip + movl function_trace_op, %ecx # 3rd argument: ftrace_pos + pushl %esp # 4th argument: pt_regs + +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) + call ftrace_stub + + addl $4, %esp # skip 4th argument + + /* place IP below the new SP */ + movl PT_OLDESP(%esp), %eax + movl PT_EIP(%esp), %ecx + movl %ecx, -4(%eax) + + /* place EAX below that */ + movl PT_EAX(%esp), %ecx + movl %ecx, -8(%eax) + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + + lea -8(%eax), %esp + popl %eax + + jmp .Lftrace_ret +SYM_CODE_END(ftrace_regs_caller) + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +SYM_CODE_START(ftrace_graph_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 3*4(%esp), %eax + /* Even with frame pointers, fentry doesn't have one here */ + lea 4*4(%esp), %edx + movl $0, %ecx + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + RET +SYM_CODE_END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx + movl $0, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + JMP_NOSPEC ecx +#endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S new file mode 100644 index 000000000..5b2dabedc --- /dev/null +++ b/arch/x86/kernel/ftrace_64.S @@ -0,0 +1,354 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2014 Steven Rostedt, Red Hat Inc + */ + +#include +#include +#include +#include +#include +#include +#include + + .code64 + .section .text, "ax" + +#ifdef CONFIG_FRAME_POINTER +/* Save parent and function stack frames (rip and rbp) */ +# define MCOUNT_FRAME_SIZE (8+16*2) +#else +/* No need to save a stack frame */ +# define MCOUNT_FRAME_SIZE 0 +#endif /* CONFIG_FRAME_POINTER */ + +/* Size of stack used to save mcount regs in save_mcount_regs */ +#define MCOUNT_REG_SIZE (FRAME_SIZE + MCOUNT_FRAME_SIZE) + +/* + * gcc -pg option adds a call to 'mcount' in most functions. + * When -mfentry is used, the call is to 'fentry' and not 'mcount' + * and is done before the function's stack frame is set up. + * They both require a set of regs to be saved before calling + * any C code and restored before returning back to the function. + * + * On boot up, all these calls are converted into nops. When tracing + * is enabled, the call can jump to either ftrace_caller or + * ftrace_regs_caller. Callbacks (tracing functions) that require + * ftrace_regs_caller (like kprobes) need to have pt_regs passed to + * it. For this reason, the size of the pt_regs structure will be + * allocated on the stack and the required mcount registers will + * be saved in the locations that pt_regs has them in. + */ + +/* + * @added: the amount of stack added before calling this + * + * After this is called, the following registers contain: + * + * %rdi - holds the address that called the trampoline + * %rsi - holds the parent function (traced function's return address) + * %rdx - holds the original %rbp + */ +.macro save_mcount_regs added=0 + +#ifdef CONFIG_FRAME_POINTER + /* Save the original rbp */ + pushq %rbp + + /* + * Stack traces will stop at the ftrace trampoline if the frame pointer + * is not set up properly. If fentry is used, we need to save a frame + * pointer for the parent as well as the function traced, because the + * fentry is called before the stack frame is set up, where as mcount + * is called afterward. + */ + + /* Save the parent pointer (skip orig rbp and our return address) */ + pushq \added+8*2(%rsp) + pushq %rbp + movq %rsp, %rbp + /* Save the return address (now skip orig rbp, rbp and parent) */ + pushq \added+8*3(%rsp) + pushq %rbp + movq %rsp, %rbp +#endif /* CONFIG_FRAME_POINTER */ + + /* + * We add enough stack to save all regs. + */ + subq $(FRAME_SIZE), %rsp + movq %rax, RAX(%rsp) + movq %rcx, RCX(%rsp) + movq %rdx, RDX(%rsp) + movq %rsi, RSI(%rsp) + movq %rdi, RDI(%rsp) + movq %r8, R8(%rsp) + movq %r9, R9(%rsp) + movq $0, ORIG_RAX(%rsp) + /* + * Save the original RBP. Even though the mcount ABI does not + * require this, it helps out callers. + */ +#ifdef CONFIG_FRAME_POINTER + movq MCOUNT_REG_SIZE-8(%rsp), %rdx +#else + movq %rbp, %rdx +#endif + movq %rdx, RBP(%rsp) + + /* Copy the parent address into %rsi (second parameter) */ + movq MCOUNT_REG_SIZE+8+\added(%rsp), %rsi + + /* Move RIP to its proper location */ + movq MCOUNT_REG_SIZE+\added(%rsp), %rdi + movq %rdi, RIP(%rsp) + + /* + * Now %rdi (the first parameter) has the return address of + * where ftrace_call returns. But the callbacks expect the + * address of the call itself. + */ + subq $MCOUNT_INSN_SIZE, %rdi + .endm + +.macro restore_mcount_regs save=0 + + /* ftrace_regs_caller or frame pointers require this */ + movq RBP(%rsp), %rbp + + movq R9(%rsp), %r9 + movq R8(%rsp), %r8 + movq RDI(%rsp), %rdi + movq RSI(%rsp), %rsi + movq RDX(%rsp), %rdx + movq RCX(%rsp), %rcx + movq RAX(%rsp), %rax + + addq $MCOUNT_REG_SIZE-\save, %rsp + + .endm + +#ifdef CONFIG_DYNAMIC_FTRACE + +SYM_FUNC_START(__fentry__) + RET +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) + +SYM_FUNC_START(ftrace_caller) + /* save_mcount_regs fills in first two parameters */ + save_mcount_regs + +SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) + /* Load the ftrace_ops into the 3rd parameter */ + movq function_trace_op(%rip), %rdx + + /* regs go into 4th parameter (but make it NULL) */ + movq $0, %rcx + +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) + call ftrace_stub + + restore_mcount_regs + + /* + * The code up to this label is copied into trampolines so + * think twice before adding any new code or changing the + * layout here. + */ +SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) + + jmp ftrace_epilogue +SYM_FUNC_END(ftrace_caller); + +SYM_FUNC_START(ftrace_epilogue) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) + jmp ftrace_stub +#endif + +/* + * This is weak to keep gas from relaxing the jumps. + */ +SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) + UNWIND_HINT_FUNC + RET +SYM_FUNC_END(ftrace_epilogue) + +SYM_FUNC_START(ftrace_regs_caller) + /* Save the current flags before any operations that can change them */ + pushfq + + /* added 8 bytes to save flags */ + save_mcount_regs 8 + /* save_mcount_regs fills in first two parameters */ + +SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) + /* Load the ftrace_ops into the 3rd parameter */ + movq function_trace_op(%rip), %rdx + + /* Save the rest of pt_regs */ + movq %r15, R15(%rsp) + movq %r14, R14(%rsp) + movq %r13, R13(%rsp) + movq %r12, R12(%rsp) + movq %r11, R11(%rsp) + movq %r10, R10(%rsp) + movq %rbx, RBX(%rsp) + /* Copy saved flags */ + movq MCOUNT_REG_SIZE(%rsp), %rcx + movq %rcx, EFLAGS(%rsp) + /* Kernel segments */ + movq $__KERNEL_DS, %rcx + movq %rcx, SS(%rsp) + movq $__KERNEL_CS, %rcx + movq %rcx, CS(%rsp) + /* Stack - skipping return address and flags */ + leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx + movq %rcx, RSP(%rsp) + + ENCODE_FRAME_POINTER + + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + +SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) + call ftrace_stub + + /* Copy flags back to SS, to restore them */ + movq EFLAGS(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE(%rsp) + + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE+8(%rsp) + + /* restore the rest of pt_regs */ + movq R15(%rsp), %r15 + movq R14(%rsp), %r14 + movq R13(%rsp), %r13 + movq R12(%rsp), %r12 + movq R10(%rsp), %r10 + movq RBX(%rsp), %rbx + + movq ORIG_RAX(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE-8(%rsp) + + /* + * If ORIG_RAX is anything but zero, make this a call to that. + * See arch_ftrace_set_direct_caller(). + */ + movq ORIG_RAX(%rsp), %rax + testq %rax, %rax +SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) + jnz 1f + + restore_mcount_regs + /* Restore flags */ + popfq + + /* + * As this jmp to ftrace_epilogue can be a short jump + * it must not be copied into the trampoline. + * The trampoline will add the code to jump + * to the return. + */ +SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) + jmp ftrace_epilogue + + /* Swap the flags with orig_rax */ +1: movq MCOUNT_REG_SIZE(%rsp), %rdi + movq %rdi, MCOUNT_REG_SIZE-8(%rsp) + movq %rax, MCOUNT_REG_SIZE(%rsp) + + restore_mcount_regs 8 + /* Restore flags */ + popfq + UNWIND_HINT_FUNC + jmp ftrace_epilogue + +SYM_FUNC_END(ftrace_regs_caller) + + +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +SYM_FUNC_START(__fentry__) + cmpq $ftrace_stub, ftrace_trace_function + jnz trace + +fgraph_trace: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif + +SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL) + RET + +trace: + /* save_mcount_regs fills in first two parameters */ + save_mcount_regs + + /* + * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not + * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the + * ip and parent ip are used and the list function is called when + * function tracing is enabled. + */ + movq ftrace_trace_function, %r8 + CALL_NOSPEC r8 + restore_mcount_regs + + jmp fgraph_trace +SYM_FUNC_END(__fentry__) +EXPORT_SYMBOL(__fentry__) +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +SYM_FUNC_START(ftrace_graph_caller) + /* Saves rbp into %rdx and fills first parameter */ + save_mcount_regs + + leaq MCOUNT_REG_SIZE+8(%rsp), %rsi + movq $0, %rdx /* No framepointers needed */ + call prepare_ftrace_return + + restore_mcount_regs + + RET +SYM_FUNC_END(ftrace_graph_caller) + +SYM_CODE_START(return_to_handler) + UNWIND_HINT_EMPTY + subq $16, %rsp + + /* Save the return values */ + movq %rax, (%rsp) + movq %rdx, 8(%rsp) + movq %rbp, %rdi + + call ftrace_return_to_handler + + movq %rax, %rdi + movq 8(%rsp), %rdx + movq (%rsp), %rax + + addq $16, %rsp + /* + * Jump back to the old return address. This cannot be JMP_NOSPEC rdi + * since IBT would demand that contain ENDBR, which simply isn't so for + * return addresses. Use a retpoline here to keep the RSB balanced. + */ + ANNOTATE_INTRA_FUNCTION_CALL + call .Ldo_rop + int3 +.Ldo_rop: + mov %rdi, (%rsp) + UNWIND_HINT_FUNC + RET +SYM_CODE_END(return_to_handler) +#endif diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c new file mode 100644 index 000000000..ec6fefbfd --- /dev/null +++ b/arch/x86/kernel/head32.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/arch/i386/kernel/head32.c -- prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Copyright (C) 2007 Eric Biederman + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void __init i386_default_early_setup(void) +{ + /* Initialize 32bit specific setup functions */ + x86_init.resources.reserve_resources = i386_reserve_resources; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; +} + +asmlinkage __visible void __init i386_start_kernel(void) +{ + /* Make sure IDT is set up before any exception happens */ + idt_setup_early_handler(); + + cr4_init_shadow(); + + sanitize_boot_params(&boot_params); + + x86_early_init_platform_quirks(); + + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_INTEL_MID: + x86_intel_mid_early_setup(); + break; + case X86_SUBARCH_CE4100: + x86_ce4100_early_setup(); + break; + default: + i386_default_early_setup(); + break; + } + + start_kernel(); +} + +/* + * Initialize page tables. This creates a PDE and a set of page + * tables, which are located immediately beyond __brk_base. The variable + * _brk_end is set up to point to the first "safe" location. + * Mappings are created both at virtual address 0 (identity mapping) + * and PAGE_OFFSET for up to _end. + * + * In PAE mode initial_page_table is statically defined to contain + * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD entries + * to the first kernel PMD. Note the upper half of each PMD or PTE are + * always zero at this stage. + */ +void __init mk_early_pgtbl_32(void) +{ +#ifdef __pa +#undef __pa +#endif +#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) + pte_t pte, *ptep; + int i; + unsigned long *ptr; + /* Enough space to fit pagetables for the low memory linear map */ + const unsigned long limit = __pa(_end) + + (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT); +#ifdef CONFIG_X86_PAE + pmd_t pl2, *pl2p = (pmd_t *)__pa(initial_pg_pmd); +#define SET_PL2(pl2, val) { (pl2).pmd = (val); } +#else + pgd_t pl2, *pl2p = (pgd_t *)__pa(initial_page_table); +#define SET_PL2(pl2, val) { (pl2).pgd = (val); } +#endif + + ptep = (pte_t *)__pa(__brk_base); + pte.pte = PTE_IDENT_ATTR; + + while ((pte.pte & PTE_PFN_MASK) < limit) { + + SET_PL2(pl2, (unsigned long)ptep | PDE_IDENT_ATTR); + *pl2p = pl2; +#ifndef CONFIG_X86_PAE + /* Kernel PDE entry */ + *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2; +#endif + for (i = 0; i < PTRS_PER_PTE; i++) { + *ptep = pte; + pte.pte += PAGE_SIZE; + ptep++; + } + + pl2p++; + } + + ptr = (unsigned long *)__pa(&max_pfn_mapped); + /* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */ + *ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; + + ptr = (unsigned long *)__pa(&_brk_end); + *ptr = (unsigned long)ptep + PAGE_OFFSET; +} + diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c new file mode 100644 index 000000000..8596b4dca --- /dev/null +++ b/arch/x86/kernel/head64.c @@ -0,0 +1,607 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + */ + +#define DISABLE_BRANCH_PROFILING + +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Manage page tables very early on. + */ +extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; +static unsigned int __initdata next_early_pgt; +pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); + +#ifdef CONFIG_X86_5LEVEL +unsigned int __pgtable_l5_enabled __ro_after_init; +unsigned int pgdir_shift __ro_after_init = 39; +EXPORT_SYMBOL(pgdir_shift); +unsigned int ptrs_per_p4d __ro_after_init = 1; +EXPORT_SYMBOL(ptrs_per_p4d); +#endif + +#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT +unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; +EXPORT_SYMBOL(page_offset_base); +unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4; +EXPORT_SYMBOL(vmalloc_base); +unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; +EXPORT_SYMBOL(vmemmap_base); +#endif + +/* + * GDT used on the boot CPU before switching to virtual addresses. + */ +static struct desc_struct startup_gdt[GDT_ENTRIES] = { + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), +}; + +/* + * Address needs to be set at runtime because it references the startup_gdt + * while the kernel still uses a direct mapping. + */ +static struct desc_ptr startup_gdt_descr = { + .size = sizeof(startup_gdt)-1, + .address = 0, +}; + +#define __head __section(".head.text") + +static void __head *fixup_pointer(void *ptr, unsigned long physaddr) +{ + return ptr - (void *)_text + (void *)physaddr; +} + +static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr) +{ + return fixup_pointer(ptr, physaddr); +} + +#ifdef CONFIG_X86_5LEVEL +static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) +{ + return fixup_pointer(ptr, physaddr); +} + +static bool __head check_la57_support(unsigned long physaddr) +{ + /* + * 5-level paging is detected and enabled at kernel decomression + * stage. Only check if it has been enabled there. + */ + if (!(native_read_cr4() & X86_CR4_LA57)) + return false; + + *fixup_int(&__pgtable_l5_enabled, physaddr) = 1; + *fixup_int(&pgdir_shift, physaddr) = 48; + *fixup_int(&ptrs_per_p4d, physaddr) = 512; + *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5; + *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5; + *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5; + + return true; +} +#else +static bool __head check_la57_support(unsigned long physaddr) +{ + return false; +} +#endif + +/* Code in __startup_64() can be relocated during execution, but the compiler + * doesn't have to generate PC-relative relocations when accessing globals from + * that function. Clang actually does not generate them, which leads to + * boot-time crashes. To work around this problem, every global pointer must + * be adjusted using fixup_pointer(). + */ +unsigned long __head __startup_64(unsigned long physaddr, + struct boot_params *bp) +{ + unsigned long vaddr, vaddr_end; + unsigned long load_delta, *p; + unsigned long pgtable_flags; + pgdval_t *pgd; + p4dval_t *p4d; + pudval_t *pud; + pmdval_t *pmd, pmd_entry; + pteval_t *mask_ptr; + bool la57; + int i; + unsigned int *next_pgt_ptr; + + la57 = check_la57_support(physaddr); + + /* Is the address too large? */ + if (physaddr >> MAX_PHYSMEM_BITS) + for (;;); + + /* + * Compute the delta between the address I am compiled to run at + * and the address I am actually running at. + */ + load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); + + /* Is the address not 2M aligned? */ + if (load_delta & ~PMD_PAGE_MASK) + for (;;); + + /* Activate Secure Memory Encryption (SME) if supported and enabled */ + sme_enable(bp); + + /* Include the SME encryption mask in the fixup value */ + load_delta += sme_get_me_mask(); + + /* Fixup the physical addresses in the page table */ + + pgd = fixup_pointer(&early_top_pgt, physaddr); + p = pgd + pgd_index(__START_KERNEL_map); + if (la57) + *p = (unsigned long)level4_kernel_pgt; + else + *p = (unsigned long)level3_kernel_pgt; + *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta; + + if (la57) { + p4d = fixup_pointer(&level4_kernel_pgt, physaddr); + p4d[511] += load_delta; + } + + pud = fixup_pointer(&level3_kernel_pgt, physaddr); + pud[510] += load_delta; + pud[511] += load_delta; + + pmd = fixup_pointer(level2_fixmap_pgt, physaddr); + for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) + pmd[i] += load_delta; + + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ + + next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); + pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); + + if (la57) { + p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], + physaddr); + + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; + + i = physaddr >> P4D_SHIFT; + p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; + p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; + } else { + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; + } + + i = physaddr >> PUD_SHIFT; + pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; + pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; + + pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + /* Filter out unsupported __PAGE_KERNEL_* bits: */ + mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr); + pmd_entry &= *mask_ptr; + pmd_entry += sme_get_me_mask(); + pmd_entry += physaddr; + + for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { + int idx = i + (physaddr >> PMD_SHIFT); + + pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; + } + + /* + * Fixup the kernel text+data virtual addresses. Note that + * we might write invalid pmds, when the kernel is relocated + * cleanup_highmap() fixes this up along with the mappings + * beyond _end. + * + * Only the region occupied by the kernel image has so far + * been checked against the table of usable memory regions + * provided by the firmware, so invalidate pages outside that + * region. A page table entry that maps to a reserved area of + * memory would allow processor speculation into that area, + * and on some hardware (particularly the UV platform) even + * speculative access to some reserved areas is caught as an + * error, causing the BIOS to halt the system. + */ + + pmd = fixup_pointer(level2_kernel_pgt, physaddr); + + /* invalidate pages before the kernel image */ + for (i = 0; i < pmd_index((unsigned long)_text); i++) + pmd[i] &= ~_PAGE_PRESENT; + + /* fixup pages that are part of the kernel image */ + for (; i <= pmd_index((unsigned long)_end); i++) + if (pmd[i] & _PAGE_PRESENT) + pmd[i] += load_delta; + + /* invalidate pages after the kernel image */ + for (; i < PTRS_PER_PMD; i++) + pmd[i] &= ~_PAGE_PRESENT; + + /* + * Fixup phys_base - remove the memory encryption mask to obtain + * the true physical address. + */ + *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask(); + + /* Encrypt the kernel and related (if SME is active) */ + sme_encrypt_kernel(bp); + + /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (mem_encrypt_active()) { + vaddr = (unsigned long)__start_bss_decrypted; + vaddr_end = (unsigned long)__end_bss_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + } + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +unsigned long __startup_secondary_64(void) +{ + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +/* Wipe all early page tables except for the kernel symbol map */ +static void __init reset_early_page_tables(void) +{ + memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); + next_early_pgt = 0; + write_cr3(__sme_pa_nodebug(early_top_pgt)); +} + +/* Create a new PMD entry */ +bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + pgdval_t pgd, *pgd_p; + p4dval_t p4d, *p4d_p; + pudval_t pud, *pud_p; + pmdval_t *pmd_p; + + /* Invalid address or early pgt is done ? */ + if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) + return false; + +again: + pgd_p = &early_top_pgt[pgd_index(address)].pgd; + pgd = *pgd_p; + + /* + * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is + * critical -- __PAGE_OFFSET would point us back into the dynamic + * range and we might end up looping forever... + */ + if (!pgtable_l5_enabled()) + p4d_p = pgd_p; + else if (pgd) + p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++]; + memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); + *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + p4d_p += p4d_index(address); + p4d = *p4d_p; + + if (p4d) + pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + pud_p += pud_index(address); + pud = *pud_p; + + if (pud) + pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); + *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + pmd_p[pmd_index(address)] = pmd; + + return true; +} + +static bool __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + pmdval_t pmd; + + pmd = (physaddr & PMD_MASK) + early_pmd_flags; + + return __early_make_pgtable(address, pmd); +} + +void __init do_early_exception(struct pt_regs *regs, int trapnr) +{ + if (trapnr == X86_TRAP_PF && + early_make_pgtable(native_read_cr2())) + return; + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT) && + trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs)) + return; + + early_fixup_exception(regs, trapnr); +} + +/* Don't add a printk in there. printk relies on the PDA which is not initialized + yet. */ +static void __init clear_bss(void) +{ + memset(__bss_start, 0, + (unsigned long) __bss_stop - (unsigned long) __bss_start); + memset(__brk_base, 0, + (unsigned long) __brk_limit - (unsigned long) __brk_base); +} + +static unsigned long get_cmd_line_ptr(void) +{ + unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + + cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32; + + return cmd_line_ptr; +} + +static void __init copy_bootdata(char *real_mode_data) +{ + char * command_line; + unsigned long cmd_line_ptr; + + /* + * If SME is active, this will create decrypted mappings of the + * boot data in advance of the copy operations. + */ + sme_map_bootdata(real_mode_data); + + memcpy(&boot_params, real_mode_data, sizeof(boot_params)); + sanitize_boot_params(&boot_params); + cmd_line_ptr = get_cmd_line_ptr(); + if (cmd_line_ptr) { + command_line = __va(cmd_line_ptr); + memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); + } + + /* + * The old boot data is no longer needed and won't be reserved, + * freeing up that memory for use by the system. If SME is active, + * we need to remove the mappings that were created so that the + * memory doesn't remain mapped as decrypted. + */ + sme_unmap_bootdata(real_mode_data); +} + +asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) +{ + /* + * Build-time sanity checks on the kernel image and module + * area mappings. (these are purely build-time and produce no code) + */ + BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map); + BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE); + BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE); + BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0); + BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); + MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + (__START_KERNEL & PGDIR_MASK))); + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + + cr4_init_shadow(); + + /* Kill off the identity-map trampoline */ + reset_early_page_tables(); + + clear_bss(); + + clear_page(init_top_pgt); + + /* + * SME support may update early_pmd_flags to include the memory + * encryption mask, so it needs to be called before anything + * that may generate a page fault. + */ + sme_early_init(); + + kasan_early_init(); + + idt_setup_early_handler(); + + copy_bootdata(__va(real_mode_data)); + + /* + * Load microcode early on BSP. + */ + load_ucode_bsp(); + + /* set init_top_pgt kernel high mapping*/ + init_top_pgt[511] = early_top_pgt[511]; + + x86_64_start_reservations(real_mode_data); +} + +void __init x86_64_start_reservations(char *real_mode_data) +{ + /* version is always not zero if it is copied */ + if (!boot_params.hdr.version) + copy_bootdata(__va(real_mode_data)); + + x86_early_init_platform_quirks(); + + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_INTEL_MID: + x86_intel_mid_early_setup(); + break; + default: + break; + } + + start_kernel(); +} + +/* + * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is + * used until the idt_table takes over. On the boot CPU this happens in + * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases + * this happens in the functions called from head_64.S. + * + * The idt_table can't be used that early because all the code modifying it is + * in idt.c and can be instrumented by tracing or KASAN, which both don't work + * during early CPU bringup. Also the idt_table has the runtime vectors + * configured which require certain CPU state to be setup already (like TSS), + * which also hasn't happened yet in early CPU bringup. + */ +static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; + +static struct desc_ptr bringup_idt_descr = { + .size = (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1, + .address = 0, /* Set at runtime */ +}; + +static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler) +{ +#ifdef CONFIG_AMD_MEM_ENCRYPT + struct idt_data data; + gate_desc desc; + + init_idt_data(&data, n, handler); + idt_init_desc(&desc, &data); + native_write_idt_entry(idt, n, &desc); +#endif +} + +/* This runs while still in the direct mapping */ +static void startup_64_load_idt(unsigned long physbase) +{ + struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase); + gate_desc *idt = fixup_pointer(bringup_idt_table, physbase); + + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + void *handler; + + /* VMM Communication Exception */ + handler = fixup_pointer(vc_no_ghcb, physbase); + set_bringup_idt_handler(idt, X86_TRAP_VC, handler); + } + + desc->address = (unsigned long)idt; + native_load_idt(desc); +} + +/* This is used when running on kernel addresses */ +void early_setup_idt(void) +{ + /* VMM Communication Exception */ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + + bringup_idt_descr.address = (unsigned long)bringup_idt_table; + native_load_idt(&bringup_idt_descr); +} + +/* + * Setup boot CPU state needed before kernel switches to virtual addresses. + */ +void __head startup_64_setup_env(unsigned long physbase) +{ + /* Load GDT */ + startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase); + native_load_gdt(&startup_gdt_descr); + + /* New GDT is live - reload data segment registers */ + asm volatile("movl %%eax, %%ds\n" + "movl %%eax, %%ss\n" + "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); + + startup_64_load_idt(physbase); +} diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S new file mode 100644 index 000000000..3f1691b89 --- /dev/null +++ b/arch/x86/kernel/head_32.S @@ -0,0 +1,597 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Enhanced CPU detection and feature setting code by Mike Jagdis + * and Martin Mares, November 1997. + */ + +.text +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Physical address */ +#define pa(X) ((X) - __PAGE_OFFSET) + +/* + * References to members of the new_cpu_data structure. + */ + +#define X86 new_cpu_data+CPUINFO_x86 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor +#define X86_MODEL new_cpu_data+CPUINFO_x86_model +#define X86_STEPPING new_cpu_data+CPUINFO_x86_stepping +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id + + +#define SIZEOF_PTREGS 17*4 + +/* + * Worst-case size of the kernel mapping we need to make: + * a relocatable kernel can live anywhere in lowmem, so we need to be able + * to map all of lowmem. + */ +KERNEL_PAGES = LOWMEM_PAGES + +INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE +RESERVE_BRK(pagetables, INIT_MAP_SIZE) + +/* + * 32-bit kernel entrypoint; only used by the boot CPU. On entry, + * %esi points to the real-mode code as a 32-bit pointer. + * CS and DS must be 4 GB flat segments, but we don't depend on + * any particular GDT layout, because we load our own as soon as we + * can. + */ +__HEAD +SYM_CODE_START(startup_32) + movl pa(initial_stack),%ecx + +/* + * Set segments to known values. + */ + lgdt pa(boot_gdt_descr) + movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs + movl %eax,%gs + movl %eax,%ss + leal -__PAGE_OFFSET(%ecx),%esp + +/* + * Clear BSS first so that there are no surprises... + */ + cld + xorl %eax,%eax + movl $pa(__bss_start),%edi + movl $pa(__bss_stop),%ecx + subl %edi,%ecx + shrl $2,%ecx + rep ; stosl +/* + * Copy bootup parameters out of the way. + * Note: %esi still has the pointer to the real-mode data. + * With the kexec as boot loader, parameter segment might be loaded beyond + * kernel image and might not even be addressable by early boot page tables. + * (kexec on panic case). Hence copy out the parameters before initializing + * page tables. + */ + movl $pa(boot_params),%edi + movl $(PARAM_SIZE/4),%ecx + cld + rep + movsl + movl pa(boot_params) + NEW_CL_POINTER,%esi + andl %esi,%esi + jz 1f # No command line + movl $pa(boot_command_line),%edi + movl $(COMMAND_LINE_SIZE/4),%ecx + rep + movsl +1: + +#ifdef CONFIG_OLPC + /* save OFW's pgdir table for later use when calling into OFW */ + movl %cr3, %eax + movl %eax, pa(olpc_ofw_pgd) +#endif + +#ifdef CONFIG_MICROCODE + /* Early load ucode on BSP. */ + call load_ucode_bsp +#endif + + /* Create early pagetables. */ + call mk_early_pgtbl_32 + + /* Do early initialization of the fixmap area */ + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax +#ifdef CONFIG_X86_PAE +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ + movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) +#else + movl %eax,pa(initial_page_table+0xffc) +#endif + + jmp .Ldefault_entry +SYM_CODE_END(startup_32) + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Boot CPU0 entry point. It's called from play_dead(). Everything has been set + * up already except stack. We just set up stack here. Then call + * start_secondary(). + */ +SYM_FUNC_START(start_cpu0) + movl initial_stack, %ecx + movl %ecx, %esp + call *(initial_code) +1: jmp 1b +SYM_FUNC_END(start_cpu0) +#endif + +/* + * Non-boot CPU entry point; entered from trampoline.S + * We can't lgdt here, because lgdt itself uses a data segment, but + * we know the trampoline has already loaded the boot_gdt for us. + * + * If cpu hotplug is not supported then this code can go in init section + * which will be freed later + */ +SYM_FUNC_START(startup_32_smp) + cld + movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs + movl %eax,%gs + movl pa(initial_stack),%ecx + movl %eax,%ss + leal -__PAGE_OFFSET(%ecx),%esp + +#ifdef CONFIG_MICROCODE + /* Early load ucode on AP. */ + call load_ucode_ap +#endif + +.Ldefault_entry: + movl $(CR0_STATE & ~X86_CR0_PG),%eax + movl %eax,%cr0 + +/* + * We want to start out with EFLAGS unambiguously cleared. Some BIOSes leave + * bits like NT set. This would confuse the debugger if this code is traced. So + * initialize them properly now before switching to protected mode. That means + * DF in particular (even though we have cleared it earlier after copying the + * command line) because GCC expects it. + */ + pushl $0 + popfl + +/* + * New page tables may be in 4Mbyte page mode and may be using the global pages. + * + * NOTE! If we are on a 486 we may have no cr4 at all! Specifically, cr4 exists + * if and only if CPUID exists and has flags other than the FPU flag set. + */ + movl $-1,pa(X86_CPUID) # preset CPUID level + movl $X86_EFLAGS_ID,%ecx + pushl %ecx + popfl # set EFLAGS=ID + pushfl + popl %eax # get EFLAGS + testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set? + jz .Lenable_paging # hw disallowed setting of ID bit + # which means no CPUID and no CR4 + + xorl %eax,%eax + cpuid + movl %eax,pa(X86_CPUID) # save largest std CPUID function + + movl $1,%eax + cpuid + andl $~1,%edx # Ignore CPUID.FPU + jz .Lenable_paging # No flags or only CPUID.FPU = no CR4 + + movl pa(mmu_cr4_features),%eax + movl %eax,%cr4 + + testb $X86_CR4_PAE, %al # check if PAE is enabled + jz .Lenable_paging + + /* Check if extended functions are implemented */ + movl $0x80000000, %eax + cpuid + /* Value must be in the range 0x80000001 to 0x8000ffff */ + subl $0x80000001, %eax + cmpl $(0x8000ffff-0x80000001), %eax + ja .Lenable_paging + + /* Clear bogus XD_DISABLE bits */ + call verify_cpu + + mov $0x80000001, %eax + cpuid + /* Execute Disable bit supported? */ + btl $(X86_FEATURE_NX & 31), %edx + jnc .Lenable_paging + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + + btsl $_EFER_NX, %eax + /* Make changes effective */ + wrmsr + +.Lenable_paging: + +/* + * Enable paging + */ + movl $pa(initial_page_table), %eax + movl %eax,%cr3 /* set the page table pointer.. */ + movl $CR0_STATE,%eax + movl %eax,%cr0 /* ..and set paging (PG) bit */ + ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ +1: + /* Shift the stack pointer to a virtual address */ + addl $__PAGE_OFFSET, %esp + +/* + * start system 32-bit setup. We need to re-do some of the things done + * in 16-bit mode for the "real" operations. + */ + movl setup_once_ref,%eax + andl %eax,%eax + jz 1f # Did we do this already? + call *%eax +1: + +/* + * Check if it is 486 + */ + movb $4,X86 # at least 486 + cmpl $-1,X86_CPUID + je .Lis486 + + /* get vendor info */ + xorl %eax,%eax # call CPUID with 0 -> return vendor ID + cpuid + movl %eax,X86_CPUID # save CPUID level + movl %ebx,X86_VENDOR_ID # lo 4 chars + movl %edx,X86_VENDOR_ID+4 # next 4 chars + movl %ecx,X86_VENDOR_ID+8 # last 4 chars + + orl %eax,%eax # do we have processor info as well? + je .Lis486 + + movl $1,%eax # Use the CPUID instruction to get CPU type + cpuid + movb %al,%cl # save reg for future use + andb $0x0f,%ah # mask processor family + movb %ah,X86 + andb $0xf0,%al # mask model + shrb $4,%al + movb %al,X86_MODEL + andb $0x0f,%cl # mask mask revision + movb %cl,X86_STEPPING + movl %edx,X86_CAPABILITY + +.Lis486: + movl $0x50022,%ecx # set AM, WP, NE and MP + movl %cr0,%eax + andl $0x80000011,%eax # Save PG,PE,ET + orl %ecx,%eax + movl %eax,%cr0 + + lgdt early_gdt_descr + ljmp $(__KERNEL_CS),$1f +1: movl $(__KERNEL_DS),%eax # reload all the segment registers + movl %eax,%ss # after changing gdt. + + movl $(__USER_DS),%eax # DS/ES contains default USER segment + movl %eax,%ds + movl %eax,%es + + movl $(__KERNEL_PERCPU), %eax + movl %eax,%fs # set this cpu's percpu + + movl $(__KERNEL_STACK_CANARY),%eax + movl %eax,%gs + + xorl %eax,%eax # Clear LDT + lldt %ax + + call *(initial_code) +1: jmp 1b +SYM_FUNC_END(startup_32_smp) + +#include "verify_cpu.S" + +/* + * setup_once + * + * The setup work we only want to run on the BSP. + * + * Warning: %esi is live across this function. + */ +__INIT +setup_once: +#ifdef CONFIG_STACKPROTECTOR + /* + * Configure the stack canary. The linker can't handle this by + * relocation. Manually set base address in stack canary + * segment descriptor. + */ + movl $gdt_page,%eax + movl $stack_canary,%ecx + movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) + shrl $16, %ecx + movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) + movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) +#endif + + andl $0,setup_once_ref /* Once is enough, thanks */ + RET + +SYM_FUNC_START(early_idt_handler_array) + # 36(%esp) %eflags + # 32(%esp) %cs + # 28(%esp) %eip + # 24(%rsp) error code + i = 0 + .rept NUM_EXCEPTION_VECTORS + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 + pushl $0 # Dummy error code, to make stack frame uniform + .endif + pushl $i # 20(%esp) Vector number + jmp early_idt_handler_common + i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc + .endr +SYM_FUNC_END(early_idt_handler_array) + +SYM_CODE_START_LOCAL(early_idt_handler_common) + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ + cld + + incl %ss:early_recursion_flag + + /* The vector number is in pt_regs->gs */ + + cld + pushl %fs /* pt_regs->fs (__fsh varies by model) */ + pushl %es /* pt_regs->es (__esh varies by model) */ + pushl %ds /* pt_regs->ds (__dsh varies by model) */ + pushl %eax /* pt_regs->ax */ + pushl %ebp /* pt_regs->bp */ + pushl %edi /* pt_regs->di */ + pushl %esi /* pt_regs->si */ + pushl %edx /* pt_regs->dx */ + pushl %ecx /* pt_regs->cx */ + pushl %ebx /* pt_regs->bx */ + + /* Fix up DS and ES */ + movl $(__KERNEL_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + + /* Load the vector number into EDX */ + movl PT_GS(%esp), %edx + + /* Load GS into pt_regs->gs (and maybe clobber __gsh) */ + movw %gs, PT_GS(%esp) + + movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */ + call early_fixup_exception + + popl %ebx /* pt_regs->bx */ + popl %ecx /* pt_regs->cx */ + popl %edx /* pt_regs->dx */ + popl %esi /* pt_regs->si */ + popl %edi /* pt_regs->di */ + popl %ebp /* pt_regs->bp */ + popl %eax /* pt_regs->ax */ + popl %ds /* pt_regs->ds (always ignores __dsh) */ + popl %es /* pt_regs->es (always ignores __esh) */ + popl %fs /* pt_regs->fs (always ignores __fsh) */ + popl %gs /* pt_regs->gs (always ignores __gsh) */ + decl %ss:early_recursion_flag + addl $4, %esp /* pop pt_regs->orig_ax */ + iret +SYM_CODE_END(early_idt_handler_common) + +/* This is the default interrupt "handler" :-) */ +SYM_FUNC_START(early_ignore_irq) + cld +#ifdef CONFIG_PRINTK + pushl %eax + pushl %ecx + pushl %edx + pushl %es + pushl %ds + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es + cmpl $2,early_recursion_flag + je hlt_loop + incl early_recursion_flag + pushl 16(%esp) + pushl 24(%esp) + pushl 32(%esp) + pushl 40(%esp) + pushl $int_msg + call printk + + call dump_stack + + addl $(5*4),%esp + popl %ds + popl %es + popl %edx + popl %ecx + popl %eax +#endif + iret + +hlt_loop: + hlt + jmp hlt_loop +SYM_FUNC_END(early_ignore_irq) + +__INITDATA + .align 4 +SYM_DATA(early_recursion_flag, .long 0) + +__REFDATA + .align 4 +SYM_DATA(initial_code, .long i386_start_kernel) +SYM_DATA(setup_once_ref, .long setup_once) + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +#define PGD_ALIGN (2 * PAGE_SIZE) +#define PTI_USER_PGD_FILL 1024 +#else +#define PGD_ALIGN (PAGE_SIZE) +#define PTI_USER_PGD_FILL 0 +#endif +/* + * BSS section + */ +__PAGE_ALIGNED_BSS + .align PGD_ALIGN +#ifdef CONFIG_X86_PAE +.globl initial_pg_pmd +initial_pg_pmd: + .fill 1024*KPMDS,4,0 +#else +.globl initial_page_table +initial_page_table: + .fill 1024,4,0 +#endif + .align PGD_ALIGN +initial_pg_fixmap: + .fill 1024,4,0 +.globl swapper_pg_dir + .align PGD_ALIGN +swapper_pg_dir: + .fill 1024,4,0 + .fill PTI_USER_PGD_FILL,4,0 +.globl empty_zero_page +empty_zero_page: + .fill 4096,1,0 +EXPORT_SYMBOL(empty_zero_page) + +/* + * This starts the data section. + */ +#ifdef CONFIG_X86_PAE +__PAGE_ALIGNED_DATA + /* Page-aligned for the benefit of paravirt? */ + .align PGD_ALIGN +SYM_DATA_START(initial_page_table) + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ +# if KPMDS == 3 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0 +# elif KPMDS == 2 + .long 0,0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 +# elif KPMDS == 1 + .long 0,0 + .long 0,0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 +# else +# error "Kernel PMDs should be 1, 2 or 3" +# endif + .align PAGE_SIZE /* needs to be page-sized too */ + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * PTI needs another page so sync_initial_pagetable() works correctly + * and does not scribble over the data which is placed behind the + * actual initial_page_table. See clone_pgd_range(). + */ + .fill 1024, 4, 0 +#endif + +SYM_DATA_END(initial_page_table) +#endif + +.data +.balign 4 +/* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder + * reliably detect the end of the stack. + */ +SYM_DATA(initial_stack, + .long init_thread_union + THREAD_SIZE - + SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING) + +__INITRODATA +int_msg: + .asciz "Unknown interrupt or fault at: %p %p %p\n" + +#include "../../x86/xen/xen-head.S" + +/* + * The IDT and GDT 'descriptors' are a strange 48-bit object + * only used by the lidt and lgdt instructions. They are not + * like usual segment descriptors - they consist of a 16-bit + * segment size, and 32-bit linear address value: + */ + + .data + ALIGN +# early boot GDT descriptor (must use 1:1 address mapping) + .word 0 # 32 bit align gdt_desc.address +SYM_DATA_START_LOCAL(boot_gdt_descr) + .word __BOOT_DS+7 + .long boot_gdt - __PAGE_OFFSET +SYM_DATA_END(boot_gdt_descr) + +# boot GDT descriptor (later on used by CPU#0): + .word 0 # 32 bit align gdt_desc.address +SYM_DATA_START(early_gdt_descr) + .word GDT_ENTRIES*8-1 + .long gdt_page /* Overwritten for secondary CPUs */ +SYM_DATA_END(early_gdt_descr) + +/* + * The boot_gdt must mirror the equivalent in setup.S and is + * used only for booting. + */ + .align L1_CACHE_BYTES +SYM_DATA_START(boot_gdt) + .fill GDT_ENTRY_BOOT_CS,8,0 + .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ + .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ +SYM_DATA_END(boot_gdt) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S new file mode 100644 index 000000000..0424c2a6c --- /dev/null +++ b/arch/x86/kernel/head_64.S @@ -0,0 +1,603 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2000 Karsten Keil + * Copyright (C) 2001,2002 Andi Kleen + * Copyright (C) 2005 Eric Biederman + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../entry/calling.h" +#include +#include +#include + +#ifdef CONFIG_PARAVIRT_XXL +#include +#include +#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg +#else +#define INTERRUPT_RETURN iretq +#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg +#endif + +/* + * We are not able to switch in one step to the final KERNEL ADDRESS SPACE + * because we need identity-mapped pages. + */ +#define l4_index(x) (((x) >> 39) & 511) +#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) + +L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4) +L4_START_KERNEL = l4_index(__START_KERNEL_map) + +L3_START_KERNEL = pud_index(__START_KERNEL_map) + + .text + __HEAD + .code64 +SYM_CODE_START_NOALIGN(startup_64) + UNWIND_HINT_EMPTY + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, + * and someone has loaded an identity mapped page table + * for us. These identity mapped page tables map all of the + * kernel pages and possibly all of memory. + * + * %rsi holds a physical pointer to real_mode_data. + * + * We come here either directly from a 64bit bootloader, or from + * arch/x86/boot/compressed/head_64.S. + * + * We only come here initially at boot nothing else comes here. + * + * Since we may be loaded at an address different from what we were + * compiled to run at we first fixup the physical addresses in our page + * tables and then reload them. + */ + + /* Set up the stack for verify_cpu(), similar to initial_stack below */ + leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp + + leaq _text(%rip), %rdi + pushq %rsi + call startup_64_setup_env + popq %rsi + + /* Now switch to __KERNEL_CS so IRET works reliably */ + pushq $__KERNEL_CS + leaq .Lon_kernel_cs(%rip), %rax + pushq %rax + lretq + +.Lon_kernel_cs: + UNWIND_HINT_EMPTY + + /* Sanitize CPU configuration */ + call verify_cpu + + /* + * Perform pagetable fixups. Additionally, if SME is active, encrypt + * the kernel and retrieve the modifier (SME encryption mask if SME + * is active) to be added to the initial pgdir entry that will be + * programmed into CR3. + */ + leaq _text(%rip), %rdi + pushq %rsi + call __startup_64 + popq %rsi + + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(early_top_pgt - __START_KERNEL_map), %rax + jmp 1f +SYM_CODE_END(startup_64) + +SYM_CODE_START(secondary_startup_64) + UNWIND_HINT_EMPTY + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, + * and someone has loaded a mapped page table. + * + * %rsi holds a physical pointer to real_mode_data. + * + * We come here either from startup_64 (using physical addresses) + * or from trampoline.S (using virtual addresses). + * + * Using virtual addresses from trampoline.S removes the need + * to have any identity mapped pages in the kernel page table + * after the boot processor executes this code. + */ + + /* Sanitize CPU configuration */ + call verify_cpu + + /* + * The secondary_startup_64_no_verify entry point is only used by + * SEV-ES guests. In those guests the call to verify_cpu() would cause + * #VC exceptions which can not be handled at this stage of secondary + * CPU bringup. + * + * All non SEV-ES systems, especially Intel systems, need to execute + * verify_cpu() above to make sure NX is enabled. + */ +SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY + + /* + * Retrieve the modifier (SME encryption mask if SME is active) to be + * added to the initial pgdir entry that will be programmed into CR3. + */ + pushq %rsi + call __startup_secondary_64 + popq %rsi + + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(init_top_pgt - __START_KERNEL_map), %rax +1: + + /* Enable PAE mode, PGE and LA57 */ + movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx +#ifdef CONFIG_X86_5LEVEL + testl $1, __pgtable_l5_enabled(%rip) + jz 1f + orl $X86_CR4_LA57, %ecx +1: +#endif + movq %rcx, %cr4 + + /* Setup early boot stage 4-/5-level pagetables. */ + addq phys_base(%rip), %rax + + /* + * For SEV guests: Verify that the C-bit is correct. A malicious + * hypervisor could lie about the C-bit position to perform a ROP + * attack on the guest by writing to the unencrypted stack and wait for + * the next RET instruction. + * %rsi carries pointer to realmode data and is callee-clobbered. Save + * and restore it. + */ + pushq %rsi + movq %rax, %rdi + call sev_verify_cbit + popq %rsi + + /* Switch to new page-table */ + movq %rax, %cr3 + + /* Ensure I am executing from virtual addresses */ + movq $1f, %rax + ANNOTATE_RETPOLINE_SAFE + jmp *%rax +1: + UNWIND_HINT_EMPTY + + /* + * We must switch to a new descriptor in kernel space for the GDT + * because soon the kernel won't have access anymore to the userspace + * addresses where we're currently running on. We have to do that here + * because in 32bit we couldn't load a 64bit linear address. + */ + lgdt early_gdt_descr(%rip) + + /* set up data segments */ + xorl %eax,%eax + movl %eax,%ds + movl %eax,%ss + movl %eax,%es + + /* + * We don't really need to load %fs or %gs, but load them anyway + * to kill any stale realmode selectors. This allows execution + * under VT hardware. + */ + movl %eax,%fs + movl %eax,%gs + + /* Set up %gs. + * + * The base of %gs always points to fixed_percpu_data. If the + * stack protector canary is enabled, it is located at %gs:40. + * Note that, on SMP, the boot cpu uses init data section until + * the per cpu areas are set up. + */ + movl $MSR_GS_BASE,%ecx + movl initial_gs(%rip),%eax + movl initial_gs+4(%rip),%edx + wrmsr + + /* + * Setup a boot time stack - Any secondary CPU will have lost its stack + * by now because the cr3-switch above unmaps the real-mode stack + */ + movq initial_stack(%rip), %rsp + + /* Setup and Load IDT */ + pushq %rsi + call early_setup_idt + popq %rsi + + /* Check if nx is implemented */ + movl $0x80000001, %eax + cpuid + movl %edx,%edi + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_SCE, %eax /* Enable System Call */ + btl $20,%edi /* No Execute supported? */ + jnc 1f + btsl $_EFER_NX, %eax + btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) +1: wrmsr /* Make changes effective */ + + /* Setup cr0 */ + movl $CR0_STATE, %eax + /* Make changes effective */ + movq %rax, %cr0 + + /* zero EFLAGS after setting rsp */ + pushq $0 + popfq + + /* rsi is pointer to real mode structure with interesting info. + pass it to C */ + movq %rsi, %rdi + +.Ljump_to_C_code: + /* + * Jump to run C code and to be on a real kernel address. + * Since we are running on identity-mapped space we have to jump + * to the full 64bit address, this is only possible as indirect + * jump. In addition we need to ensure %cs is set so we make this + * a far return. + * + * Note: do not change to far jump indirect with 64bit offset. + * + * AMD does not support far jump indirect with 64bit offset. + * AMD64 Architecture Programmer's Manual, Volume 3: states only + * JMP FAR mem16:16 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * JMP FAR mem16:32 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * + * Intel64 does support 64bit offset. + * Software Developer Manual Vol 2: states: + * FF /5 JMP m16:16 Jump far, absolute indirect, + * address given in m16:16 + * FF /5 JMP m16:32 Jump far, absolute indirect, + * address given in m16:32. + * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, + * address given in m16:64. + */ + pushq $.Lafter_lret # put return address on stack for unwinder + xorl %ebp, %ebp # clear frame pointer + movq initial_code(%rip), %rax + pushq $__KERNEL_CS # set correct cs + pushq %rax # target address in negative space + lretq +.Lafter_lret: +SYM_CODE_END(secondary_startup_64) + +#include "verify_cpu.S" +#include "sev_verify_cbit.S" + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Boot CPU0 entry point. It's called from play_dead(). Everything has been set + * up already except stack. We just set up stack here. Then call + * start_secondary() via .Ljump_to_C_code. + */ +SYM_CODE_START(start_cpu0) + UNWIND_HINT_EMPTY + movq initial_stack(%rip), %rsp + jmp .Ljump_to_C_code +SYM_CODE_END(start_cpu0) +#endif + +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * VC Exception handler used during early boot when running on kernel + * addresses, but before the switch to the idt_table can be made. + * The early_idt_handler_array can't be used here because it calls into a lot + * of __init code and this handler is also used during CPU offlining/onlining. + * Therefore this handler ends up in the .text section so that it stays around + * when .init.text is freed. + */ +SYM_CODE_START_NOALIGN(vc_boot_ghcb) + UNWIND_HINT_IRET_REGS offset=8 + + ANNOTATE_UNRET_END + + /* Build pt_regs */ + PUSH_AND_CLEAR_REGS + + /* Call C handler */ + movq %rsp, %rdi + movq ORIG_RAX(%rsp), %rsi + movq initial_vc_handler(%rip), %rax + ANNOTATE_RETPOLINE_SAFE + call *%rax + + /* Unwind pt_regs */ + POP_REGS + + /* Remove Error Code */ + addq $8, %rsp + + /* Pure iret required here - don't use INTERRUPT_RETURN */ + iretq +SYM_CODE_END(vc_boot_ghcb) +#endif + + /* Both SMP bootup and ACPI suspend change these variables */ + __REFDATA + .balign 8 +SYM_DATA(initial_code, .quad x86_64_start_kernel) +SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data)) +#ifdef CONFIG_AMD_MEM_ENCRYPT +SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb) +#endif + +/* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder + * reliably detect the end of the stack. + */ +SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS) + __FINITDATA + + __INIT +SYM_CODE_START(early_idt_handler_array) + i = 0 + .rept NUM_EXCEPTION_VECTORS + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 + UNWIND_HINT_IRET_REGS + pushq $0 # Dummy error code, to make stack frame uniform + .else + UNWIND_HINT_IRET_REGS offset=8 + .endif + pushq $i # 72(%rsp) Vector number + jmp early_idt_handler_common + UNWIND_HINT_IRET_REGS + i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc + .endr + UNWIND_HINT_IRET_REGS offset=16 +SYM_CODE_END(early_idt_handler_array) + +SYM_CODE_START_LOCAL(early_idt_handler_common) + ANNOTATE_UNRET_END + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ + cld + + incl early_recursion_flag(%rip) + + /* The vector number is currently in the pt_regs->di slot. */ + pushq %rsi /* pt_regs->si */ + movq 8(%rsp), %rsi /* RSI = vector number */ + movq %rdi, 8(%rsp) /* pt_regs->di = RDI */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq %rax /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + pushq %rbx /* pt_regs->bx */ + pushq %rbp /* pt_regs->bp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS + + movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */ + call do_early_exception + + decl early_recursion_flag(%rip) + jmp restore_regs_and_return_to_kernel +SYM_CODE_END(early_idt_handler_common) + +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * VC Exception handler used during very early boot. The + * early_idt_handler_array can't be used because it returns via the + * paravirtualized INTERRUPT_RETURN and pv-ops don't work that early. + * + * This handler will end up in the .init.text section and not be + * available to boot secondary CPUs. + */ +SYM_CODE_START_NOALIGN(vc_no_ghcb) + UNWIND_HINT_IRET_REGS offset=8 + + ANNOTATE_UNRET_END + + /* Build pt_regs */ + PUSH_AND_CLEAR_REGS + + /* Call C handler */ + movq %rsp, %rdi + movq ORIG_RAX(%rsp), %rsi + call do_vc_no_ghcb + + /* Unwind pt_regs */ + POP_REGS + + /* Remove Error Code */ + addq $8, %rsp + + /* Pure iret required here - don't use INTERRUPT_RETURN */ + iretq +SYM_CODE_END(vc_no_ghcb) +#endif + +#define SYM_DATA_START_PAGE_ALIGNED(name) \ + SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. + * + * This ensures PGDs are 8k long: + */ +#define PTI_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ +#define SYM_DATA_START_PTI_ALIGNED(name) \ + SYM_START(name, SYM_L_GLOBAL, .balign 2 * PAGE_SIZE) +#else +#define SYM_DATA_START_PTI_ALIGNED(name) \ + SYM_DATA_START_PAGE_ALIGNED(name) +#define PTI_USER_PGD_FILL 0 +#endif + +/* Automate the creation of 1 to 1 mapping pmd entries */ +#define PMDS(START, PERM, COUNT) \ + i = 0 ; \ + .rept (COUNT) ; \ + .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ + i = i + 1 ; \ + .endr + + __INITDATA + .balign 4 + +SYM_DATA_START_PTI_ALIGNED(early_top_pgt) + .fill 512,8,0 + .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(early_top_pgt) + +SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 +SYM_DATA_END(early_dynamic_pgts) + +SYM_DATA(early_recursion_flag, .long 0) + + .data + +#if defined(CONFIG_XEN_PV) || defined(CONFIG_PVH) +SYM_DATA_START_PTI_ALIGNED(init_top_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .org init_top_pgt + L4_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .org init_top_pgt + L4_START_KERNEL*8, 0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC + .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(init_top_pgt) + +SYM_DATA_START_PAGE_ALIGNED(level3_ident_pgt) + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .fill 511, 8, 0 +SYM_DATA_END(level3_ident_pgt) +SYM_DATA_START_PAGE_ALIGNED(level2_ident_pgt) + /* + * Since I easily can, map the first 1G. + * Don't set NX because code runs from these pages. + * + * Note: This sets _PAGE_GLOBAL despite whether + * the CPU supports it or it is enabled. But, + * the CPU should ignore the bit. + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +SYM_DATA_END(level2_ident_pgt) +#else +SYM_DATA_START_PTI_ALIGNED(init_top_pgt) + .fill 512,8,0 + .fill PTI_USER_PGD_FILL,8,0 +SYM_DATA_END(init_top_pgt) +#endif + +#ifdef CONFIG_X86_5LEVEL +SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +SYM_DATA_END(level4_kernel_pgt) +#endif + +SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) + .fill L3_START_KERNEL,8,0 + /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +SYM_DATA_END(level3_kernel_pgt) + +SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) + /* + * 512 MB kernel mapping. We spend a full page on this pagetable + * anyway. + * + * The kernel code+data+bss must not be bigger than that. + * + * (NOTE: at +512MB starts the module area, see MODULES_VADDR. + * If you want to increase this then increase MODULES_VADDR + * too.) + * + * This table is eventually used by the kernel during normal + * runtime. Care must be taken to clear out undesired bits + * later, like _PAGE_RW or _PAGE_GLOBAL in some cases. + */ + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, + KERNEL_IMAGE_SIZE/PMD_SIZE) +SYM_DATA_END(level2_kernel_pgt) + +SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) + .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 + pgtno = 0 + .rept (FIXMAP_PMD_NUM) + .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \ + + _PAGE_TABLE_NOENC; + pgtno = pgtno + 1 + .endr + /* 6 MB reserved space + a 2MB hole */ + .fill 4,8,0 +SYM_DATA_END(level2_fixmap_pgt) + +SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt) + .rept (FIXMAP_PMD_NUM) + .fill 512,8,0 + .endr +SYM_DATA_END(level1_fixmap_pgt) + +#undef PMDS + + .data + .align 16 + +SYM_DATA(early_gdt_descr, .word GDT_ENTRIES*8-1) +SYM_DATA_LOCAL(early_gdt_descr_base, .quad INIT_PER_CPU_VAR(gdt_page)) + + .align 16 +/* This must match the first entry in level2_kernel_pgt */ +SYM_DATA(phys_base, .quad 0x0) +EXPORT_SYMBOL(phys_base) + +#include "../../x86/xen/xen-head.S" + + __PAGE_ALIGNED_BSS +SYM_DATA_START_PAGE_ALIGNED(empty_zero_page) + .skip PAGE_SIZE +SYM_DATA_END(empty_zero_page) +EXPORT_SYMBOL(empty_zero_page) + diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c new file mode 100644 index 000000000..574df24a8 --- /dev/null +++ b/arch/x86/kernel/hpet.c @@ -0,0 +1,1361 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "hpet: " fmt + +enum hpet_mode { + HPET_MODE_UNUSED, + HPET_MODE_LEGACY, + HPET_MODE_CLOCKEVT, + HPET_MODE_DEVICE, +}; + +struct hpet_channel { + struct clock_event_device evt; + unsigned int num; + unsigned int cpu; + unsigned int irq; + unsigned int in_use; + enum hpet_mode mode; + unsigned int boot_cfg; + char name[10]; +}; + +struct hpet_base { + unsigned int nr_channels; + unsigned int nr_clockevents; + unsigned int boot_cfg; + struct hpet_channel *channels; +}; + +#define HPET_MASK CLOCKSOURCE_MASK(32) + +#define HPET_MIN_CYCLES 128 +#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) + +/* + * HPET address is set in acpi/boot.c, when an ACPI entry exists + */ +unsigned long hpet_address; +u8 hpet_blockid; /* OS timer block num */ +bool hpet_msi_disable; + +#ifdef CONFIG_PCI_MSI +static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel); +static struct irq_domain *hpet_domain; +#endif + +static void __iomem *hpet_virt_address; + +static struct hpet_base hpet_base; + +static bool hpet_legacy_int_enabled; +static unsigned long hpet_freq; + +bool boot_hpet_disable; +bool hpet_force_user; +static bool hpet_verbose; + +static inline +struct hpet_channel *clockevent_to_channel(struct clock_event_device *evt) +{ + return container_of(evt, struct hpet_channel, evt); +} + +inline unsigned int hpet_readl(unsigned int a) +{ + return readl(hpet_virt_address + a); +} + +static inline void hpet_writel(unsigned int d, unsigned int a) +{ + writel(d, hpet_virt_address + a); +} + +static inline void hpet_set_mapping(void) +{ + hpet_virt_address = ioremap(hpet_address, HPET_MMAP_SIZE); +} + +static inline void hpet_clear_mapping(void) +{ + iounmap(hpet_virt_address); + hpet_virt_address = NULL; +} + +/* + * HPET command line enable / disable + */ +static int __init hpet_setup(char *str) +{ + while (str) { + char *next = strchr(str, ','); + + if (next) + *next++ = 0; + if (!strncmp("disable", str, 7)) + boot_hpet_disable = true; + if (!strncmp("force", str, 5)) + hpet_force_user = true; + if (!strncmp("verbose", str, 7)) + hpet_verbose = true; + str = next; + } + return 1; +} +__setup("hpet=", hpet_setup); + +static int __init disable_hpet(char *str) +{ + boot_hpet_disable = true; + return 1; +} +__setup("nohpet", disable_hpet); + +static inline int is_hpet_capable(void) +{ + return !boot_hpet_disable && hpet_address; +} + +/** + * is_hpet_enabled - Check whether the legacy HPET timer interrupt is enabled + */ +int is_hpet_enabled(void) +{ + return is_hpet_capable() && hpet_legacy_int_enabled; +} +EXPORT_SYMBOL_GPL(is_hpet_enabled); + +static void _hpet_print_config(const char *function, int line) +{ + u32 i, id, period, cfg, status, channels, l, h; + + pr_info("%s(%d):\n", function, line); + + id = hpet_readl(HPET_ID); + period = hpet_readl(HPET_PERIOD); + pr_info("ID: 0x%x, PERIOD: 0x%x\n", id, period); + + cfg = hpet_readl(HPET_CFG); + status = hpet_readl(HPET_STATUS); + pr_info("CFG: 0x%x, STATUS: 0x%x\n", cfg, status); + + l = hpet_readl(HPET_COUNTER); + h = hpet_readl(HPET_COUNTER+4); + pr_info("COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h); + + channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; + + for (i = 0; i < channels; i++) { + l = hpet_readl(HPET_Tn_CFG(i)); + h = hpet_readl(HPET_Tn_CFG(i)+4); + pr_info("T%d: CFG_l: 0x%x, CFG_h: 0x%x\n", i, l, h); + + l = hpet_readl(HPET_Tn_CMP(i)); + h = hpet_readl(HPET_Tn_CMP(i)+4); + pr_info("T%d: CMP_l: 0x%x, CMP_h: 0x%x\n", i, l, h); + + l = hpet_readl(HPET_Tn_ROUTE(i)); + h = hpet_readl(HPET_Tn_ROUTE(i)+4); + pr_info("T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n", i, l, h); + } +} + +#define hpet_print_config() \ +do { \ + if (hpet_verbose) \ + _hpet_print_config(__func__, __LINE__); \ +} while (0) + +/* + * When the HPET driver (/dev/hpet) is enabled, we need to reserve + * timer 0 and timer 1 in case of RTC emulation. + */ +#ifdef CONFIG_HPET + +static void __init hpet_reserve_platform_timers(void) +{ + struct hpet_data hd; + unsigned int i; + + memset(&hd, 0, sizeof(hd)); + hd.hd_phys_address = hpet_address; + hd.hd_address = hpet_virt_address; + hd.hd_nirqs = hpet_base.nr_channels; + + /* + * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254 + * is wrong for i8259!) not the output IRQ. Many BIOS writers + * don't bother configuring *any* comparator interrupts. + */ + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + + if (i >= 2) + hd.hd_irq[i] = hc->irq; + + switch (hc->mode) { + case HPET_MODE_UNUSED: + case HPET_MODE_DEVICE: + hc->mode = HPET_MODE_DEVICE; + break; + case HPET_MODE_CLOCKEVT: + case HPET_MODE_LEGACY: + hpet_reserve_timer(&hd, hc->num); + break; + } + } + + hpet_alloc(&hd); +} + +static void __init hpet_select_device_channel(void) +{ + int i; + + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + + /* Associate the first unused channel to /dev/hpet */ + if (hc->mode == HPET_MODE_UNUSED) { + hc->mode = HPET_MODE_DEVICE; + return; + } + } +} + +#else +static inline void hpet_reserve_platform_timers(void) { } +static inline void hpet_select_device_channel(void) {} +#endif + +/* Common HPET functions */ +static void hpet_stop_counter(void) +{ + u32 cfg = hpet_readl(HPET_CFG); + + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); +} + +static void hpet_reset_counter(void) +{ + hpet_writel(0, HPET_COUNTER); + hpet_writel(0, HPET_COUNTER + 4); +} + +static void hpet_start_counter(void) +{ + unsigned int cfg = hpet_readl(HPET_CFG); + + cfg |= HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); +} + +static void hpet_restart_counter(void) +{ + hpet_stop_counter(); + hpet_reset_counter(); + hpet_start_counter(); +} + +static void hpet_resume_device(void) +{ + force_hpet_resume(); +} + +static void hpet_resume_counter(struct clocksource *cs) +{ + hpet_resume_device(); + hpet_restart_counter(); +} + +static void hpet_enable_legacy_int(void) +{ + unsigned int cfg = hpet_readl(HPET_CFG); + + cfg |= HPET_CFG_LEGACY; + hpet_writel(cfg, HPET_CFG); + hpet_legacy_int_enabled = true; +} + +static int hpet_clkevt_set_state_periodic(struct clock_event_device *evt) +{ + unsigned int channel = clockevent_to_channel(evt)->num; + unsigned int cfg, cmp, now; + uint64_t delta; + + hpet_stop_counter(); + delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult; + delta >>= evt->shift; + now = hpet_readl(HPET_COUNTER); + cmp = now + (unsigned int)delta; + cfg = hpet_readl(HPET_Tn_CFG(channel)); + cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + HPET_TN_32BIT; + hpet_writel(cfg, HPET_Tn_CFG(channel)); + hpet_writel(cmp, HPET_Tn_CMP(channel)); + udelay(1); + /* + * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL + * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL + * bit is automatically cleared after the first write. + * (See AMD-8111 HyperTransport I/O Hub Data Sheet, + * Publication # 24674) + */ + hpet_writel((unsigned int)delta, HPET_Tn_CMP(channel)); + hpet_start_counter(); + hpet_print_config(); + + return 0; +} + +static int hpet_clkevt_set_state_oneshot(struct clock_event_device *evt) +{ + unsigned int channel = clockevent_to_channel(evt)->num; + unsigned int cfg; + + cfg = hpet_readl(HPET_Tn_CFG(channel)); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_Tn_CFG(channel)); + + return 0; +} + +static int hpet_clkevt_set_state_shutdown(struct clock_event_device *evt) +{ + unsigned int channel = clockevent_to_channel(evt)->num; + unsigned int cfg; + + cfg = hpet_readl(HPET_Tn_CFG(channel)); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_Tn_CFG(channel)); + + return 0; +} + +static int hpet_clkevt_legacy_resume(struct clock_event_device *evt) +{ + hpet_enable_legacy_int(); + hpet_print_config(); + return 0; +} + +static int +hpet_clkevt_set_next_event(unsigned long delta, struct clock_event_device *evt) +{ + unsigned int channel = clockevent_to_channel(evt)->num; + u32 cnt; + s32 res; + + cnt = hpet_readl(HPET_COUNTER); + cnt += (u32) delta; + hpet_writel(cnt, HPET_Tn_CMP(channel)); + + /* + * HPETs are a complete disaster. The compare register is + * based on a equal comparison and neither provides a less + * than or equal functionality (which would require to take + * the wraparound into account) nor a simple count down event + * mode. Further the write to the comparator register is + * delayed internally up to two HPET clock cycles in certain + * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even + * longer delays. We worked around that by reading back the + * compare register, but that required another workaround for + * ICH9,10 chips where the first readout after write can + * return the old stale value. We already had a minimum + * programming delta of 5us enforced, but a NMI or SMI hitting + * between the counter readout and the comparator write can + * move us behind that point easily. Now instead of reading + * the compare register back several times, we make the ETIME + * decision based on the following: Return ETIME if the + * counter value after the write is less than HPET_MIN_CYCLES + * away from the event or if the counter is already ahead of + * the event. The minimum programming delta for the generic + * clockevents code is set to 1.5 * HPET_MIN_CYCLES. + */ + res = (s32)(cnt - hpet_readl(HPET_COUNTER)); + + return res < HPET_MIN_CYCLES ? -ETIME : 0; +} + +static void hpet_init_clockevent(struct hpet_channel *hc, unsigned int rating) +{ + struct clock_event_device *evt = &hc->evt; + + evt->rating = rating; + evt->irq = hc->irq; + evt->name = hc->name; + evt->cpumask = cpumask_of(hc->cpu); + evt->set_state_oneshot = hpet_clkevt_set_state_oneshot; + evt->set_next_event = hpet_clkevt_set_next_event; + evt->set_state_shutdown = hpet_clkevt_set_state_shutdown; + + evt->features = CLOCK_EVT_FEAT_ONESHOT; + if (hc->boot_cfg & HPET_TN_PERIODIC) { + evt->features |= CLOCK_EVT_FEAT_PERIODIC; + evt->set_state_periodic = hpet_clkevt_set_state_periodic; + } +} + +static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) +{ + /* + * Start HPET with the boot CPU's cpumask and make it global after + * the IO_APIC has been initialized. + */ + hc->cpu = boot_cpu_data.cpu_index; + strncpy(hc->name, "hpet", sizeof(hc->name)); + hpet_init_clockevent(hc, 50); + + hc->evt.tick_resume = hpet_clkevt_legacy_resume; + + /* + * Legacy horrors and sins from the past. HPET used periodic mode + * unconditionally forever on the legacy channel 0. Removing the + * below hack and using the conditional in hpet_init_clockevent() + * makes at least Qemu and one hardware machine fail to boot. + * There are two issues which cause the boot failure: + * + * #1 After the timer delivery test in IOAPIC and the IOAPIC setup + * the next interrupt is not delivered despite the HPET channel + * being programmed correctly. Reprogramming the HPET after + * switching to IOAPIC makes it work again. After fixing this, + * the next issue surfaces: + * + * #2 Due to the unconditional periodic mode availability the Local + * APIC timer calibration can hijack the global clockevents + * event handler without causing damage. Using oneshot at this + * stage makes if hang because the HPET does not get + * reprogrammed due to the handler hijacking. Duh, stupid me! + * + * Both issues require major surgery and especially the kick HPET + * again after enabling IOAPIC results in really nasty hackery. + * This 'assume periodic works' magic has survived since HPET + * support got added, so it's questionable whether this should be + * fixed. Both Qemu and the failing hardware machine support + * periodic mode despite the fact that both don't advertise it in + * the configuration register and both need that extra kick after + * switching to IOAPIC. Seems to be a feature... + */ + hc->evt.features |= CLOCK_EVT_FEAT_PERIODIC; + hc->evt.set_state_periodic = hpet_clkevt_set_state_periodic; + + /* Start HPET legacy interrupts */ + hpet_enable_legacy_int(); + + clockevents_config_and_register(&hc->evt, hpet_freq, + HPET_MIN_PROG_DELTA, 0x7FFFFFFF); + global_clock_event = &hc->evt; + pr_debug("Clockevent registered\n"); +} + +/* + * HPET MSI Support + */ +#ifdef CONFIG_PCI_MSI + +void hpet_msi_unmask(struct irq_data *data) +{ + struct hpet_channel *hc = irq_data_get_irq_handler_data(data); + unsigned int cfg; + + cfg = hpet_readl(HPET_Tn_CFG(hc->num)); + cfg |= HPET_TN_ENABLE | HPET_TN_FSB; + hpet_writel(cfg, HPET_Tn_CFG(hc->num)); +} + +void hpet_msi_mask(struct irq_data *data) +{ + struct hpet_channel *hc = irq_data_get_irq_handler_data(data); + unsigned int cfg; + + cfg = hpet_readl(HPET_Tn_CFG(hc->num)); + cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB); + hpet_writel(cfg, HPET_Tn_CFG(hc->num)); +} + +void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) +{ + hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num)); + hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4); +} + +static int hpet_clkevt_msi_resume(struct clock_event_device *evt) +{ + struct hpet_channel *hc = clockevent_to_channel(evt); + struct irq_data *data = irq_get_irq_data(hc->irq); + struct msi_msg msg; + + /* Restore the MSI msg and unmask the interrupt */ + irq_chip_compose_msi_msg(data, &msg); + hpet_msi_write(hc, &msg); + hpet_msi_unmask(data); + return 0; +} + +static irqreturn_t hpet_msi_interrupt_handler(int irq, void *data) +{ + struct hpet_channel *hc = data; + struct clock_event_device *evt = &hc->evt; + + if (!evt->event_handler) { + pr_info("Spurious interrupt HPET channel %d\n", hc->num); + return IRQ_HANDLED; + } + + evt->event_handler(evt); + return IRQ_HANDLED; +} + +static int hpet_setup_msi_irq(struct hpet_channel *hc) +{ + if (request_irq(hc->irq, hpet_msi_interrupt_handler, + IRQF_TIMER | IRQF_NOBALANCING, + hc->name, hc)) + return -1; + + disable_irq(hc->irq); + irq_set_affinity(hc->irq, cpumask_of(hc->cpu)); + enable_irq(hc->irq); + + pr_debug("%s irq %u for MSI\n", hc->name, hc->irq); + + return 0; +} + +/* Invoked from the hotplug callback on @cpu */ +static void init_one_hpet_msi_clockevent(struct hpet_channel *hc, int cpu) +{ + struct clock_event_device *evt = &hc->evt; + + hc->cpu = cpu; + per_cpu(cpu_hpet_channel, cpu) = hc; + hpet_setup_msi_irq(hc); + + hpet_init_clockevent(hc, 110); + evt->tick_resume = hpet_clkevt_msi_resume; + + clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA, + 0x7FFFFFFF); +} + +static struct hpet_channel *hpet_get_unused_clockevent(void) +{ + int i; + + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + + if (hc->mode != HPET_MODE_CLOCKEVT || hc->in_use) + continue; + hc->in_use = 1; + return hc; + } + return NULL; +} + +static int hpet_cpuhp_online(unsigned int cpu) +{ + struct hpet_channel *hc = hpet_get_unused_clockevent(); + + if (hc) + init_one_hpet_msi_clockevent(hc, cpu); + return 0; +} + +static int hpet_cpuhp_dead(unsigned int cpu) +{ + struct hpet_channel *hc = per_cpu(cpu_hpet_channel, cpu); + + if (!hc) + return 0; + free_irq(hc->irq, hc); + hc->in_use = 0; + per_cpu(cpu_hpet_channel, cpu) = NULL; + return 0; +} + +static void __init hpet_select_clockevents(void) +{ + unsigned int i; + + hpet_base.nr_clockevents = 0; + + /* No point if MSI is disabled or CPU has an Always Runing APIC Timer */ + if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT)) + return; + + hpet_print_config(); + + hpet_domain = hpet_create_irq_domain(hpet_blockid); + if (!hpet_domain) + return; + + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + int irq; + + if (hc->mode != HPET_MODE_UNUSED) + continue; + + /* Only consider HPET channel with MSI support */ + if (!(hc->boot_cfg & HPET_TN_FSB_CAP)) + continue; + + sprintf(hc->name, "hpet%d", i); + + irq = hpet_assign_irq(hpet_domain, hc, hc->num); + if (irq <= 0) + continue; + + hc->irq = irq; + hc->mode = HPET_MODE_CLOCKEVT; + + if (++hpet_base.nr_clockevents == num_possible_cpus()) + break; + } + + pr_info("%d channels of %d reserved for per-cpu timers\n", + hpet_base.nr_channels, hpet_base.nr_clockevents); +} + +#else + +static inline void hpet_select_clockevents(void) { } + +#define hpet_cpuhp_online NULL +#define hpet_cpuhp_dead NULL + +#endif + +/* + * Clock source related code + */ +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) +/* + * Reading the HPET counter is a very slow operation. If a large number of + * CPUs are trying to access the HPET counter simultaneously, it can cause + * massive delays and slow down system performance dramatically. This may + * happen when HPET is the default clock source instead of TSC. For a + * really large system with hundreds of CPUs, the slowdown may be so + * severe, that it can actually crash the system because of a NMI watchdog + * soft lockup, for example. + * + * If multiple CPUs are trying to access the HPET counter at the same time, + * we don't actually need to read the counter multiple times. Instead, the + * other CPUs can use the counter value read by the first CPU in the group. + * + * This special feature is only enabled on x86-64 systems. It is unlikely + * that 32-bit x86 systems will have enough CPUs to require this feature + * with its associated locking overhead. We also need 64-bit atomic read. + * + * The lock and the HPET value are stored together and can be read in a + * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t + * is 32 bits in size. + */ +union hpet_lock { + struct { + arch_spinlock_t lock; + u32 value; + }; + u64 lockval; +}; + +static union hpet_lock hpet __cacheline_aligned = { + { .lock = __ARCH_SPIN_LOCK_UNLOCKED, }, +}; + +static u64 read_hpet(struct clocksource *cs) +{ + unsigned long flags; + union hpet_lock old, new; + + BUILD_BUG_ON(sizeof(union hpet_lock) != 8); + + /* + * Read HPET directly if in NMI. + */ + if (in_nmi()) + return (u64)hpet_readl(HPET_COUNTER); + + /* + * Read the current state of the lock and HPET value atomically. + */ + old.lockval = READ_ONCE(hpet.lockval); + + if (arch_spin_is_locked(&old.lock)) + goto contended; + + local_irq_save(flags); + if (arch_spin_trylock(&hpet.lock)) { + new.value = hpet_readl(HPET_COUNTER); + /* + * Use WRITE_ONCE() to prevent store tearing. + */ + WRITE_ONCE(hpet.value, new.value); + arch_spin_unlock(&hpet.lock); + local_irq_restore(flags); + return (u64)new.value; + } + local_irq_restore(flags); + +contended: + /* + * Contended case + * -------------- + * Wait until the HPET value change or the lock is free to indicate + * its value is up-to-date. + * + * It is possible that old.value has already contained the latest + * HPET value while the lock holder was in the process of releasing + * the lock. Checking for lock state change will enable us to return + * the value immediately instead of waiting for the next HPET reader + * to come along. + */ + do { + cpu_relax(); + new.lockval = READ_ONCE(hpet.lockval); + } while ((new.value == old.value) && arch_spin_is_locked(&new.lock)); + + return (u64)new.value; +} +#else +/* + * For UP or 32-bit. + */ +static u64 read_hpet(struct clocksource *cs) +{ + return (u64)hpet_readl(HPET_COUNTER); +} +#endif + +static struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = HPET_MASK, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = hpet_resume_counter, +}; + +/* + * AMD SB700 based systems with spread spectrum enabled use a SMM based + * HPET emulation to provide proper frequency setting. + * + * On such systems the SMM code is initialized with the first HPET register + * access and takes some time to complete. During this time the config + * register reads 0xffffffff. We check for max 1000 loops whether the + * config register reads a non-0xffffffff value to make sure that the + * HPET is up and running before we proceed any further. + * + * A counting loop is safe, as the HPET access takes thousands of CPU cycles. + * + * On non-SB700 based machines this check is only done once and has no + * side effects. + */ +static bool __init hpet_cfg_working(void) +{ + int i; + + for (i = 0; i < 1000; i++) { + if (hpet_readl(HPET_CFG) != 0xFFFFFFFF) + return true; + } + + pr_warn("Config register invalid. Disabling HPET\n"); + return false; +} + +static bool __init hpet_counting(void) +{ + u64 start, now, t1; + + hpet_restart_counter(); + + t1 = hpet_readl(HPET_COUNTER); + start = rdtsc(); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + if (t1 != hpet_readl(HPET_COUNTER)) + return true; + now = rdtsc(); + } while ((now - start) < 200000UL); + + pr_warn("Counter not counting. HPET disabled\n"); + return false; +} + +static bool __init mwait_pc10_supported(void) +{ + unsigned int eax, ebx, ecx, mwait_substates; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) + return false; + + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + + return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && + (ecx & CPUID5_ECX_INTERRUPT_BREAK) && + (mwait_substates & (0xF << 28)); +} + +/* + * Check whether the system supports PC10. If so force disable HPET as that + * stops counting in PC10. This check is overbroad as it does not take any + * of the following into account: + * + * - ACPI tables + * - Enablement of intel_idle + * - Command line arguments which limit intel_idle C-state support + * + * That's perfectly fine. HPET is a piece of hardware designed by committee + * and the only reasons why it is still in use on modern systems is the + * fact that it is impossible to reliably query TSC and CPU frequency via + * CPUID or firmware. + * + * If HPET is functional it is useful for calibrating TSC, but this can be + * done via PMTIMER as well which seems to be the last remaining timer on + * X86/INTEL platforms that has not been completely wreckaged by feature + * creep. + * + * In theory HPET support should be removed altogether, but there are older + * systems out there which depend on it because TSC and APIC timer are + * dysfunctional in deeper C-states. + * + * It's only 20 years now that hardware people have been asked to provide + * reliable and discoverable facilities which can be used for timekeeping + * and per CPU timer interrupts. + * + * The probability that this problem is going to be solved in the + * forseeable future is close to zero, so the kernel has to be cluttered + * with heuristics to keep up with the ever growing amount of hardware and + * firmware trainwrecks. Hopefully some day hardware people will understand + * that the approach of "This can be fixed in software" is not sustainable. + * Hope dies last... + */ +static bool __init hpet_is_pc10_damaged(void) +{ + unsigned long long pcfg; + + /* Check whether PC10 substates are supported */ + if (!mwait_pc10_supported()) + return false; + + /* Check whether PC10 is enabled in PKG C-state limit */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); + if ((pcfg & 0xF) < 8) + return false; + + if (hpet_force_user) { + pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n"); + return false; + } + + pr_info("HPET dysfunctional in PC10. Force disabled.\n"); + boot_hpet_disable = true; + return true; +} + +/** + * hpet_enable - Try to setup the HPET timer. Returns 1 on success. + */ +int __init hpet_enable(void) +{ + u32 hpet_period, cfg, id, irq; + unsigned int i, channels; + struct hpet_channel *hc; + u64 freq; + + if (!is_hpet_capable()) + return 0; + + if (hpet_is_pc10_damaged()) + return 0; + + hpet_set_mapping(); + if (!hpet_virt_address) + return 0; + + /* Validate that the config register is working */ + if (!hpet_cfg_working()) + goto out_nohpet; + + /* + * Read the period and check for a sane value: + */ + hpet_period = hpet_readl(HPET_PERIOD); + if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) + goto out_nohpet; + + /* The period is a femtoseconds value. Convert it to a frequency. */ + freq = FSEC_PER_SEC; + do_div(freq, hpet_period); + hpet_freq = freq; + + /* + * Read the HPET ID register to retrieve the IRQ routing + * information and the number of channels + */ + id = hpet_readl(HPET_ID); + hpet_print_config(); + + /* This is the HPET channel number which is zero based */ + channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; + + /* + * The legacy routing mode needs at least two channels, tick timer + * and the rtc emulation channel. + */ + if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC) && channels < 2) + goto out_nohpet; + + hc = kcalloc(channels, sizeof(*hc), GFP_KERNEL); + if (!hc) { + pr_warn("Disabling HPET.\n"); + goto out_nohpet; + } + hpet_base.channels = hc; + hpet_base.nr_channels = channels; + + /* Read, store and sanitize the global configuration */ + cfg = hpet_readl(HPET_CFG); + hpet_base.boot_cfg = cfg; + cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); + hpet_writel(cfg, HPET_CFG); + if (cfg) + pr_warn("Global config: Unknown bits %#x\n", cfg); + + /* Read, store and sanitize the per channel configuration */ + for (i = 0; i < channels; i++, hc++) { + hc->num = i; + + cfg = hpet_readl(HPET_Tn_CFG(i)); + hc->boot_cfg = cfg; + irq = (cfg & Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; + hc->irq = irq; + + cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB); + hpet_writel(cfg, HPET_Tn_CFG(i)); + + cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP + | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE + | HPET_TN_FSB | HPET_TN_FSB_CAP); + if (cfg) + pr_warn("Channel #%u config: Unknown bits %#x\n", i, cfg); + } + hpet_print_config(); + + /* + * Validate that the counter is counting. This needs to be done + * after sanitizing the config registers to properly deal with + * force enabled HPETs. + */ + if (!hpet_counting()) + goto out_nohpet; + + clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); + + if (id & HPET_ID_LEGSUP) { + hpet_legacy_clockevent_register(&hpet_base.channels[0]); + hpet_base.channels[0].mode = HPET_MODE_LEGACY; + if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC)) + hpet_base.channels[1].mode = HPET_MODE_LEGACY; + return 1; + } + return 0; + +out_nohpet: + kfree(hpet_base.channels); + hpet_base.channels = NULL; + hpet_base.nr_channels = 0; + hpet_clear_mapping(); + hpet_address = 0; + return 0; +} + +/* + * The late initialization runs after the PCI quirks have been invoked + * which might have detected a system on which the HPET can be enforced. + * + * Also, the MSI machinery is not working yet when the HPET is initialized + * early. + * + * If the HPET is enabled, then: + * + * 1) Reserve one channel for /dev/hpet if CONFIG_HPET=y + * 2) Reserve up to num_possible_cpus() channels as per CPU clockevents + * 3) Setup /dev/hpet if CONFIG_HPET=y + * 4) Register hotplug callbacks when clockevents are available + */ +static __init int hpet_late_init(void) +{ + int ret; + + if (!hpet_address) { + if (!force_hpet_address) + return -ENODEV; + + hpet_address = force_hpet_address; + hpet_enable(); + } + + if (!hpet_virt_address) + return -ENODEV; + + hpet_select_device_channel(); + hpet_select_clockevents(); + hpet_reserve_platform_timers(); + hpet_print_config(); + + if (!hpet_base.nr_clockevents) + return 0; + + ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online", + hpet_cpuhp_online, NULL); + if (ret) + return ret; + ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "x86/hpet:dead", NULL, + hpet_cpuhp_dead); + if (ret) + goto err_cpuhp; + return 0; + +err_cpuhp: + cpuhp_remove_state(CPUHP_AP_X86_HPET_ONLINE); + return ret; +} +fs_initcall(hpet_late_init); + +void hpet_disable(void) +{ + unsigned int i; + u32 cfg; + + if (!is_hpet_capable() || !hpet_virt_address) + return; + + /* Restore boot configuration with the enable bit cleared */ + cfg = hpet_base.boot_cfg; + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); + + /* Restore the channel boot configuration */ + for (i = 0; i < hpet_base.nr_channels; i++) + hpet_writel(hpet_base.channels[i].boot_cfg, HPET_Tn_CFG(i)); + + /* If the HPET was enabled at boot time, reenable it */ + if (hpet_base.boot_cfg & HPET_CFG_ENABLE) + hpet_writel(hpet_base.boot_cfg, HPET_CFG); +} + +#ifdef CONFIG_HPET_EMULATE_RTC + +/* + * HPET in LegacyReplacement mode eats up the RTC interrupt line. When HPET + * is enabled, we support RTC interrupt functionality in software. + * + * RTC has 3 kinds of interrupts: + * + * 1) Update Interrupt - generate an interrupt, every second, when the + * RTC clock is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all frequencies in powers of 2) + * + * (1) and (2) above are implemented using polling at a frequency of 64 Hz: + * DEFAULT_RTC_INT_FREQ. + * + * The exact frequency is a tradeoff between accuracy and interrupt overhead. + * + * For (3), we use interrupts at 64 Hz, or the user specified periodic frequency, + * if it's higher. + */ +#include +#include + +#define DEFAULT_RTC_INT_FREQ 64 +#define DEFAULT_RTC_SHIFT 6 +#define RTC_NUM_INTS 1 + +static unsigned long hpet_rtc_flags; +static int hpet_prev_update_sec; +static struct rtc_time hpet_alarm_time; +static unsigned long hpet_pie_count; +static u32 hpet_t1_cmp; +static u32 hpet_default_delta; +static u32 hpet_pie_delta; +static unsigned long hpet_pie_limit; + +static rtc_irq_handler irq_handler; + +/* + * Check that the HPET counter c1 is ahead of c2 + */ +static inline int hpet_cnt_ahead(u32 c1, u32 c2) +{ + return (s32)(c2 - c1) < 0; +} + +/* + * Registers a IRQ handler. + */ +int hpet_register_irq_handler(rtc_irq_handler handler) +{ + if (!is_hpet_enabled()) + return -ENODEV; + if (irq_handler) + return -EBUSY; + + irq_handler = handler; + + return 0; +} +EXPORT_SYMBOL_GPL(hpet_register_irq_handler); + +/* + * Deregisters the IRQ handler registered with hpet_register_irq_handler() + * and does cleanup. + */ +void hpet_unregister_irq_handler(rtc_irq_handler handler) +{ + if (!is_hpet_enabled()) + return; + + irq_handler = NULL; + hpet_rtc_flags = 0; +} +EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); + +/* + * Channel 1 for RTC emulation. We use one shot mode, as periodic mode + * is not supported by all HPET implementations for channel 1. + * + * hpet_rtc_timer_init() is called when the rtc is initialized. + */ +int hpet_rtc_timer_init(void) +{ + unsigned int cfg, cnt, delta; + unsigned long flags; + + if (!is_hpet_enabled()) + return 0; + + if (!hpet_default_delta) { + struct clock_event_device *evt = &hpet_base.channels[0].evt; + uint64_t clc; + + clc = (uint64_t) evt->mult * NSEC_PER_SEC; + clc >>= evt->shift + DEFAULT_RTC_SHIFT; + hpet_default_delta = clc; + } + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + local_irq_save(flags); + + cnt = delta + hpet_readl(HPET_COUNTER); + hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; + + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + local_irq_restore(flags); + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); + +static void hpet_disable_rtc_channel(void) +{ + u32 cfg = hpet_readl(HPET_T1_CFG); + + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); +} + +/* + * The functions below are called from rtc driver. + * Return 0 if HPET is not being used. + * Otherwise do the necessary changes and return 1. + */ +int hpet_mask_rtc_irq_bit(unsigned long bit_mask) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags &= ~bit_mask; + if (unlikely(!hpet_rtc_flags)) + hpet_disable_rtc_channel(); + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit); + +int hpet_set_rtc_irq_bit(unsigned long bit_mask) +{ + unsigned long oldbits = hpet_rtc_flags; + + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags |= bit_mask; + + if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE)) + hpet_prev_update_sec = -1; + + if (!oldbits) + hpet_rtc_timer_init(); + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit); + +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_alarm_time.tm_hour = hrs; + hpet_alarm_time.tm_min = min; + hpet_alarm_time.tm_sec = sec; + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_set_alarm_time); + +int hpet_set_periodic_freq(unsigned long freq) +{ + uint64_t clc; + + if (!is_hpet_enabled()) + return 0; + + if (freq <= DEFAULT_RTC_INT_FREQ) { + hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq; + } else { + struct clock_event_device *evt = &hpet_base.channels[0].evt; + + clc = (uint64_t) evt->mult * NSEC_PER_SEC; + do_div(clc, freq); + clc >>= evt->shift; + hpet_pie_delta = clc; + hpet_pie_limit = 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_set_periodic_freq); + +int hpet_rtc_dropped_irq(void) +{ + return is_hpet_enabled(); +} +EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); + +static void hpet_rtc_timer_reinit(void) +{ + unsigned int delta; + int lost_ints = -1; + + if (unlikely(!hpet_rtc_flags)) + hpet_disable_rtc_channel(); + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + /* + * Increment the comparator value until we are ahead of the + * current count. + */ + do { + hpet_t1_cmp += delta; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + lost_ints++; + } while (!hpet_cnt_ahead(hpet_t1_cmp, hpet_readl(HPET_COUNTER))); + + if (lost_ints) { + if (hpet_rtc_flags & RTC_PIE) + hpet_pie_count += lost_ints; + if (printk_ratelimit()) + pr_warn("Lost %d RTC interrupts\n", lost_ints); + } +} + +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) +{ + struct rtc_time curr_time; + unsigned long rtc_int_flag = 0; + + hpet_rtc_timer_reinit(); + memset(&curr_time, 0, sizeof(struct rtc_time)); + + if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) { + if (unlikely(mc146818_get_time(&curr_time) < 0)) { + pr_err_ratelimited("unable to read current time from RTC\n"); + return IRQ_HANDLED; + } + } + + if (hpet_rtc_flags & RTC_UIE && + curr_time.tm_sec != hpet_prev_update_sec) { + if (hpet_prev_update_sec >= 0) + rtc_int_flag = RTC_UF; + hpet_prev_update_sec = curr_time.tm_sec; + } + + if (hpet_rtc_flags & RTC_PIE && ++hpet_pie_count >= hpet_pie_limit) { + rtc_int_flag |= RTC_PF; + hpet_pie_count = 0; + } + + if (hpet_rtc_flags & RTC_AIE && + (curr_time.tm_sec == hpet_alarm_time.tm_sec) && + (curr_time.tm_min == hpet_alarm_time.tm_min) && + (curr_time.tm_hour == hpet_alarm_time.tm_hour)) + rtc_int_flag |= RTC_AF; + + if (rtc_int_flag) { + rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); + if (irq_handler) + irq_handler(rtc_int_flag, dev_id); + } + return IRQ_HANDLED; +} +EXPORT_SYMBOL_GPL(hpet_rtc_interrupt); +#endif diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c new file mode 100644 index 000000000..668a4a653 --- /dev/null +++ b/arch/x86/kernel/hw_breakpoint.c @@ -0,0 +1,592 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) 2009 IBM Corporation + * Copyright (C) 2009 Frederic Weisbecker + * + * Authors: Alan Stern + * K.Prasad + * Frederic Weisbecker + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* Per cpu debug control register value */ +DEFINE_PER_CPU(unsigned long, cpu_dr7); +EXPORT_PER_CPU_SYMBOL(cpu_dr7); + +/* Per cpu debug address registers values */ +static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); + +/* + * Stores the breakpoints currently in use on each breakpoint address + * register for each cpus + */ +static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); + + +static inline unsigned long +__encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + unsigned long bp_info; + + bp_info = (len | type) & 0xf; + bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); + + return bp_info; +} + +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. + */ +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN; +} + +/* + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. + */ +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type) +{ + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); + + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; + + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; +} + +/* + * Install a perf counter breakpoint. + * + * We seek a free debug address register and use it for this + * breakpoint. Eventually we enable it in the debug control register. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +int arch_install_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + lockdep_assert_irqs_disabled(); + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); + + if (!*slot) { + *slot = bp; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return -EBUSY; + + set_debugreg(info->address, i); + __this_cpu_write(cpu_debugreg[i], info->address); + + dr7 = this_cpu_ptr(&cpu_dr7); + *dr7 |= encode_dr7(i, info->len, info->type); + + /* + * Ensure we first write cpu_dr7 before we set the DR7 register. + * This ensures an NMI never see cpu_dr7 0 when DR7 is not. + */ + barrier(); + + set_debugreg(*dr7, 7); + if (info->mask) + set_dr_addr_mask(info->mask, i); + + return 0; +} + +/* + * Uninstall the breakpoint contained in the given counter. + * + * First we search the debug address register it uses and then we disable + * it. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +void arch_uninstall_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long dr7; + int i; + + lockdep_assert_irqs_disabled(); + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); + + if (*slot == bp) { + *slot = NULL; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return; + + dr7 = this_cpu_read(cpu_dr7); + dr7 &= ~__encode_dr7(i, info->len, info->type); + + set_debugreg(dr7, 7); + if (info->mask) + set_dr_addr_mask(0, i); + + /* + * Ensure the write to cpu_dr7 is after we've set the DR7 register. + * This ensures an NMI never see cpu_dr7 0 when DR7 is not. + */ + barrier(); + + this_cpu_write(cpu_dr7, dr7); +} + +static int arch_bp_generic_len(int x86_len) +{ + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + return HW_BREAKPOINT_LEN_1; + case X86_BREAKPOINT_LEN_2: + return HW_BREAKPOINT_LEN_2; + case X86_BREAKPOINT_LEN_4: + return HW_BREAKPOINT_LEN_4; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + return HW_BREAKPOINT_LEN_8; +#endif + default: + return -EINVAL; + } +} + +int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type) +{ + int len; + + /* Type */ + switch (x86_type) { + case X86_BREAKPOINT_EXECUTE: + if (x86_len != X86_BREAKPOINT_LEN_X) + return -EINVAL; + + *gen_type = HW_BREAKPOINT_X; + *gen_len = sizeof(long); + return 0; + case X86_BREAKPOINT_WRITE: + *gen_type = HW_BREAKPOINT_W; + break; + case X86_BREAKPOINT_RW: + *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; + break; + default: + return -EINVAL; + } + + /* Len */ + len = arch_bp_generic_len(x86_len); + if (len < 0) + return -EINVAL; + *gen_len = len; + + return 0; +} + +/* + * Check for virtual address in kernel space. + */ +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) +{ + unsigned long va; + int len; + + va = hw->address; + len = arch_bp_generic_len(hw->len); + WARN_ON_ONCE(len < 0); + + /* + * We don't need to worry about va + len - 1 overflowing: + * we already require that va is aligned to a multiple of len. + */ + return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); +} + +/* + * Checks whether the range [addr, end], overlaps the area [base, base + size). + */ +static inline bool within_area(unsigned long addr, unsigned long end, + unsigned long base, unsigned long size) +{ + return end >= base && addr < (base + size); +} + +/* + * Checks whether the range from addr to end, inclusive, overlaps the fixed + * mapped CPU entry area range or other ranges used for CPU entry. + */ +static inline bool within_cpu_entry(unsigned long addr, unsigned long end) +{ + int cpu; + + /* CPU entry erea is always used for CPU entry */ + if (within_area(addr, end, CPU_ENTRY_AREA_BASE, + CPU_ENTRY_AREA_TOTAL_SIZE)) + return true; + + /* + * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU + * GSBASE value via __per_cpu_offset or pcpu_unit_offsets. + */ +#ifdef CONFIG_SMP + if (within_area(addr, end, (unsigned long)__per_cpu_offset, + sizeof(unsigned long) * nr_cpu_ids)) + return true; +#else + if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets, + sizeof(pcpu_unit_offsets))) + return true; +#endif + + for_each_possible_cpu(cpu) { + /* The original rw GDT is being used after load_direct_gdt() */ + if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu), + GDT_SIZE)) + return true; + + /* + * cpu_tss_rw is not directly referenced by hardware, but + * cpu_tss_rw is also used in CPU entry code, + */ + if (within_area(addr, end, + (unsigned long)&per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct))) + return true; + + /* + * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry. + * If a data breakpoint on it, it will cause an unwanted #DB. + * Protect the full cpu_tlbstate structure to be sure. + */ + if (within_area(addr, end, + (unsigned long)&per_cpu(cpu_tlbstate, cpu), + sizeof(struct tlb_state))) + return true; + + /* + * When in guest (X86_FEATURE_HYPERVISOR), local_db_save() + * will read per-cpu cpu_dr7 before clear dr7 register. + */ + if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu), + sizeof(cpu_dr7))) + return true; + } + + return false; +} + +static int arch_build_bp_info(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) +{ + unsigned long bp_end; + + bp_end = attr->bp_addr + attr->bp_len - 1; + if (bp_end < attr->bp_addr) + return -EINVAL; + + /* + * Prevent any breakpoint of any type that overlaps the CPU + * entry area and data. This protects the IST stacks and also + * reduces the chance that we ever find out what happens if + * there's a data breakpoint on the GDT, IDT, or TSS. + */ + if (within_cpu_entry(attr->bp_addr, bp_end)) + return -EINVAL; + + hw->address = attr->bp_addr; + hw->mask = 0; + + /* Type */ + switch (attr->bp_type) { + case HW_BREAKPOINT_W: + hw->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + hw->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + /* + * We don't allow kernel breakpoints in places that are not + * acceptable for kprobes. On non-kprobes kernels, we don't + * allow kernel breakpoints at all. + */ + if (attr->bp_addr >= TASK_SIZE_MAX) { + if (within_kprobe_blacklist(attr->bp_addr)) + return -EINVAL; + } + + hw->type = X86_BREAKPOINT_EXECUTE; + /* + * x86 inst breakpoints need to have a specific undefined len. + * But we still need to check userspace is not trying to setup + * an unsupported length, to get a range breakpoint for example. + */ + if (attr->bp_len == sizeof(long)) { + hw->len = X86_BREAKPOINT_LEN_X; + return 0; + } + fallthrough; + default: + return -EINVAL; + } + + /* Len */ + switch (attr->bp_len) { + case HW_BREAKPOINT_LEN_1: + hw->len = X86_BREAKPOINT_LEN_1; + break; + case HW_BREAKPOINT_LEN_2: + hw->len = X86_BREAKPOINT_LEN_2; + break; + case HW_BREAKPOINT_LEN_4: + hw->len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + hw->len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: + /* AMD range breakpoint */ + if (!is_power_of_2(attr->bp_len)) + return -EINVAL; + if (attr->bp_addr & (attr->bp_len - 1)) + return -EINVAL; + + if (!boot_cpu_has(X86_FEATURE_BPEXT)) + return -EOPNOTSUPP; + + /* + * It's impossible to use a range breakpoint to fake out + * user vs kernel detection because bp_len - 1 can't + * have the high bit set. If we ever allow range instruction + * breakpoints, then we'll have to check for kprobe-blacklisted + * addresses anywhere in the range. + */ + hw->mask = attr->bp_len - 1; + hw->len = X86_BREAKPOINT_LEN_1; + } + + return 0; +} + +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int hw_breakpoint_arch_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) +{ + unsigned int align; + int ret; + + + ret = arch_build_bp_info(bp, attr, hw); + if (ret) + return ret; + + switch (hw->len) { + case X86_BREAKPOINT_LEN_1: + align = 0; + if (hw->mask) + align = hw->mask; + break; + case X86_BREAKPOINT_LEN_2: + align = 1; + break; + case X86_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + align = 7; + break; +#endif + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + + /* + * Check that the low-order bits of the address are appropriate + * for the alignment implied by len. + */ + if (hw->address & align) + return -EINVAL; + + return 0; +} + +/* + * Release the user breakpoints used by ptrace + */ +void flush_ptrace_hw_breakpoint(struct task_struct *tsk) +{ + int i; + struct thread_struct *t = &tsk->thread; + + for (i = 0; i < HBP_NUM; i++) { + unregister_hw_breakpoint(t->ptrace_bps[i]); + t->ptrace_bps[i] = NULL; + } + + t->virtual_dr6 = 0; + t->ptrace_dr7 = 0; +} + +void hw_breakpoint_restore(void) +{ + set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0); + set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1); + set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2); + set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3); + set_debugreg(DR6_RESERVED, 6); + set_debugreg(__this_cpu_read(cpu_dr7), 7); +} +EXPORT_SYMBOL_GPL(hw_breakpoint_restore); + +/* + * Handle debug exception notifications. + * + * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below. + * + * NOTIFY_DONE returned if one of the following conditions is true. + * i) When the causative address is from user-space and the exception + * is a valid one, i.e. not triggered as a result of lazy debug register + * switching + * ii) When there are more bits than trap set in DR6 register (such + * as BD, BS or BT) indicating that more than one debug condition is + * met and requires some more action in do_debug(). + * + * NOTIFY_STOP returned for all other cases + * + */ +static int hw_breakpoint_handler(struct die_args *args) +{ + int i, rc = NOTIFY_STOP; + struct perf_event *bp; + unsigned long *dr6_p; + unsigned long dr6; + bool bpx; + + /* The DR6 value is pointed by args->err */ + dr6_p = (unsigned long *)ERR_PTR(args->err); + dr6 = *dr6_p; + + /* Do an early return if no trap bits are set in DR6 */ + if ((dr6 & DR_TRAP_BITS) == 0) + return NOTIFY_DONE; + + /* Handle all the breakpoints that were triggered */ + for (i = 0; i < HBP_NUM; ++i) { + if (likely(!(dr6 & (DR_TRAP0 << i)))) + continue; + + bp = this_cpu_read(bp_per_reg[i]); + if (!bp) + continue; + + bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE; + + /* + * TF and data breakpoints are traps and can be merged, however + * instruction breakpoints are faults and will be raised + * separately. + * + * However DR6 can indicate both TF and instruction + * breakpoints. In that case take TF as that has precedence and + * delay the instruction breakpoint for the next exception. + */ + if (bpx && (dr6 & DR_STEP)) + continue; + + /* + * Reset the 'i'th TRAP bit in dr6 to denote completion of + * exception handling + */ + (*dr6_p) &= ~(DR_TRAP0 << i); + + perf_bp_event(bp, args->regs); + + /* + * Set up resume flag to avoid breakpoint recursion when + * returning back to origin. + */ + if (bpx) + args->regs->flags |= X86_EFLAGS_RF; + } + + /* + * Further processing in do_debug() is needed for a) user-space + * breakpoints (to generate signals) and b) when the system has + * taken exception due to multiple causes + */ + if ((current->thread.virtual_dr6 & DR_TRAP_BITS) || + (dr6 & (~DR_TRAP_BITS))) + rc = NOTIFY_DONE; + + return rc; +} + +/* + * Handle debug exception notifications. + */ +int hw_breakpoint_exceptions_notify( + struct notifier_block *unused, unsigned long val, void *data) +{ + if (val != DIE_DEBUG) + return NOTIFY_DONE; + + return hw_breakpoint_handler(data); +} + +void hw_breakpoint_pmu_read(struct perf_event *bp) +{ + /* TODO */ +} diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c new file mode 100644 index 000000000..2cd124ad9 --- /dev/null +++ b/arch/x86/kernel/i8237.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * 8237A DMA controller suspend functions. + * + * Written by Pierre Ossman, 2005. + */ + +#include +#include +#include + +#include +#include + +/* + * This module just handles suspend/resume issues with the + * 8237A DMA controller (used for ISA and LPC). + * Allocation is handled in kernel/dma.c and normal usage is + * in asm/dma.h. + */ + +static void i8237A_resume(void) +{ + unsigned long flags; + int i; + + flags = claim_dma_lock(); + + dma_outb(0, DMA1_RESET_REG); + dma_outb(0, DMA2_RESET_REG); + + for (i = 0; i < 8; i++) { + set_dma_addr(i, 0x000000); + /* DMA count is a bit weird so this is not 0 */ + set_dma_count(i, 1); + } + + /* Enable cascade DMA or channel 0-3 won't work */ + enable_dma(4); + + release_dma_lock(flags); +} + +static struct syscore_ops i8237_syscore_ops = { + .resume = i8237A_resume, +}; + +static int __init i8237A_init_ops(void) +{ + /* + * From SKL PCH onwards, the legacy DMA device is removed in which the + * I/O ports (81h-83h, 87h, 89h-8Bh, 8Fh) related to it are removed + * as well. All removed ports must return 0xff for a inb() request. + * + * Note: DMA_PAGE_2 (port 0x81) should not be checked for detecting + * the presence of DMA device since it may be used by BIOS to decode + * LPC traffic for POST codes. Original LPC only decodes one byte of + * port 0x80 but some BIOS may choose to enhance PCH LPC port 0x8x + * decoding. + */ + if (dma_inb(DMA_PAGE_0) == 0xFF) + return -ENODEV; + + /* + * It is not required to load this driver as newer SoC may not + * support 8237 DMA or bus mastering from LPC. Platform firmware + * must announce the support for such legacy devices via + * ACPI_FADT_LEGACY_DEVICES field in FADT table. + */ + if (x86_pnpbios_disabled() && dmi_get_bios_year() >= 2017) + return -ENODEV; + + register_syscore_ops(&i8237_syscore_ops); + return 0; +} +device_initcall(i8237A_init_ops); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c new file mode 100644 index 000000000..2b7999a1a --- /dev/null +++ b/arch/x86/kernel/i8253.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * 8253/PIT functions + * + */ +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * HPET replaces the PIT, when enabled. So we need to know, which of + * the two timers is used + */ +struct clock_event_device *global_clock_event; + +/* + * Modern chipsets can disable the PIT clock which makes it unusable. It + * would be possible to enable the clock but the registers are chipset + * specific and not discoverable. Avoid the whack a mole game. + * + * These platforms have discoverable TSC/CPU frequencies but this also + * requires to know the local APIC timer frequency as it normally is + * calibrated against the PIT interrupt. + */ +static bool __init use_pit(void) +{ + if (!IS_ENABLED(CONFIG_X86_TSC) || !boot_cpu_has(X86_FEATURE_TSC)) + return true; + + /* This also returns true when APIC is disabled */ + return apic_needs_pit(); +} + +bool __init pit_timer_init(void) +{ + if (!use_pit()) + return false; + + clockevent_i8253_init(true); + global_clock_event = &i8253_clockevent; + return true; +} + +#ifndef CONFIG_X86_64 +static int __init init_pit_clocksource(void) +{ + /* + * Several reasons not to register PIT as a clocksource: + * + * - On SMP PIT does not scale due to i8253_lock + * - when HPET is enabled + * - when local APIC timer is active (PIT is switched off) + */ + if (num_possible_cpus() > 1 || is_hpet_enabled() || + !clockevent_state_periodic(&i8253_clockevent)) + return 0; + + return clocksource_i8253_init(); +} +arch_initcall(init_pit_clocksource); +#endif /* !CONFIG_X86_64 */ diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c new file mode 100644 index 000000000..4c9f559e1 --- /dev/null +++ b/arch/x86/kernel/i8259.c @@ -0,0 +1,457 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + */ +static void init_8259A(int auto_eoi); + +static bool pcat_compat __ro_after_init; +static int i8259A_auto_eoi; +DEFINE_RAW_SPINLOCK(i8259A_lock); + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +unsigned int cached_irq_mask = 0xffff; + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. + */ +unsigned long io_apic_irqs; + +static void mask_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void disable_8259A_irq(struct irq_data *data) +{ + mask_8259A_irq(data->irq); +} + +static void unmask_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void enable_8259A_irq(struct irq_data *data) +{ + unmask_8259A_irq(data->irq); +} + +static int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<> 8); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +static void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<> 8); + outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */ + return value; +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(struct irq_data *data) +{ + unsigned int irq = data->irq; + unsigned int irqmask = 1 << irq; + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecessarily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ + outb(cached_slave_mask, PIC_SLAVE_IMR); + /* 'Specific EOI' to slave */ + outb(0x60+(irq&7), PIC_SLAVE_CMD); + /* 'Specific EOI' to master-IRQ2 */ + outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD); + } else { + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ + outb(cached_master_mask, PIC_MASTER_IMR); + outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ + } + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. [once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { + printk_deferred(KERN_DEBUG + "spurious 8259A interrupt: IRQ%d.\n", irq); + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .irq_mask = disable_8259A_irq, + .irq_disable = disable_8259A_irq, + .irq_unmask = enable_8259A_irq, + .irq_mask_ack = mask_and_ack_8259A, +}; + +static char irq_trigger[2]; +/** + * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ + */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], 0x4d0); + outb(trigger[1], 0x4d1); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(0x4d0) & 0xF8; + trigger[1] = inb(0x4d1) & 0xDE; +} + +static void i8259A_resume(void) +{ + init_8259A(i8259A_auto_eoi); + restore_ELCR(irq_trigger); +} + +static int i8259A_suspend(void) +{ + save_ELCR(irq_trigger); + return 0; +} + +static void i8259A_shutdown(void) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ +} + +static struct syscore_ops i8259_syscore_ops = { + .suspend = i8259A_suspend, + .resume = i8259A_resume, + .shutdown = i8259A_shutdown, +}; + +static void mask_8259A(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void unmask_8259A(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static int probe_8259A(void) +{ + unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR); + unsigned long flags; + + /* + * If MADT has the PCAT_COMPAT flag set, then do not bother probing + * for the PIC. Some BIOSes leave the PIC uninitialized and probing + * fails. + * + * Right now this causes problems as quite some code depends on + * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly + * when the system has an IO/APIC because then PIC is not required + * at all, except for really old machines where the timer interrupt + * must be routed through the PIC. So just pretend that the PIC is + * there and let legacy_pic->init() initialize it for nothing. + * + * Alternatively this could just try to initialize the PIC and + * repeat the probe, but for cases where there is no PIC that's + * just pointless. + */ + if (pcat_compat) + return nr_legacy_irqs(); + + /* + * Check to see if we have a PIC. Mask all except the cascade and + * read back the value we just wrote. If we don't have a PIC, we + * will read 0xff as opposed to the value we wrote. + */ + raw_spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + outb(probe_val, PIC_MASTER_IMR); + new_val = inb(PIC_MASTER_IMR); + if (new_val != probe_val) { + printk(KERN_INFO "Using NULL legacy PIC\n"); + legacy_pic = &null_legacy_pic; + } + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return nr_legacy_irqs(); +} + +static void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + + /* + * outb_pic - this has to work on a wide range of PC hardware. + */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ + + /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */ + outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR); + + /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); + + if (auto_eoi) /* master does Auto EOI */ + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ + + /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */ + outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR); + /* 8259A-2 is a slave on master's IR2 */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); + /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); + + if (auto_eoi) + /* + * In AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.irq_mask_ack = disable_8259A_irq; + else + i8259A_chip.irq_mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +/* + * make i8259 a driver so that we can select pic functions at run time. the goal + * is to make x86 binary compatible among pc compatible and non-pc compatible + * platforms, such as x86 MID. + */ + +static void legacy_pic_noop(void) { }; +static void legacy_pic_uint_noop(unsigned int unused) { }; +static void legacy_pic_int_noop(int unused) { }; +static int legacy_pic_irq_pending_noop(unsigned int irq) +{ + return 0; +} +static int legacy_pic_probe(void) +{ + return 0; +} + +struct legacy_pic null_legacy_pic = { + .nr_legacy_irqs = 0, + .chip = &dummy_irq_chip, + .mask = legacy_pic_uint_noop, + .unmask = legacy_pic_uint_noop, + .mask_all = legacy_pic_noop, + .restore_mask = legacy_pic_noop, + .init = legacy_pic_int_noop, + .probe = legacy_pic_probe, + .irq_pending = legacy_pic_irq_pending_noop, + .make_irq = legacy_pic_uint_noop, +}; + +struct legacy_pic default_legacy_pic = { + .nr_legacy_irqs = NR_IRQS_LEGACY, + .chip = &i8259A_chip, + .mask = mask_8259A_irq, + .unmask = unmask_8259A_irq, + .mask_all = mask_8259A, + .restore_mask = unmask_8259A, + .init = init_8259A, + .probe = probe_8259A, + .irq_pending = i8259A_irq_pending, + .make_irq = make_8259A_irq, +}; + +struct legacy_pic *legacy_pic = &default_legacy_pic; +EXPORT_SYMBOL(legacy_pic); + +static int __init i8259A_init_ops(void) +{ + if (legacy_pic == &default_legacy_pic) + register_syscore_ops(&i8259_syscore_ops); + + return 0; +} +device_initcall(i8259A_init_ops); + +void __init legacy_pic_pcat_compat(void) +{ + pcat_compat = true; +} diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c new file mode 100644 index 000000000..ee1a283f8 --- /dev/null +++ b/arch/x86/kernel/idt.c @@ -0,0 +1,353 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Interrupt descriptor table related code + */ +#include + +#include +#include +#include +#include +#include +#include + +#define DPL0 0x0 +#define DPL3 0x3 + +#define DEFAULT_STACK 0 + +#define G(_vector, _addr, _ist, _type, _dpl, _segment) \ + { \ + .vector = _vector, \ + .bits.ist = _ist, \ + .bits.type = _type, \ + .bits.dpl = _dpl, \ + .bits.p = 1, \ + .addr = _addr, \ + .segment = _segment, \ + } + +/* Interrupt gate */ +#define INTG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* System interrupt gate */ +#define SYSG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) + +/* + * Interrupt gate with interrupt stack. The _ist index is the index in + * the tss.ist[] array, but for the descriptor it needs to start at 1. + */ +#define ISTG(_vector, _addr, _ist) \ + G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* Task gate */ +#define TSKG(_vector, _gdt) \ + G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) + +#define IDT_TABLE_SIZE (IDT_ENTRIES * sizeof(gate_desc)) + +static bool idt_setup_done __initdata; + +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initconst struct idt_data early_idts[] = { + INTG(X86_TRAP_DB, asm_exc_debug), + SYSG(X86_TRAP_BP, asm_exc_int3), + +#ifdef CONFIG_X86_32 + /* + * Not possible on 64-bit. See idt_setup_early_pf() for details. + */ + INTG(X86_TRAP_PF, asm_exc_page_fault), +#endif +}; + +/* + * The default IDT entries which are set up in trap_init() before + * cpu_init() is invoked. Interrupt stacks cannot be used at that point and + * the traps which use them are reinitialized with IST after cpu_init() has + * set up TSS. + */ +static const __initconst struct idt_data def_idts[] = { + INTG(X86_TRAP_DE, asm_exc_divide_error), + INTG(X86_TRAP_NMI, asm_exc_nmi), + INTG(X86_TRAP_BR, asm_exc_bounds), + INTG(X86_TRAP_UD, asm_exc_invalid_op), + INTG(X86_TRAP_NM, asm_exc_device_not_available), + INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun), + INTG(X86_TRAP_TS, asm_exc_invalid_tss), + INTG(X86_TRAP_NP, asm_exc_segment_not_present), + INTG(X86_TRAP_SS, asm_exc_stack_segment), + INTG(X86_TRAP_GP, asm_exc_general_protection), + INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug), + INTG(X86_TRAP_MF, asm_exc_coprocessor_error), + INTG(X86_TRAP_AC, asm_exc_alignment_check), + INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error), + +#ifdef CONFIG_X86_32 + TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), +#else + INTG(X86_TRAP_DF, asm_exc_double_fault), +#endif + INTG(X86_TRAP_DB, asm_exc_debug), + +#ifdef CONFIG_X86_MCE + INTG(X86_TRAP_MC, asm_exc_machine_check), +#endif + + SYSG(X86_TRAP_OF, asm_exc_overflow), +#if defined(CONFIG_IA32_EMULATION) + SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), +#elif defined(CONFIG_X86_32) + SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32), +#endif +}; + +/* + * The APIC and SMP idt entries + */ +static const __initconst struct idt_data apic_idts[] = { +#ifdef CONFIG_SMP + INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi), + INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function), + INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single), + INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup), + INTG(REBOOT_VECTOR, asm_sysvec_reboot), +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR + INTG(THERMAL_APIC_VECTOR, asm_sysvec_thermal), +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + INTG(THRESHOLD_APIC_VECTOR, asm_sysvec_threshold), +#endif + +#ifdef CONFIG_X86_MCE_AMD + INTG(DEFERRED_ERROR_VECTOR, asm_sysvec_deferred_error), +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), + INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), +# ifdef CONFIG_HAVE_KVM + INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi), + INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), + INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), +# endif +# ifdef CONFIG_IRQ_WORK + INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), +# endif + INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), + INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), +#endif +}; + +/* Must be page-aligned because the real IDT is used in the cpu entry area */ +static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; + +static struct desc_ptr idt_descr __ro_after_init = { + .size = IDT_TABLE_SIZE - 1, + .address = (unsigned long) idt_table, +}; + +void load_current_idt(void) +{ + lockdep_assert_irqs_disabled(); + load_idt(&idt_descr); +} + +#ifdef CONFIG_X86_F00F_BUG +bool idt_is_f00f_address(unsigned long address) +{ + return ((address - idt_descr.address) >> 3) == 6; +} +#endif + +static __init void +idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) +{ + gate_desc desc; + + for (; size > 0; t++, size--) { + idt_init_desc(&desc, t); + write_idt_entry(idt, t->vector, &desc); + if (sys) + set_bit(t->vector, system_vectors); + } +} + +static __init void set_intr_gate(unsigned int n, const void *addr) +{ + struct idt_data data; + + init_idt_data(&data, n, addr); + + idt_setup_from_table(idt_table, &data, 1, false); +} + +/** + * idt_setup_early_traps - Initialize the idt table with early traps + * + * On X8664 these traps do not use interrupt stacks as they can't work + * before cpu_init() is invoked and sets up TSS. The IST variants are + * installed after that. + */ +void __init idt_setup_early_traps(void) +{ + idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts), + true); + load_idt(&idt_descr); +} + +/** + * idt_setup_traps - Initialize the idt table with default traps + */ +void __init idt_setup_traps(void) +{ + idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true); +} + +#ifdef CONFIG_X86_64 +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initconst struct idt_data early_pf_idts[] = { + INTG(X86_TRAP_PF, asm_exc_page_fault), +}; + +/* + * The exceptions which use Interrupt stacks. They are setup after + * cpu_init() when the TSS has been initialized. + */ +static const __initconst struct idt_data ist_idts[] = { + ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), +#ifdef CONFIG_X86_MCE + ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), +#endif +#ifdef CONFIG_AMD_MEM_ENCRYPT + ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC), +#endif +}; + +/** + * idt_setup_early_pf - Initialize the idt table with early pagefault handler + * + * On X8664 this does not use interrupt stacks as they can't work before + * cpu_init() is invoked and sets up TSS. The IST variant is installed + * after that. + * + * Note, that X86_64 cannot install the real #PF handler in + * idt_setup_early_traps() because the memory intialization needs the #PF + * handler from the early_idt_handler_array to initialize the early page + * tables. + */ +void __init idt_setup_early_pf(void) +{ + idt_setup_from_table(idt_table, early_pf_idts, + ARRAY_SIZE(early_pf_idts), true); +} + +/** + * idt_setup_ist_traps - Initialize the idt table with traps using IST + */ +void __init idt_setup_ist_traps(void) +{ + idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true); +} +#endif + +static void __init idt_map_in_cea(void) +{ + /* + * Set the IDT descriptor to a fixed read-only location in the cpu + * entry area, so that the "sidt" instruction will not leak the + * location of the kernel, and to defend the IDT against arbitrary + * memory write vulnerabilities. + */ + cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), + PAGE_KERNEL_RO); + idt_descr.address = CPU_ENTRY_AREA_RO_IDT; +} + +/** + * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates + */ +void __init idt_setup_apic_and_irq_gates(void) +{ + int i = FIRST_EXTERNAL_VECTOR; + void *entry; + + idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true); + + for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) { + entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); + } + +#ifdef CONFIG_X86_LOCAL_APIC + for_each_clear_bit_from(i, system_vectors, NR_VECTORS) { + /* + * Don't set the non assigned system vectors in the + * system_vectors bitmap. Otherwise they show up in + * /proc/interrupts. + */ + entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR); + set_intr_gate(i, entry); + } +#endif + /* Map IDT into CPU entry area and reload it. */ + idt_map_in_cea(); + load_idt(&idt_descr); + + /* Make the IDT table read only */ + set_memory_ro((unsigned long)&idt_table, 1); + + idt_setup_done = true; +} + +/** + * idt_setup_early_handler - Initializes the idt table with early handlers + */ +void __init idt_setup_early_handler(void) +{ + int i; + + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) + set_intr_gate(i, early_idt_handler_array[i]); +#ifdef CONFIG_X86_32 + for ( ; i < NR_VECTORS; i++) + set_intr_gate(i, early_ignore_irq); +#endif + load_idt(&idt_descr); +} + +/** + * idt_invalidate - Invalidate interrupt descriptor table + * @addr: The virtual address of the 'invalid' IDT + */ +void idt_invalidate(void *addr) +{ + struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 }; + + load_idt(&idt); +} + +void __init alloc_intr_gate(unsigned int n, const void *addr) +{ + if (WARN_ON(n < FIRST_SYSTEM_VECTOR)) + return; + + if (WARN_ON(idt_setup_done)) + return; + + if (!WARN_ON(test_and_set_bit(n, system_vectors))) + set_intr_gate(n, addr); +} diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c new file mode 100644 index 000000000..bd218470d --- /dev/null +++ b/arch/x86/kernel/ima_arch.c @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (C) 2018 IBM Corporation + */ +#include +#include +#include + +extern struct boot_params boot_params; + +static enum efi_secureboot_mode get_sb_mode(void) +{ + efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; + efi_status_t status; + unsigned long size; + u8 secboot, setupmode; + + size = sizeof(secboot); + + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { + pr_info("ima: secureboot mode unknown, no efi\n"); + return efi_secureboot_mode_unknown; + } + + /* Get variable contents into buffer */ + status = efi.get_variable(L"SecureBoot", &efi_variable_guid, + NULL, &size, &secboot); + if (status == EFI_NOT_FOUND) { + pr_info("ima: secureboot mode disabled\n"); + return efi_secureboot_mode_disabled; + } + + if (status != EFI_SUCCESS) { + pr_info("ima: secureboot mode unknown\n"); + return efi_secureboot_mode_unknown; + } + + size = sizeof(setupmode); + status = efi.get_variable(L"SetupMode", &efi_variable_guid, + NULL, &size, &setupmode); + + if (status != EFI_SUCCESS) /* ignore unknown SetupMode */ + setupmode = 0; + + if (secboot == 0 || setupmode == 1) { + pr_info("ima: secureboot mode disabled\n"); + return efi_secureboot_mode_disabled; + } + + pr_info("ima: secureboot mode enabled\n"); + return efi_secureboot_mode_enabled; +} + +bool arch_ima_get_secureboot(void) +{ + static enum efi_secureboot_mode sb_mode; + static bool initialized; + + if (!initialized && efi_enabled(EFI_BOOT)) { + sb_mode = boot_params.secure_boot; + + if (sb_mode == efi_secureboot_mode_unset) + sb_mode = get_sb_mode(); + initialized = true; + } + + if (sb_mode == efi_secureboot_mode_enabled) + return true; + else + return false; +} + +/* secureboot arch rules */ +static const char * const sb_arch_rules[] = { +#if !IS_ENABLED(CONFIG_KEXEC_SIG) + "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig", +#endif /* CONFIG_KEXEC_SIG */ + "measure func=KEXEC_KERNEL_CHECK", +#if !IS_ENABLED(CONFIG_MODULE_SIG) + "appraise func=MODULE_CHECK appraise_type=imasig", +#endif + "measure func=MODULE_CHECK", + NULL +}; + +const char * const *arch_get_ima_policy(void) +{ + if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) { + if (IS_ENABLED(CONFIG_MODULE_SIG)) + set_module_sig_enforced(); + if (IS_ENABLED(CONFIG_KEXEC_SIG)) + set_kexec_sig_enforced(); + return sb_arch_rules; + } + return NULL; +} diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c new file mode 100644 index 000000000..fdb6506ce --- /dev/null +++ b/arch/x86/kernel/io_delay.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * I/O delay strategies for inb_p/outb_p + * + * Allow for a DMI based override of port 0x80, needed for certain HP laptops + * and possibly other systems. Also allow for the gradual elimination of + * outb_p/inb_p API uses. + */ +#include +#include +#include +#include +#include +#include + +#define IO_DELAY_TYPE_0X80 0 +#define IO_DELAY_TYPE_0XED 1 +#define IO_DELAY_TYPE_UDELAY 2 +#define IO_DELAY_TYPE_NONE 3 + +#if defined(CONFIG_IO_DELAY_0X80) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_0X80 +#elif defined(CONFIG_IO_DELAY_0XED) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_0XED +#elif defined(CONFIG_IO_DELAY_UDELAY) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_UDELAY +#elif defined(CONFIG_IO_DELAY_NONE) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_NONE +#endif + +int io_delay_type __read_mostly = DEFAULT_IO_DELAY_TYPE; + +static int __initdata io_delay_override; + +/* + * Paravirt wants native_io_delay to be a constant. + */ +void native_io_delay(void) +{ + switch (io_delay_type) { + default: + case IO_DELAY_TYPE_0X80: + asm volatile ("outb %al, $0x80"); + break; + case IO_DELAY_TYPE_0XED: + asm volatile ("outb %al, $0xed"); + break; + case IO_DELAY_TYPE_UDELAY: + /* + * 2 usecs is an upper-bound for the outb delay but + * note that udelay doesn't have the bus-level + * side-effects that outb does, nor does udelay() have + * precise timings during very early bootup (the delays + * are shorter until calibrated): + */ + udelay(2); + break; + case IO_DELAY_TYPE_NONE: + break; + } +} +EXPORT_SYMBOL(native_io_delay); + +static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) +{ + if (io_delay_type == IO_DELAY_TYPE_0X80) { + pr_notice("%s: using 0xed I/O delay port\n", id->ident); + io_delay_type = IO_DELAY_TYPE_0XED; + } + + return 0; +} + +/* + * Quirk table for systems that misbehave (lock up, etc.) if port + * 0x80 is used: + */ +static const struct dmi_system_id io_delay_0xed_port_dmi_table[] __initconst = { + { + .callback = dmi_io_delay_0xed_port, + .ident = "Compaq Presario V6000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B7") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv9000z", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B9") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv6000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B8") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion tx1000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30BF") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "Presario F700", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30D3") + } + }, + { } +}; + +void __init io_delay_init(void) +{ + if (!io_delay_override) + dmi_check_system(io_delay_0xed_port_dmi_table); +} + +static int __init io_delay_param(char *s) +{ + if (!s) + return -EINVAL; + + if (!strcmp(s, "0x80")) + io_delay_type = IO_DELAY_TYPE_0X80; + else if (!strcmp(s, "0xed")) + io_delay_type = IO_DELAY_TYPE_0XED; + else if (!strcmp(s, "udelay")) + io_delay_type = IO_DELAY_TYPE_UDELAY; + else if (!strcmp(s, "none")) + io_delay_type = IO_DELAY_TYPE_NONE; + else + return -EINVAL; + + io_delay_override = 1; + return 0; +} + +early_param("io_delay", io_delay_param); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c new file mode 100644 index 000000000..e2fab3ceb --- /dev/null +++ b/arch/x86/kernel/ioport.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. 32/64 bits code unification by Miguel Botón. + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_X86_IOPL_IOPERM + +static atomic64_t io_bitmap_sequence; + +void io_bitmap_share(struct task_struct *tsk) +{ + /* Can be NULL when current->thread.iopl_emul == 3 */ + if (current->thread.io_bitmap) { + /* + * Take a refcount on current's bitmap. It can be used by + * both tasks as long as none of them changes the bitmap. + */ + refcount_inc(¤t->thread.io_bitmap->refcnt); + tsk->thread.io_bitmap = current->thread.io_bitmap; + } + set_tsk_thread_flag(tsk, TIF_IO_BITMAP); +} + +static void task_update_io_bitmap(struct task_struct *tsk) +{ + struct thread_struct *t = &tsk->thread; + + if (t->iopl_emul == 3 || t->io_bitmap) { + /* TSS update is handled on exit to user space */ + set_tsk_thread_flag(tsk, TIF_IO_BITMAP); + } else { + clear_tsk_thread_flag(tsk, TIF_IO_BITMAP); + /* Invalidate TSS */ + preempt_disable(); + tss_update_io_bitmap(); + preempt_enable(); + } +} + +void io_bitmap_exit(struct task_struct *tsk) +{ + struct io_bitmap *iobm = tsk->thread.io_bitmap; + + tsk->thread.io_bitmap = NULL; + task_update_io_bitmap(tsk); + if (iobm && refcount_dec_and_test(&iobm->refcnt)) + kfree(iobm); +} + +/* + * This changes the io permissions bitmap in the current task. + */ +long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + struct thread_struct *t = ¤t->thread; + unsigned int i, max_long; + struct io_bitmap *iobm; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_IOPORT))) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + iobm = t->io_bitmap; + if (!iobm) { + /* No point to allocate a bitmap just to clear permissions */ + if (!turn_on) + return 0; + iobm = kmalloc(sizeof(*iobm), GFP_KERNEL); + if (!iobm) + return -ENOMEM; + + memset(iobm->bitmap, 0xff, sizeof(iobm->bitmap)); + refcount_set(&iobm->refcnt, 1); + } + + /* + * If the bitmap is not shared, then nothing can take a refcount as + * current can obviously not fork at the same time. If it's shared + * duplicate it and drop the refcount on the original one. + */ + if (refcount_read(&iobm->refcnt) > 1) { + iobm = kmemdup(iobm, sizeof(*iobm), GFP_KERNEL); + if (!iobm) + return -ENOMEM; + refcount_set(&iobm->refcnt, 1); + io_bitmap_exit(current); + } + + /* + * Store the bitmap pointer (might be the same if the task already + * head one). Must be done here so freeing the bitmap when all + * permissions are dropped has the pointer set up. + */ + t->io_bitmap = iobm; + /* Mark it active for context switching and exit to user mode */ + set_thread_flag(TIF_IO_BITMAP); + + /* + * Update the tasks bitmap. The update of the TSS bitmap happens on + * exit to user mode. So this needs no protection. + */ + if (turn_on) + bitmap_clear(iobm->bitmap, from, num); + else + bitmap_set(iobm->bitmap, from, num); + + /* + * Search for a (possibly new) maximum. This is simple and stupid, + * to keep it obviously correct: + */ + max_long = UINT_MAX; + for (i = 0; i < IO_BITMAP_LONGS; i++) { + if (iobm->bitmap[i] != ~0UL) + max_long = i; + } + /* All permissions dropped? */ + if (max_long == UINT_MAX) { + io_bitmap_exit(current); + return 0; + } + + iobm->max = (max_long + 1) * sizeof(unsigned long); + + /* + * Update the sequence number to force a TSS update on return to + * user mode. + */ + iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence); + + return 0; +} + +SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) +{ + return ksys_ioperm(from, num, turn_on); +} + +/* + * The sys_iopl functionality depends on the level argument, which if + * granted for the task is used to enable access to all 65536 I/O ports. + * + * This does not use the IOPL mechanism provided by the CPU as that would + * also allow the user space task to use the CLI/STI instructions. + * + * Disabling interrupts in a user space task is dangerous as it might lock + * up the machine and the semantics vs. syscalls and exceptions is + * undefined. + * + * Setting IOPL to level 0-2 is disabling I/O permissions. Level 3 + * 3 enables them. + * + * IOPL is strictly per thread and inherited on fork. + */ +SYSCALL_DEFINE1(iopl, unsigned int, level) +{ + struct thread_struct *t = ¤t->thread; + unsigned int old; + + if (level > 3) + return -EINVAL; + + old = t->iopl_emul; + + /* No point in going further if nothing changes */ + if (level == old) + return 0; + + /* Trying to gain more privileges? */ + if (level > old) { + if (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_IOPORT)) + return -EPERM; + } + + t->iopl_emul = level; + task_update_io_bitmap(current); + + return 0; +} + +#else /* CONFIG_X86_IOPL_IOPERM */ + +long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + return -ENOSYS; +} +SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) +{ + return -ENOSYS; +} + +SYSCALL_DEFINE1(iopl, unsigned int, level) +{ + return -ENOSYS; +} +#endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c new file mode 100644 index 000000000..ce904c89c --- /dev/null +++ b/arch/x86/kernel/irq.c @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Common interrupt code for 32 and 64 bit + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + +atomic_t irq_err_count; + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + if (printk_ratelimit()) + pr_err("unexpected IRQ trap at vector %02x\n", irq); + + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + ack_APIC_irq(); +} + +#define irq_stats(x) (&per_cpu(irq_stat, x)) +/* + * /proc/interrupts printing for arch specific interrupts + */ +int arch_show_interrupts(struct seq_file *p, int prec) +{ + int j; + + seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); + seq_puts(p, " Non-maskable interrupts\n"); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "%*s: ", prec, "LOC"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); + seq_puts(p, " Local timer interrupts\n"); + + seq_printf(p, "%*s: ", prec, "SPU"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); + seq_puts(p, " Spurious interrupts\n"); + seq_printf(p, "%*s: ", prec, "PMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_puts(p, " Performance monitoring interrupts\n"); + seq_printf(p, "%*s: ", prec, "IWI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); + seq_puts(p, " IRQ work interrupts\n"); + seq_printf(p, "%*s: ", prec, "RTR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); + seq_puts(p, " APIC ICR read retries\n"); + if (x86_platform_ipi_callback) { + seq_printf(p, "%*s: ", prec, "PLT"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); + seq_puts(p, " Platform interrupts\n"); + } +#endif +#ifdef CONFIG_SMP + seq_printf(p, "%*s: ", prec, "RES"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); + seq_puts(p, " Rescheduling interrupts\n"); + seq_printf(p, "%*s: ", prec, "CAL"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); + seq_puts(p, " Function call interrupts\n"); + seq_printf(p, "%*s: ", prec, "TLB"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); + seq_puts(p, " TLB shootdowns\n"); +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR + seq_printf(p, "%*s: ", prec, "TRM"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); + seq_puts(p, " Thermal event interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + seq_printf(p, "%*s: ", prec, "THR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); + seq_puts(p, " Threshold APIC interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE_AMD + seq_printf(p, "%*s: ", prec, "DFR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count); + seq_puts(p, " Deferred Error APIC interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "%*s: ", prec, "MCE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); + seq_puts(p, " Machine check exceptions\n"); + seq_printf(p, "%*s: ", prec, "MCP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); + seq_puts(p, " Machine check polls\n"); +#endif +#ifdef CONFIG_X86_HV_CALLBACK_VECTOR + if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) { + seq_printf(p, "%*s: ", prec, "HYP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->irq_hv_callback_count); + seq_puts(p, " Hypervisor callback interrupts\n"); + } +#endif +#if IS_ENABLED(CONFIG_HYPERV) + if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) { + seq_printf(p, "%*s: ", prec, "HRE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->irq_hv_reenlightenment_count); + seq_puts(p, " Hyper-V reenlightenment interrupts\n"); + } + if (test_bit(HYPERV_STIMER0_VECTOR, system_vectors)) { + seq_printf(p, "%*s: ", prec, "HVS"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->hyperv_stimer0_count); + seq_puts(p, " Hyper-V stimer0 interrupts\n"); + } +#endif + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); +#endif +#ifdef CONFIG_HAVE_KVM + seq_printf(p, "%*s: ", prec, "PIN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); + seq_puts(p, " Posted-interrupt notification event\n"); + + seq_printf(p, "%*s: ", prec, "NPI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->kvm_posted_intr_nested_ipis); + seq_puts(p, " Nested posted-interrupt event\n"); + + seq_printf(p, "%*s: ", prec, "PIW"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->kvm_posted_intr_wakeup_ipis); + seq_puts(p, " Posted-interrupt wakeup event\n"); +#endif + return 0; +} + +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = irq_stats(cpu)->__nmi_count; + +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->apic_timer_irqs; + sum += irq_stats(cpu)->irq_spurious_count; + sum += irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_irq_work_irqs; + sum += irq_stats(cpu)->icr_read_retry_count; + if (x86_platform_ipi_callback) + sum += irq_stats(cpu)->x86_platform_ipis; +#endif +#ifdef CONFIG_SMP + sum += irq_stats(cpu)->irq_resched_count; + sum += irq_stats(cpu)->irq_call_count; +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR + sum += irq_stats(cpu)->irq_thermal_count; +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + sum += irq_stats(cpu)->irq_threshold_count; +#endif +#ifdef CONFIG_X86_MCE + sum += per_cpu(mce_exception_count, cpu); + sum += per_cpu(mce_poll_count, cpu); +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ + u64 sum = atomic_read(&irq_err_count); + return sum; +} + +static __always_inline void handle_irq(struct irq_desc *desc, + struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_X86_64)) + run_irq_on_irqstack_cond(desc->handle_irq, desc, regs); + else + __handle_irq(desc, regs); +} + +/* + * common_interrupt() handles all normal device IRQ's (the special SMP + * cross-CPU interrupts have their own entry points). + */ +DEFINE_IDTENTRY_IRQ(common_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + struct irq_desc *desc; + + /* entry code tells RCU that we're not quiescent. Check it. */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); + + desc = __this_cpu_read(vector_irq[vector]); + if (likely(!IS_ERR_OR_NULL(desc))) { + handle_irq(desc, regs); + } else { + ack_APIC_irq(); + + if (desc == VECTOR_UNUSED) { + pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", + __func__, smp_processor_id(), + vector); + } else { + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); + } + } + + set_irq_regs(old_regs); +} + +#ifdef CONFIG_X86_LOCAL_APIC +/* Function pointer for generic interrupt vector handling */ +void (*x86_platform_ipi_callback)(void) = NULL; +/* + * Handler for X86_PLATFORM_IPI_VECTOR. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); + inc_irq_stat(x86_platform_ipis); + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); + trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); + set_irq_regs(old_regs); +} +#endif + +#ifdef CONFIG_HAVE_KVM +static void dummy_handler(void) {} +static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; + +void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) +{ + if (handler) + kvm_posted_intr_wakeup_handler = handler; + else { + kvm_posted_intr_wakeup_handler = dummy_handler; + synchronize_rcu(); + } +} +EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); + +/* + * Handler for POSTED_INTERRUPT_VECTOR. + */ +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi) +{ + ack_APIC_irq(); + inc_irq_stat(kvm_posted_intr_ipis); +} + +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi) +{ + ack_APIC_irq(); + inc_irq_stat(kvm_posted_intr_wakeup_ipis); + kvm_posted_intr_wakeup_handler(); +} + +/* + * Handler for POSTED_INTERRUPT_NESTED_VECTOR. + */ +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) +{ + ack_APIC_irq(); + inc_irq_stat(kvm_posted_intr_nested_ipis); +} +#endif + + +#ifdef CONFIG_HOTPLUG_CPU +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) +{ + unsigned int irr, vector; + struct irq_desc *desc; + struct irq_data *data; + struct irq_chip *chip; + + irq_migrate_all_off_this_cpu(); + + /* + * We can remove mdelay() and then send spuriuous interrupts to + * new cpu targets for all the irqs that were handled previously by + * this cpu. While it works, I have seen spurious interrupt messages + * (nothing wrong but still...). + * + * So for now, retain mdelay(1) and check the IRR and then send those + * interrupts to new targets as this cpu is already offlined... + */ + mdelay(1); + + /* + * We can walk the vector array of this cpu without holding + * vector_lock because the cpu is already marked !online, so + * nothing else will touch it. + */ + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) + continue; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1 << (vector % 32))) { + desc = __this_cpu_read(vector_irq[vector]); + + raw_spin_lock(&desc->lock); + data = irq_desc_get_irq_data(desc); + chip = irq_data_get_irq_chip(data); + if (chip->irq_retrigger) { + chip->irq_retrigger(data); + __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED); + } + raw_spin_unlock(&desc->lock); + } + if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED) + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); + } +} +#endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c new file mode 100644 index 000000000..0b79efc87 --- /dev/null +++ b/arch/x86/kernel/irq_32.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86-specific interrupt + * entry, irq-stacks and irq statistics code. All the remaining + * irq logic is done by the generic kernel/irq/ code and + * by the x86-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef CONFIG_DEBUG_STACKOVERFLOW + +int sysctl_panic_on_stackoverflow __read_mostly; + +/* Debugging check for stack overflow: is there less than 1KB free? */ +static int check_stack_overflow(void) +{ + long sp; + + __asm__ __volatile__("andl %%esp,%0" : + "=r" (sp) : "0" (THREAD_SIZE - 1)); + + return sp < (sizeof(struct thread_info) + STACK_WARN); +} + +static void print_stack_overflow(void) +{ + printk(KERN_WARNING "low stack detected by irq handler\n"); + dump_stack(); + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check messages\n"); +} + +#else +static inline int check_stack_overflow(void) { return 0; } +static inline void print_stack_overflow(void) { } +#endif + +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); +DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr); + +static void call_on_stack(void *func, void *stack) +{ + asm volatile("xchgl %%ebx,%%esp \n" + CALL_NOSPEC + "movl %%ebx,%%esp \n" + : "=b" (stack) + : "0" (stack), + [thunk_target] "D"(func) + : "memory", "cc", "edx", "ecx", "eax"); +} + +static inline void *current_stack(void) +{ + return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); +} + +static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) +{ + struct irq_stack *curstk, *irqstk; + u32 *isp, *prev_esp, arg1; + + curstk = (struct irq_stack *) current_stack(); + irqstk = __this_cpu_read(hardirq_stack_ptr); + + /* + * this is where we switch to the IRQ stack. However, if we are + * already using the IRQ stack (because we interrupted a hardirq + * handler) we can't do that and just have to keep using the + * current stack (which is the irq stack already after all) + */ + if (unlikely(curstk == irqstk)) + return 0; + + isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); + + /* Save the next esp at the bottom of the stack */ + prev_esp = (u32 *)irqstk; + *prev_esp = current_stack_pointer; + + if (unlikely(overflow)) + call_on_stack(print_stack_overflow, isp); + + asm volatile("xchgl %%ebx,%%esp \n" + CALL_NOSPEC + "movl %%ebx,%%esp \n" + : "=a" (arg1), "=b" (isp) + : "0" (desc), "1" (isp), + [thunk_target] "D" (desc->handle_irq) + : "memory", "cc", "ecx"); + return 1; +} + +/* + * Allocate per-cpu stacks for hardirq and softirq processing + */ +int irq_init_percpu_irqstack(unsigned int cpu) +{ + int node = cpu_to_node(cpu); + struct page *ph, *ps; + + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; + + ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ph) + return -ENOMEM; + ps = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ps) { + __free_pages(ph, THREAD_SIZE_ORDER); + return -ENOMEM; + } + + per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(softirq_stack_ptr, cpu) = page_address(ps); + return 0; +} + +void do_softirq_own_stack(void) +{ + struct irq_stack *irqstk; + u32 *isp, *prev_esp; + + irqstk = __this_cpu_read(softirq_stack_ptr); + + /* build the stack frame on the softirq stack */ + isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); + + /* Push the previous esp onto the stack */ + prev_esp = (u32 *)irqstk; + *prev_esp = current_stack_pointer; + + call_on_stack(__do_softirq, isp); +} + +void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) +{ + int overflow = check_stack_overflow(); + + if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) { + if (unlikely(overflow)) + print_stack_overflow(); + generic_handle_irq_desc(desc); + } +} diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c new file mode 100644 index 000000000..440eed558 --- /dev/null +++ b/arch/x86/kernel/irq_64.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86_64-specific interrupt + * entry and irq statistics code. All the remaining irq logic is + * done by the generic kernel/irq/ code and in the + * x86_64-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; +DECLARE_INIT_PER_CPU(irq_stack_backing_store); + +#ifdef CONFIG_VMAP_STACK +/* + * VMAP the backing store with guard pages + */ +static int map_irq_stack(unsigned int cpu) +{ + char *stack = (char *)per_cpu_ptr(&irq_stack_backing_store, cpu); + struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE]; + void *va; + int i; + + for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) { + phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT)); + + pages[i] = pfn_to_page(pa >> PAGE_SHIFT); + } + + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL); + if (!va) + return -ENOMEM; + + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; +} +#else +/* + * If VMAP stacks are disabled due to KASAN, just use the per cpu + * backing store without guard pages. + */ +static int map_irq_stack(unsigned int cpu) +{ + void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); + + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; +} +#endif + +int irq_init_percpu_irqstack(unsigned int cpu) +{ + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; + return map_irq_stack(cpu); +} + +void do_softirq_own_stack(void) +{ + run_on_irqstack_cond(__do_softirq, NULL); +} diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c new file mode 100644 index 000000000..890d4778c --- /dev/null +++ b/arch/x86/kernel/irq_work.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86 specific code for irq_work + * + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work) +{ + ack_APIC_irq(); + trace_irq_work_entry(IRQ_WORK_VECTOR); + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); + trace_irq_work_exit(IRQ_WORK_VECTOR); +} + +void arch_irq_work_raise(void) +{ + if (!arch_irq_work_has_interrupt()) + return; + + apic->send_IPI_self(IRQ_WORK_VECTOR); + apic_wait_icr_idle(); +} +#endif diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S new file mode 100644 index 000000000..a9c36400b --- /dev/null +++ b/arch/x86/kernel/irqflags.S @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include + +/* + * unsigned long native_save_fl(void) + */ +SYM_FUNC_START(native_save_fl) + pushf + pop %_ASM_AX + RET +SYM_FUNC_END(native_save_fl) +EXPORT_SYMBOL(native_save_fl) + +/* + * void native_restore_fl(unsigned long flags) + * %eax/%rdi: flags + */ +SYM_FUNC_START(native_restore_fl) + push %_ASM_ARG1 + popf + RET +SYM_FUNC_END(native_restore_fl) +EXPORT_SYMBOL(native_restore_fl) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c new file mode 100644 index 000000000..c68366687 --- /dev/null +++ b/arch/x86/kernel/irqinit.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, +}; + +void __init init_ISA_irqs(void) +{ + struct irq_chip *chip = legacy_pic->chip; + int i; + + /* + * Try to set up the through-local-APIC virtual wire mode earlier. + * + * On some 32-bit UP machines, whose APIC has been disabled by BIOS + * and then got re-enabled by "lapic", it hangs at boot time without this. + */ + init_bsp_APIC(); + + legacy_pic->init(0); + + for (i = 0; i < nr_legacy_irqs(); i++) { + irq_set_chip_and_handler(i, chip, handle_level_irq); + irq_set_status_flags(i, IRQ_LEVEL); + } +} + +void __init init_IRQ(void) +{ + int i; + + /* + * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15. + * If these IRQ's are handled by legacy interrupt-controllers like PIC, + * then this configuration will likely be static after the boot. If + * these IRQs are handled by more modern controllers like IO-APIC, + * then this vector space can be freed and re-used dynamically as the + * irq's migrate etc. + */ + for (i = 0; i < nr_legacy_irqs(); i++) + per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i); + + BUG_ON(irq_init_percpu_irqstack(smp_processor_id())); + + x86_init.irqs.intr_init(); +} + +void __init native_init_IRQ(void) +{ + /* Execute any quirks before the call gates are initialised: */ + x86_init.irqs.pre_vector_init(); + + idt_setup_apic_and_irq_gates(); + lapic_assign_system_vectors(); + + if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) { + /* IRQ2 is cascade interrupt to second interrupt controller */ + if (request_irq(2, no_action, IRQF_NO_THREAD, "cascade", NULL)) + pr_err("%s: request_irq() failed\n", "cascade"); + } +} diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c new file mode 100644 index 000000000..1afbdd1dd --- /dev/null +++ b/arch/x86/kernel/itmt.c @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * itmt.c: Support Intel Turbo Boost Max Technology 3.0 + * + * (C) Copyright 2016 Intel Corporation + * Author: Tim Chen + * + * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT), + * the maximum turbo frequencies of some cores in a CPU package may be + * higher than for the other cores in the same package. In that case, + * better performance can be achieved by making the scheduler prefer + * to run tasks on the CPUs with higher max turbo frequencies. + * + * This file provides functions and data structures for enabling the + * scheduler to favor scheduling on cores can be boosted to a higher + * frequency under ITMT. + */ + +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(itmt_update_mutex); +DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority); + +/* Boolean to track if system has ITMT capabilities */ +static bool __read_mostly sched_itmt_capable; + +/* + * Boolean to control whether we want to move processes to cpu capable + * of higher turbo frequency for cpus supporting Intel Turbo Boost Max + * Technology 3.0. + * + * It can be set via /proc/sys/kernel/sched_itmt_enabled + */ +unsigned int __read_mostly sysctl_sched_itmt_enabled; + +static int sched_itmt_update_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int old_sysctl; + int ret; + + mutex_lock(&itmt_update_mutex); + + if (!sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return -EINVAL; + } + + old_sysctl = sysctl_sched_itmt_enabled; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) { + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); + + return ret; +} + +static struct ctl_table itmt_kern_table[] = { + { + .procname = "sched_itmt_enabled", + .data = &sysctl_sched_itmt_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_itmt_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static struct ctl_table itmt_root_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = itmt_kern_table, + }, + {} +}; + +static struct ctl_table_header *itmt_sysctl_header; + +/** + * sched_set_itmt_support() - Indicate platform supports ITMT + * + * This function is used by the OS to indicate to scheduler that the platform + * is capable of supporting the ITMT feature. + * + * The current scheme has the pstate driver detects if the system + * is ITMT capable and call sched_set_itmt_support. + * + * This must be done only after sched_set_itmt_core_prio + * has been called to set the cpus' priorities. + * It must not be called with cpu hot plug lock + * held as we need to acquire the lock to rebuild sched domains + * later. + * + * Return: 0 on success + */ +int sched_set_itmt_support(void) +{ + mutex_lock(&itmt_update_mutex); + + if (sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return 0; + } + + itmt_sysctl_header = register_sysctl_table(itmt_root_table); + if (!itmt_sysctl_header) { + mutex_unlock(&itmt_update_mutex); + return -ENOMEM; + } + + sched_itmt_capable = true; + + sysctl_sched_itmt_enabled = 1; + + x86_topology_update = true; + rebuild_sched_domains(); + + mutex_unlock(&itmt_update_mutex); + + return 0; +} + +/** + * sched_clear_itmt_support() - Revoke platform's support of ITMT + * + * This function is used by the OS to indicate that it has + * revoked the platform's support of ITMT feature. + * + * It must not be called with cpu hot plug lock + * held as we need to acquire the lock to rebuild sched domains + * later. + */ +void sched_clear_itmt_support(void) +{ + mutex_lock(&itmt_update_mutex); + + if (!sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return; + } + sched_itmt_capable = false; + + if (itmt_sysctl_header) { + unregister_sysctl_table(itmt_sysctl_header); + itmt_sysctl_header = NULL; + } + + if (sysctl_sched_itmt_enabled) { + /* disable sched_itmt if we are no longer ITMT capable */ + sysctl_sched_itmt_enabled = 0; + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); +} + +int arch_asym_cpu_priority(int cpu) +{ + return per_cpu(sched_core_priority, cpu); +} + +/** + * sched_set_itmt_core_prio() - Set CPU priority based on ITMT + * @prio: Priority of cpu core + * @core_cpu: The cpu number associated with the core + * + * The pstate driver will find out the max boost frequency + * and call this function to set a priority proportional + * to the max boost frequency. CPU with higher boost + * frequency will receive higher priority. + * + * No need to rebuild sched domain after updating + * the CPU priorities. The sched domains have no + * dependency on CPU priorities. + */ +void sched_set_itmt_core_prio(int prio, int core_cpu) +{ + int cpu, i = 1; + + for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { + int smt_prio; + + /* + * Ensure that the siblings are moved to the end + * of the priority chain and only used when + * all other high priority cpus are out of capacity. + */ + smt_prio = prio * smp_num_siblings / i; + per_cpu(sched_core_priority, cpu) = smt_prio; + i++; + } +} diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c new file mode 100644 index 000000000..4eb8f2d19 --- /dev/null +++ b/arch/x86/kernel/jailhouse.c @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Jailhouse paravirt_ops implementation + * + * Copyright (c) Siemens AG, 2015-2017 + * + * Authors: + * Jan Kiszka + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct jailhouse_setup_data setup_data; +#define SETUP_DATA_V1_LEN (sizeof(setup_data.hdr) + sizeof(setup_data.v1)) +#define SETUP_DATA_V2_LEN (SETUP_DATA_V1_LEN + sizeof(setup_data.v2)) + +static unsigned int precalibrated_tsc_khz; + +static void jailhouse_setup_irq(unsigned int irq) +{ + struct mpc_intsrc mp_irq = { + .type = MP_INTSRC, + .irqtype = mp_INT, + .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE, + .srcbusirq = irq, + .dstirq = irq, + }; + mp_save_irq(&mp_irq); +} + +static uint32_t jailhouse_cpuid_base(void) +{ + if (boot_cpu_data.cpuid_level < 0 || + !boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return 0; + + return hypervisor_cpuid_base("Jailhouse\0\0\0", 0); +} + +static uint32_t __init jailhouse_detect(void) +{ + return jailhouse_cpuid_base(); +} + +static void jailhouse_get_wallclock(struct timespec64 *now) +{ + memset(now, 0, sizeof(*now)); +} + +static void __init jailhouse_timer_init(void) +{ + lapic_timer_period = setup_data.v1.apic_khz * (1000 / HZ); +} + +static unsigned long jailhouse_get_tsc(void) +{ + return precalibrated_tsc_khz; +} + +static void __init jailhouse_x2apic_init(void) +{ +#ifdef CONFIG_X86_X2APIC + if (!x2apic_enabled()) + return; + /* + * We do not have access to IR inside Jailhouse non-root cells. So + * we have to run in physical mode. + */ + x2apic_phys = 1; + /* + * This will trigger the switch to apic_x2apic_phys. Empty OEM IDs + * ensure that only this APIC driver picks up the call. + */ + default_acpi_madt_oem_check("", ""); +#endif +} + +static void __init jailhouse_get_smp_config(unsigned int early) +{ + struct ioapic_domain_cfg ioapic_cfg = { + .type = IOAPIC_DOMAIN_STRICT, + .ops = &mp_ioapic_irqdomain_ops, + }; + unsigned int cpu; + + jailhouse_x2apic_init(); + + register_lapic_address(0xfee00000); + + for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) { + generic_processor_info(setup_data.v1.cpu_ids[cpu], + boot_cpu_apic_version); + } + + smp_found_config = 1; + + if (setup_data.v1.standard_ioapic) { + mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg); + + if (IS_ENABLED(CONFIG_SERIAL_8250) && + setup_data.hdr.version < 2) { + /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */ + jailhouse_setup_irq(3); + jailhouse_setup_irq(4); + } + } +} + +static void jailhouse_no_restart(void) +{ + pr_notice("Jailhouse: Restart not supported, halting\n"); + machine_halt(); +} + +static int __init jailhouse_pci_arch_init(void) +{ + pci_direct_init(1); + + /* + * There are no bridges on the virtual PCI root bus under Jailhouse, + * thus no other way to discover all devices than a full scan. + * Respect any overrides via the command line, though. + */ + if (pcibios_last_bus < 0) + pcibios_last_bus = 0xff; + +#ifdef CONFIG_PCI_MMCONFIG + if (setup_data.v1.pci_mmconfig_base) { + pci_mmconfig_add(0, 0, pcibios_last_bus, + setup_data.v1.pci_mmconfig_base); + pci_mmcfg_arch_init(); + } +#endif + + return 0; +} + +#ifdef CONFIG_SERIAL_8250 +static inline bool jailhouse_uart_enabled(unsigned int uart_nr) +{ + return setup_data.v2.flags & BIT(uart_nr); +} + +static void jailhouse_serial_fixup(int port, struct uart_port *up, + u32 *capabilities) +{ + static const u16 pcuart_base[] = {0x3f8, 0x2f8, 0x3e8, 0x2e8}; + unsigned int n; + + for (n = 0; n < ARRAY_SIZE(pcuart_base); n++) { + if (pcuart_base[n] != up->iobase) + continue; + + if (jailhouse_uart_enabled(n)) { + pr_info("Enabling UART%u (port 0x%lx)\n", n, + up->iobase); + jailhouse_setup_irq(up->irq); + } else { + /* Deactivate UART if access isn't allowed */ + up->iobase = 0; + } + break; + } +} + +static void __init jailhouse_serial_workaround(void) +{ + /* + * There are flags inside setup_data that indicate availability of + * platform UARTs since setup data version 2. + * + * In case of version 1, we don't know which UARTs belong Linux. In + * this case, unconditionally register 1:1 mapping for legacy UART IRQs + * 3 and 4. + */ + if (setup_data.hdr.version > 1) + serial8250_set_isa_configurator(jailhouse_serial_fixup); +} +#else /* !CONFIG_SERIAL_8250 */ +static inline void jailhouse_serial_workaround(void) +{ +} +#endif /* CONFIG_SERIAL_8250 */ + +static void __init jailhouse_init_platform(void) +{ + u64 pa_data = boot_params.hdr.setup_data; + unsigned long setup_data_len; + struct setup_data header; + void *mapping; + + x86_init.irqs.pre_vector_init = x86_init_noop; + x86_init.timers.timer_init = jailhouse_timer_init; + x86_init.mpparse.get_smp_config = jailhouse_get_smp_config; + x86_init.pci.arch_init = jailhouse_pci_arch_init; + + x86_platform.calibrate_cpu = jailhouse_get_tsc; + x86_platform.calibrate_tsc = jailhouse_get_tsc; + x86_platform.get_wallclock = jailhouse_get_wallclock; + x86_platform.legacy.rtc = 0; + x86_platform.legacy.warm_reset = 0; + x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; + + legacy_pic = &null_legacy_pic; + + machine_ops.emergency_restart = jailhouse_no_restart; + + while (pa_data) { + mapping = early_memremap(pa_data, sizeof(header)); + memcpy(&header, mapping, sizeof(header)); + early_memunmap(mapping, sizeof(header)); + + if (header.type == SETUP_JAILHOUSE) + break; + + pa_data = header.next; + } + + if (!pa_data) + panic("Jailhouse: No valid setup data found"); + + /* setup data must at least contain the header */ + if (header.len < sizeof(setup_data.hdr)) + goto unsupported; + + pa_data += offsetof(struct setup_data, data); + setup_data_len = min_t(unsigned long, sizeof(setup_data), + (unsigned long)header.len); + mapping = early_memremap(pa_data, setup_data_len); + memcpy(&setup_data, mapping, setup_data_len); + early_memunmap(mapping, setup_data_len); + + if (setup_data.hdr.version == 0 || + setup_data.hdr.compatible_version != + JAILHOUSE_SETUP_REQUIRED_VERSION || + (setup_data.hdr.version == 1 && header.len < SETUP_DATA_V1_LEN) || + (setup_data.hdr.version >= 2 && header.len < SETUP_DATA_V2_LEN)) + goto unsupported; + + pmtmr_ioport = setup_data.v1.pm_timer_address; + pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport); + + precalibrated_tsc_khz = setup_data.v1.tsc_khz; + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + pci_probe = 0; + + /* + * Avoid that the kernel complains about missing ACPI tables - there + * are none in a non-root cell. + */ + disable_acpi(); + + jailhouse_serial_workaround(); + return; + +unsupported: + panic("Jailhouse: Unsupported setup data structure"); +} + +bool jailhouse_paravirt(void) +{ + return jailhouse_cpuid_base() != 0; +} + +static bool __init jailhouse_x2apic_available(void) +{ + /* + * The x2APIC is only available if the root cell enabled it. Jailhouse + * does not support switching between xAPIC and x2APIC. + */ + return x2apic_enabled(); +} + +const struct hypervisor_x86 x86_hyper_jailhouse __refconst = { + .name = "Jailhouse", + .detect = jailhouse_detect, + .init.init_platform = jailhouse_init_platform, + .init.x2apic_available = jailhouse_x2apic_available, + .ignore_nopv = true, +}; diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c new file mode 100644 index 000000000..5ba8477c2 --- /dev/null +++ b/arch/x86/kernel/jump_label.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * jump label x86 support + * + * Copyright (C) 2009 Jason Baron + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void bug_at(const void *ip, int line) +{ + /* + * The location is not an op that we were expecting. + * Something went wrong. Crash the box, as something could be + * corrupting the kernel. + */ + pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line); + BUG(); +} + +static const void * +__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init) +{ + const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; + const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + const void *expect, *code; + const void *addr, *dest; + int line; + + addr = (void *)jump_entry_code(entry); + dest = (void *)jump_entry_target(entry); + + code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); + + if (init) { + expect = default_nop; line = __LINE__; + } else if (type == JUMP_LABEL_JMP) { + expect = ideal_nop; line = __LINE__; + } else { + expect = code; line = __LINE__; + } + + if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE)) + bug_at(addr, line); + + if (type == JUMP_LABEL_NOP) + code = ideal_nop; + + return code; +} + +static inline void __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) +{ + const void *opcode = __jump_label_set_jump_code(entry, type, init); + + /* + * As long as only a single processor is running and the code is still + * not marked as RO, text_poke_early() can be used; Checking that + * system_state is SYSTEM_BOOTING guarantees it. It will be set to + * SYSTEM_SCHEDULING before other cores are awaken and before the + * code is write-protected. + * + * At the time the change is being done, just ignore whether we + * are doing nop -> jump or jump -> nop transition, and assume + * always nop being the 'currently valid' instruction + */ + if (init || system_state == SYSTEM_BOOTING) { + text_poke_early((void *)jump_entry_code(entry), opcode, + JUMP_LABEL_NOP_SIZE); + return; + } + + text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL); +} + +static void __ref jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) +{ + mutex_lock(&text_mutex); + __jump_label_transform(entry, type, init); + mutex_unlock(&text_mutex); +} + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + jump_label_transform(entry, type, 0); +} + +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) +{ + const void *opcode; + + if (system_state == SYSTEM_BOOTING) { + /* + * Fallback to the non-batching mode. + */ + arch_jump_label_transform(entry, type); + return true; + } + + mutex_lock(&text_mutex); + opcode = __jump_label_set_jump_code(entry, type, 0); + text_poke_queue((void *)jump_entry_code(entry), + opcode, JUMP_LABEL_NOP_SIZE, NULL); + mutex_unlock(&text_mutex); + return true; +} + +void arch_jump_label_transform_apply(void) +{ + mutex_lock(&text_mutex); + text_poke_finish(); + mutex_unlock(&text_mutex); +} + +static enum { + JL_STATE_START, + JL_STATE_NO_UPDATE, + JL_STATE_UPDATE, +} jlstate __initdata_or_module = JL_STATE_START; + +__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, + enum jump_label_type type) +{ + /* + * This function is called at boot up and when modules are + * first loaded. Check if the default nop, the one that is + * inserted at compile time, is the ideal nop. If it is, then + * we do not need to update the nop, and we can leave it as is. + * If it is not, then we need to update the nop to the ideal nop. + */ + if (jlstate == JL_STATE_START) { + const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; + const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + + if (memcmp(ideal_nop, default_nop, 5) != 0) + jlstate = JL_STATE_UPDATE; + else + jlstate = JL_STATE_NO_UPDATE; + } + if (jlstate == JL_STATE_UPDATE) + jump_label_transform(entry, type, 1); +} diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c new file mode 100644 index 000000000..e2e89bebc --- /dev/null +++ b/arch/x86/kernel/kdebugfs.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Architecture specific debugfs files + * + * Copyright (C) 2007, Intel Corp. + * Huang Ying + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + +#ifdef CONFIG_DEBUG_BOOT_PARAMS +struct setup_data_node { + u64 paddr; + u32 type; + u32 len; +}; + +static ssize_t setup_data_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct setup_data_node *node = file->private_data; + unsigned long remain; + loff_t pos = *ppos; + void *p; + u64 pa; + + if (pos < 0) + return -EINVAL; + + if (pos >= node->len) + return 0; + + if (count > node->len - pos) + count = node->len - pos; + + pa = node->paddr + pos; + + /* Is it direct data or invalid indirect one? */ + if (!(node->type & SETUP_INDIRECT) || node->type == SETUP_INDIRECT) + pa += sizeof(struct setup_data); + + p = memremap(pa, count, MEMREMAP_WB); + if (!p) + return -ENOMEM; + + remain = copy_to_user(user_buf, p, count); + + memunmap(p); + + if (remain) + return -EFAULT; + + *ppos = pos + count; + + return count; +} + +static const struct file_operations fops_setup_data = { + .read = setup_data_read, + .open = simple_open, + .llseek = default_llseek, +}; + +static void __init +create_setup_data_node(struct dentry *parent, int no, + struct setup_data_node *node) +{ + struct dentry *d; + char buf[16]; + + sprintf(buf, "%d", no); + d = debugfs_create_dir(buf, parent); + + debugfs_create_x32("type", S_IRUGO, d, &node->type); + debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data); +} + +static int __init create_setup_data_nodes(struct dentry *parent) +{ + struct setup_indirect *indirect; + struct setup_data_node *node; + struct setup_data *data; + u64 pa_data, pa_next; + struct dentry *d; + int error; + u32 len; + int no = 0; + + d = debugfs_create_dir("setup_data", parent); + + pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + node = kmalloc(sizeof(*node), GFP_KERNEL); + if (!node) { + error = -ENOMEM; + goto err_dir; + } + + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } + pa_next = data->next; + + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(pa_data, len, MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + node->paddr = indirect->addr; + node->type = indirect->type; + node->len = indirect->len; + } else { + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + } + } else { + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + } + + create_setup_data_node(d, no, node); + pa_data = pa_next; + + memunmap(data); + no++; + } + + return 0; + +err_dir: + debugfs_remove_recursive(d); + return error; +} + +static struct debugfs_blob_wrapper boot_params_blob = { + .data = &boot_params, + .size = sizeof(boot_params), +}; + +static int __init boot_params_kdebugfs_init(void) +{ + struct dentry *dbp; + int error; + + dbp = debugfs_create_dir("boot_params", arch_debugfs_dir); + + debugfs_create_x16("version", S_IRUGO, dbp, &boot_params.hdr.version); + debugfs_create_blob("data", S_IRUGO, dbp, &boot_params_blob); + + error = create_setup_data_nodes(dbp); + if (error) + debugfs_remove_recursive(dbp); + + return error; +} +#endif /* CONFIG_DEBUG_BOOT_PARAMS */ + +static int __init arch_kdebugfs_init(void) +{ + int error = 0; + + arch_debugfs_dir = debugfs_create_dir("x86", NULL); + +#ifdef CONFIG_DEBUG_BOOT_PARAMS + error = boot_params_kdebugfs_init(); +#endif + + return error; +} +arch_initcall(arch_kdebugfs_init); diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c new file mode 100644 index 000000000..ce831f944 --- /dev/null +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Kexec bzImage loader + * + * Copyright (C) 2014 Red Hat Inc. + * Authors: + * Vivek Goyal + */ + +#define pr_fmt(fmt) "kexec-bzImage64: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ + +/* + * Defines lowest physical address for various segments. Not sure where + * exactly these limits came from. Current bzimage64 loader in kexec-tools + * uses these so I am retaining it. It can be changed over time as we gain + * more insight. + */ +#define MIN_PURGATORY_ADDR 0x3000 +#define MIN_BOOTPARAM_ADDR 0x3000 +#define MIN_KERNEL_LOAD_ADDR 0x100000 +#define MIN_INITRD_LOAD_ADDR 0x1000000 + +/* + * This is a place holder for all boot loader specific data structure which + * gets allocated in one call but gets freed much later during cleanup + * time. Right now there is only one field but it can grow as need be. + */ +struct bzimage64_data { + /* + * Temporary buffer to hold bootparams buffer. This should be + * freed once the bootparam segment has been loaded. + */ + void *bootparams_buf; +}; + +static int setup_initrd(struct boot_params *params, + unsigned long initrd_load_addr, unsigned long initrd_len) +{ + params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL; + params->hdr.ramdisk_size = initrd_len & 0xffffffffUL; + + params->ext_ramdisk_image = initrd_load_addr >> 32; + params->ext_ramdisk_size = initrd_len >> 32; + + return 0; +} + +static int setup_cmdline(struct kimage *image, struct boot_params *params, + unsigned long bootparams_load_addr, + unsigned long cmdline_offset, char *cmdline, + unsigned long cmdline_len) +{ + char *cmdline_ptr = ((char *)params) + cmdline_offset; + unsigned long cmdline_ptr_phys, len = 0; + uint32_t cmdline_low_32, cmdline_ext_32; + + if (image->type == KEXEC_TYPE_CRASH) { + len = sprintf(cmdline_ptr, + "elfcorehdr=0x%lx ", image->arch.elf_load_addr); + } + memcpy(cmdline_ptr + len, cmdline, cmdline_len); + cmdline_len += len; + + cmdline_ptr[cmdline_len - 1] = '\0'; + + pr_debug("Final command line is: %s\n", cmdline_ptr); + cmdline_ptr_phys = bootparams_load_addr + cmdline_offset; + cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL; + cmdline_ext_32 = cmdline_ptr_phys >> 32; + + params->hdr.cmd_line_ptr = cmdline_low_32; + if (cmdline_ext_32) + params->ext_cmd_line_ptr = cmdline_ext_32; + + return 0; +} + +static int setup_e820_entries(struct boot_params *params) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = e820_table_kexec->nr_entries; + + /* TODO: Pass entries more than E820_MAX_ENTRIES_ZEROPAGE in bootparams setup data */ + if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE) + nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE; + + params->e820_entries = nr_e820_entries; + memcpy(¶ms->e820_table, &e820_table_kexec->entries, nr_e820_entries*sizeof(struct e820_entry)); + + return 0; +} + +#ifdef CONFIG_EFI +static int setup_efi_info_memmap(struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_map_offset, + unsigned int efi_map_sz) +{ + void *efi_map = (void *)params + efi_map_offset; + unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset; + struct efi_info *ei = ¶ms->efi_info; + + if (!efi_map_sz) + return 0; + + efi_runtime_map_copy(efi_map, efi_map_sz); + + ei->efi_memmap = efi_map_phys_addr & 0xffffffff; + ei->efi_memmap_hi = efi_map_phys_addr >> 32; + ei->efi_memmap_size = efi_map_sz; + + return 0; +} + +static int +prepare_add_efi_setup_data(struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_setup_data_offset) +{ + unsigned long setup_data_phys; + struct setup_data *sd = (void *)params + efi_setup_data_offset; + struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data); + + esd->fw_vendor = efi_fw_vendor; + esd->tables = efi_config_table; + esd->smbios = efi.smbios; + + sd->type = SETUP_EFI; + sd->len = sizeof(struct efi_setup_data); + + /* Add setup data */ + setup_data_phys = params_load_addr + efi_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; + + return 0; +} + +static int +setup_efi_state(struct boot_params *params, unsigned long params_load_addr, + unsigned int efi_map_offset, unsigned int efi_map_sz, + unsigned int efi_setup_data_offset) +{ + struct efi_info *current_ei = &boot_params.efi_info; + struct efi_info *ei = ¶ms->efi_info; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return 0; + + if (!current_ei->efi_memmap_size) + return 0; + + params->secure_boot = boot_params.secure_boot; + ei->efi_loader_signature = current_ei->efi_loader_signature; + ei->efi_systab = current_ei->efi_systab; + ei->efi_systab_hi = current_ei->efi_systab_hi; + + ei->efi_memdesc_version = current_ei->efi_memdesc_version; + ei->efi_memdesc_size = efi_get_runtime_map_desc_size(); + + setup_efi_info_memmap(params, params_load_addr, efi_map_offset, + efi_map_sz); + prepare_add_efi_setup_data(params, params_load_addr, + efi_setup_data_offset); + return 0; +} +#endif /* CONFIG_EFI */ + +static int +setup_boot_parameters(struct kimage *image, struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_map_offset, unsigned int efi_map_sz, + unsigned int efi_setup_data_offset) +{ + unsigned int nr_e820_entries; + unsigned long long mem_k, start, end; + int i, ret = 0; + + /* Get subarch from existing bootparams */ + params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; + + /* Copying screen_info will do? */ + memcpy(¶ms->screen_info, &screen_info, sizeof(struct screen_info)); + + /* Fill in memsize later */ + params->screen_info.ext_mem_k = 0; + params->alt_mem_k = 0; + + /* Always fill in RSDP: it is either 0 or a valid value */ + params->acpi_rsdp_addr = boot_params.acpi_rsdp_addr; + + /* Default APM info */ + memset(¶ms->apm_bios_info, 0, sizeof(params->apm_bios_info)); + + /* Default drive info */ + memset(¶ms->hd0_info, 0, sizeof(params->hd0_info)); + memset(¶ms->hd1_info, 0, sizeof(params->hd1_info)); + + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_setup_memmap_entries(image, params); + if (ret) + return ret; + } else + setup_e820_entries(params); + + nr_e820_entries = params->e820_entries; + + for (i = 0; i < nr_e820_entries; i++) { + if (params->e820_table[i].type != E820_TYPE_RAM) + continue; + start = params->e820_table[i].addr; + end = params->e820_table[i].addr + params->e820_table[i].size - 1; + + if ((start <= 0x100000) && end > 0x100000) { + mem_k = (end >> 10) - (0x100000 >> 10); + params->screen_info.ext_mem_k = mem_k; + params->alt_mem_k = mem_k; + if (mem_k > 0xfc00) + params->screen_info.ext_mem_k = 0xfc00; /* 64M*/ + if (mem_k > 0xffffffff) + params->alt_mem_k = 0xffffffff; + } + } + +#ifdef CONFIG_EFI + /* Setup EFI state */ + setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz, + efi_setup_data_offset); +#endif + /* Setup EDD info */ + memcpy(params->eddbuf, boot_params.eddbuf, + EDDMAXNR * sizeof(struct edd_info)); + params->eddbuf_entries = boot_params.eddbuf_entries; + + memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer, + EDD_MBR_SIG_MAX * sizeof(unsigned int)); + + return ret; +} + +static int bzImage64_probe(const char *buf, unsigned long len) +{ + int ret = -ENOEXEC; + struct setup_header *header; + + /* kernel should be at least two sectors long */ + if (len < 2 * 512) { + pr_err("File is too short to be a bzImage\n"); + return ret; + } + + header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr)); + if (memcmp((char *)&header->header, "HdrS", 4) != 0) { + pr_err("Not a bzImage\n"); + return ret; + } + + if (header->boot_flag != 0xAA55) { + pr_err("No x86 boot sector present\n"); + return ret; + } + + if (header->version < 0x020C) { + pr_err("Must be at least protocol version 2.12\n"); + return ret; + } + + if (!(header->loadflags & LOADED_HIGH)) { + pr_err("zImage not a bzImage\n"); + return ret; + } + + if (!(header->xloadflags & XLF_KERNEL_64)) { + pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n"); + return ret; + } + + if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) { + pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n"); + return ret; + } + + /* + * Can't handle 32bit EFI as it does not allow loading kernel + * above 4G. This should be handled by 32bit bzImage loader + */ + if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) { + pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n"); + return ret; + } + + if (!(header->xloadflags & XLF_5LEVEL) && pgtable_l5_enabled()) { + pr_err("bzImage cannot handle 5-level paging mode.\n"); + return ret; + } + + /* I've got a bzImage */ + pr_debug("It's a relocatable bzImage64\n"); + ret = 0; + + return ret; +} + +static void *bzImage64_load(struct kimage *image, char *kernel, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + + struct setup_header *header; + int setup_sects, kern16_size, ret = 0; + unsigned long setup_header_size, params_cmdline_sz; + struct boot_params *params; + unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; + struct bzimage64_data *ldata; + struct kexec_entry64_regs regs64; + void *stack; + unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); + unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; + struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX, + .top_down = true }; + struct kexec_buf pbuf = { .image = image, .buf_min = MIN_PURGATORY_ADDR, + .buf_max = ULONG_MAX, .top_down = true }; + + header = (struct setup_header *)(kernel + setup_hdr_offset); + setup_sects = header->setup_sects; + if (setup_sects == 0) + setup_sects = 4; + + kern16_size = (setup_sects + 1) * 512; + if (kernel_len < kern16_size) { + pr_err("bzImage truncated\n"); + return ERR_PTR(-ENOEXEC); + } + + if (cmdline_len > header->cmdline_size) { + pr_err("Kernel command line too long\n"); + return ERR_PTR(-EINVAL); + } + + /* + * In case of crash dump, we will append elfcorehdr= to + * command line. Make sure it does not overflow + */ + if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) { + pr_debug("Appending elfcorehdr= to command line exceeds maximum allowed length\n"); + return ERR_PTR(-EINVAL); + } + + /* Allocate and load backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_load_segments(image); + if (ret) + return ERR_PTR(ret); + } + + /* + * Load purgatory. For 64bit entry point, purgatory code can be + * anywhere. + */ + ret = kexec_load_purgatory(image, &pbuf); + if (ret) { + pr_err("Loading purgatory failed\n"); + return ERR_PTR(ret); + } + + pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem); + + + /* + * Load Bootparams and cmdline and space for efi stuff. + * + * Allocate memory together for multiple data structures so + * that they all can go in single area/segment and we don't + * have to create separate segment for each. Keeps things + * little bit simple + */ + efi_map_sz = efi_get_runtime_map_size(); + params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + + MAX_ELFCOREHDR_STR_LEN; + params_cmdline_sz = ALIGN(params_cmdline_sz, 16); + kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) + + sizeof(struct setup_data) + + sizeof(struct efi_setup_data); + + params = kzalloc(kbuf.bufsz, GFP_KERNEL); + if (!params) + return ERR_PTR(-ENOMEM); + efi_map_offset = params_cmdline_sz; + efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16); + + /* Copy setup header onto bootparams. Documentation/x86/boot.rst */ + setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset; + + /* Is there a limit on setup header size? */ + memcpy(¶ms->hdr, (kernel + setup_hdr_offset), setup_header_size); + + kbuf.buffer = params; + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = 16; + kbuf.buf_min = MIN_BOOTPARAM_ADDR; + ret = kexec_add_buffer(&kbuf); + if (ret) + goto out_free_params; + bootparam_load_addr = kbuf.mem; + pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + bootparam_load_addr, kbuf.bufsz, kbuf.bufsz); + + /* Load kernel */ + kbuf.buffer = kernel + kern16_size; + kbuf.bufsz = kernel_len - kern16_size; + kbuf.memsz = PAGE_ALIGN(header->init_size); + kbuf.buf_align = header->kernel_alignment; + kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_add_buffer(&kbuf); + if (ret) + goto out_free_params; + kernel_load_addr = kbuf.mem; + + pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kernel_load_addr, kbuf.bufsz, kbuf.memsz); + + /* Load initrd high */ + if (initrd) { + kbuf.buffer = initrd; + kbuf.bufsz = kbuf.memsz = initrd_len; + kbuf.buf_align = PAGE_SIZE; + kbuf.buf_min = MIN_INITRD_LOAD_ADDR; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_add_buffer(&kbuf); + if (ret) + goto out_free_params; + initrd_load_addr = kbuf.mem; + + pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + initrd_load_addr, initrd_len, initrd_len); + + setup_initrd(params, initrd_load_addr, initrd_len); + } + + setup_cmdline(image, params, bootparam_load_addr, + sizeof(struct boot_params), cmdline, cmdline_len); + + /* bootloader info. Do we need a separate ID for kexec kernel loader? */ + params->hdr.type_of_loader = 0x0D << 4; + params->hdr.loadflags = 0; + + /* Setup purgatory regs for entry */ + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 1); + if (ret) + goto out_free_params; + + regs64.rbx = 0; /* Bootstrap Processor */ + regs64.rsi = bootparam_load_addr; + regs64.rip = kernel_load_addr + 0x200; + stack = kexec_purgatory_get_symbol_addr(image, "stack_end"); + if (IS_ERR(stack)) { + pr_err("Could not find address of symbol stack_end\n"); + ret = -EINVAL; + goto out_free_params; + } + + regs64.rsp = (unsigned long)stack; + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 0); + if (ret) + goto out_free_params; + + ret = setup_boot_parameters(image, params, bootparam_load_addr, + efi_map_offset, efi_map_sz, + efi_setup_data_offset); + if (ret) + goto out_free_params; + + /* Allocate loader specific data */ + ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL); + if (!ldata) { + ret = -ENOMEM; + goto out_free_params; + } + + /* + * Store pointer to params so that it could be freed after loading + * params segment has been loaded and contents have been copied + * somewhere else. + */ + ldata->bootparams_buf = params; + return ldata; + +out_free_params: + kfree(params); + return ERR_PTR(ret); +} + +/* This cleanup function is called after various segments have been loaded */ +static int bzImage64_cleanup(void *loader_data) +{ + struct bzimage64_data *ldata = loader_data; + + if (!ldata) + return 0; + + kfree(ldata->bootparams_buf); + ldata->bootparams_buf = NULL; + + return 0; +} + +#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG +static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) +{ + int ret; + + ret = verify_pefile_signature(kernel, kernel_len, + VERIFY_USE_SECONDARY_KEYRING, + VERIFYING_KEXEC_PE_SIGNATURE); + if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) { + ret = verify_pefile_signature(kernel, kernel_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_KEXEC_PE_SIGNATURE); + } + return ret; +} +#endif + +const struct kexec_file_ops kexec_bzImage64_ops = { + .probe = bzImage64_probe, + .load = bzImage64_load, + .cleanup = bzImage64_cleanup, +#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG + .verify_sig = bzImage64_verify_sig, +#endif +}; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c new file mode 100644 index 000000000..ff7878df9 --- /dev/null +++ b/arch/x86/kernel/kgdb.c @@ -0,0 +1,785 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + */ + +/* + * Copyright (C) 2004 Amit S. Kale + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002 Andi Kleen, SuSE Labs + * Copyright (C) 2004 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2007 MontaVista Software, Inc. + * Copyright (C) 2007-2008 Jason Wessel, Wind River Systems, Inc. + */ +/**************************************************************************** + * Contributor: Lake Stevens Instrument Division$ + * Written by: Glenn Engel $ + * Updated by: Amit Kale + * Updated by: Tom Rini + * Updated by: Jason Wessel + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Origianl kgdb, compatibility with 2.1.xx kernel by + * David Grothe + * Integrated into 2.2.5 kernel by Tigran Aivazian + * X86_64 changes from Andi Kleen's patch merged by Jim Houston + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = +{ +#ifdef CONFIG_X86_32 + { "ax", 4, offsetof(struct pt_regs, ax) }, + { "cx", 4, offsetof(struct pt_regs, cx) }, + { "dx", 4, offsetof(struct pt_regs, dx) }, + { "bx", 4, offsetof(struct pt_regs, bx) }, + { "sp", 4, offsetof(struct pt_regs, sp) }, + { "bp", 4, offsetof(struct pt_regs, bp) }, + { "si", 4, offsetof(struct pt_regs, si) }, + { "di", 4, offsetof(struct pt_regs, di) }, + { "ip", 4, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, offsetof(struct pt_regs, ds) }, + { "es", 4, offsetof(struct pt_regs, es) }, +#else + { "ax", 8, offsetof(struct pt_regs, ax) }, + { "bx", 8, offsetof(struct pt_regs, bx) }, + { "cx", 8, offsetof(struct pt_regs, cx) }, + { "dx", 8, offsetof(struct pt_regs, dx) }, + { "si", 8, offsetof(struct pt_regs, si) }, + { "di", 8, offsetof(struct pt_regs, di) }, + { "bp", 8, offsetof(struct pt_regs, bp) }, + { "sp", 8, offsetof(struct pt_regs, sp) }, + { "r8", 8, offsetof(struct pt_regs, r8) }, + { "r9", 8, offsetof(struct pt_regs, r9) }, + { "r10", 8, offsetof(struct pt_regs, r10) }, + { "r11", 8, offsetof(struct pt_regs, r11) }, + { "r12", 8, offsetof(struct pt_regs, r12) }, + { "r13", 8, offsetof(struct pt_regs, r13) }, + { "r14", 8, offsetof(struct pt_regs, r14) }, + { "r15", 8, offsetof(struct pt_regs, r15) }, + { "ip", 8, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, -1 }, + { "es", 4, -1 }, +#endif + { "fs", 4, -1 }, + { "gs", 4, -1 }, +}; + +int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) +{ + if ( +#ifdef CONFIG_X86_32 + regno == GDB_SS || regno == GDB_FS || regno == GDB_GS || +#endif + regno == GDB_SP || regno == GDB_ORIG_AX) + return 0; + + if (dbg_reg_def[regno].offset != -1) + memcpy((void *)regs + dbg_reg_def[regno].offset, mem, + dbg_reg_def[regno].size); + return 0; +} + +char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) +{ + if (regno == GDB_ORIG_AX) { + memcpy(mem, ®s->orig_ax, sizeof(regs->orig_ax)); + return "orig_ax"; + } + if (regno >= DBG_MAX_REG_NUM || regno < 0) + return NULL; + + if (dbg_reg_def[regno].offset != -1) + memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, + dbg_reg_def[regno].size); + +#ifdef CONFIG_X86_32 + switch (regno) { + case GDB_GS: + case GDB_FS: + *(unsigned long *)mem = 0xFFFF; + break; + } +#endif + return dbg_reg_def[regno].name; +} + +/** + * sleeping_thread_to_gdb_regs - Convert ptrace regs to GDB regs + * @gdb_regs: A pointer to hold the registers in the order GDB wants. + * @p: The &struct task_struct of the desired process. + * + * Convert the register values of the sleeping process in @p to + * the format that GDB expects. + * This function is called when kgdb does not have access to the + * &struct pt_regs and therefore it should fill the gdb registers + * @gdb_regs with what has been saved in &struct thread_struct + * thread field during switch_to. + */ +void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) +{ +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif + gdb_regs[GDB_AX] = 0; + gdb_regs[GDB_BX] = 0; + gdb_regs[GDB_CX] = 0; + gdb_regs[GDB_DX] = 0; + gdb_regs[GDB_SI] = 0; + gdb_regs[GDB_DI] = 0; + gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp; +#ifdef CONFIG_X86_32 + gdb_regs[GDB_DS] = __KERNEL_DS; + gdb_regs[GDB_ES] = __KERNEL_DS; + gdb_regs[GDB_PS] = 0; + gdb_regs[GDB_CS] = __KERNEL_CS; + gdb_regs[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_FS] = 0xFFFF; + gdb_regs[GDB_GS] = 0xFFFF; +#else + gdb_regs32[GDB_PS] = 0; + gdb_regs32[GDB_CS] = __KERNEL_CS; + gdb_regs32[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_R8] = 0; + gdb_regs[GDB_R9] = 0; + gdb_regs[GDB_R10] = 0; + gdb_regs[GDB_R11] = 0; + gdb_regs[GDB_R12] = 0; + gdb_regs[GDB_R13] = 0; + gdb_regs[GDB_R14] = 0; + gdb_regs[GDB_R15] = 0; +#endif + gdb_regs[GDB_PC] = 0; + gdb_regs[GDB_SP] = p->thread.sp; +} + +static struct hw_breakpoint { + unsigned enabled; + unsigned long addr; + int len; + int type; + struct perf_event * __percpu *pev; +} breakinfo[HBP_NUM]; + +static unsigned long early_dr7; + +static void kgdb_correct_hw_break(void) +{ + int breakno; + + for (breakno = 0; breakno < HBP_NUM; breakno++) { + struct perf_event *bp; + struct arch_hw_breakpoint *info; + int val; + int cpu = raw_smp_processor_id(); + if (!breakinfo[breakno].enabled) + continue; + if (dbg_is_early) { + set_debugreg(breakinfo[breakno].addr, breakno); + early_dr7 |= encode_dr7(breakno, + breakinfo[breakno].len, + breakinfo[breakno].type); + set_debugreg(early_dr7, 7); + continue; + } + bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); + info = counter_arch_bp(bp); + if (bp->attr.disabled != 1) + continue; + bp->attr.bp_addr = breakinfo[breakno].addr; + bp->attr.bp_len = breakinfo[breakno].len; + bp->attr.bp_type = breakinfo[breakno].type; + info->address = breakinfo[breakno].addr; + info->len = breakinfo[breakno].len; + info->type = breakinfo[breakno].type; + val = arch_install_hw_breakpoint(bp); + if (!val) + bp->attr.disabled = 0; + } + if (!dbg_is_early) + hw_breakpoint_restore(); +} + +static int hw_break_reserve_slot(int breakno) +{ + int cpu; + int cnt = 0; + struct perf_event **pevent; + + if (dbg_is_early) + return 0; + + for_each_online_cpu(cpu) { + cnt++; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_reserve_bp_slot(*pevent)) + goto fail; + } + + return 0; + +fail: + for_each_online_cpu(cpu) { + cnt--; + if (!cnt) + break; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + dbg_release_bp_slot(*pevent); + } + return -1; +} + +static int hw_break_release_slot(int breakno) +{ + struct perf_event **pevent; + int cpu; + + if (dbg_is_early) + return 0; + + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_release_bp_slot(*pevent)) + /* + * The debugger is responsible for handing the retry on + * remove failure. + */ + return -1; + } + return 0; +} + +static int +kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) +{ + int i; + + for (i = 0; i < HBP_NUM; i++) + if (breakinfo[i].addr == addr && breakinfo[i].enabled) + break; + if (i == HBP_NUM) + return -1; + + if (hw_break_release_slot(i)) { + printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr); + return -1; + } + breakinfo[i].enabled = 0; + + return 0; +} + +static void kgdb_remove_all_hw_break(void) +{ + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + + for (i = 0; i < HBP_NUM; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (!bp->attr.disabled) { + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + continue; + } + if (dbg_is_early) + early_dr7 &= ~encode_dr7(i, breakinfo[i].len, + breakinfo[i].type); + else if (hw_break_release_slot(i)) + printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n", + breakinfo[i].addr); + breakinfo[i].enabled = 0; + } +} + +static int +kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) +{ + int i; + + for (i = 0; i < HBP_NUM; i++) + if (!breakinfo[i].enabled) + break; + if (i == HBP_NUM) + return -1; + + switch (bptype) { + case BP_HARDWARE_BREAKPOINT: + len = 1; + breakinfo[i].type = X86_BREAKPOINT_EXECUTE; + break; + case BP_WRITE_WATCHPOINT: + breakinfo[i].type = X86_BREAKPOINT_WRITE; + break; + case BP_ACCESS_WATCHPOINT: + breakinfo[i].type = X86_BREAKPOINT_RW; + break; + default: + return -1; + } + switch (len) { + case 1: + breakinfo[i].len = X86_BREAKPOINT_LEN_1; + break; + case 2: + breakinfo[i].len = X86_BREAKPOINT_LEN_2; + break; + case 4: + breakinfo[i].len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case 8: + breakinfo[i].len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: + return -1; + } + breakinfo[i].addr = addr; + if (hw_break_reserve_slot(i)) { + breakinfo[i].addr = 0; + return -1; + } + breakinfo[i].enabled = 1; + + return 0; +} + +/** + * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. + * @regs: Current &struct pt_regs. + * + * This function will be called if the particular architecture must + * disable hardware debugging while it is processing gdb packets or + * handling exception. + */ +static void kgdb_disable_hw_debug(struct pt_regs *regs) +{ + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + + /* Disable hardware debugging while we are in kgdb: */ + set_debugreg(0UL, 7); + for (i = 0; i < HBP_NUM; i++) { + if (!breakinfo[i].enabled) + continue; + if (dbg_is_early) { + early_dr7 &= ~encode_dr7(i, breakinfo[i].len, + breakinfo[i].type); + continue; + } + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } +} + +#ifdef CONFIG_SMP +/** + * kgdb_roundup_cpus - Get other CPUs into a holding pattern + * + * On SMP systems, we need to get the attention of the other CPUs + * and get them be in a known state. This should do what is needed + * to get the other CPUs to call kgdb_wait(). Note that on some arches, + * the NMI approach is not used for rounding up all the CPUs. For example, + * in case of MIPS, smp_call_function() is used to roundup CPUs. + * + * On non-SMP systems, this is not called. + */ +void kgdb_roundup_cpus(void) +{ + apic_send_IPI_allbutself(NMI_VECTOR); +} +#endif + +/** + * kgdb_arch_handle_exception - Handle architecture specific GDB packets. + * @e_vector: The error vector of the exception that happened. + * @signo: The signal number of the exception that happened. + * @err_code: The error code of the exception that happened. + * @remcomInBuffer: The buffer of the packet we have read. + * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into. + * @linux_regs: The &struct pt_regs of the current process. + * + * This function MUST handle the 'c' and 's' command packets, + * as well packets to set / remove a hardware breakpoint, if used. + * If there are additional packets which the hardware needs to handle, + * they are handled here. The code should return -1 if it wants to + * process more packets, and a %0 or %1 if it wants to exit from the + * kgdb callback. + */ +int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, + char *remcomInBuffer, char *remcomOutBuffer, + struct pt_regs *linux_regs) +{ + unsigned long addr; + char *ptr; + + switch (remcomInBuffer[0]) { + case 'c': + case 's': + /* try to read optional parameter, pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (kgdb_hex2long(&ptr, &addr)) + linux_regs->ip = addr; + fallthrough; + case 'D': + case 'k': + /* clear the trace bit */ + linux_regs->flags &= ~X86_EFLAGS_TF; + atomic_set(&kgdb_cpu_doing_single_step, -1); + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') { + linux_regs->flags |= X86_EFLAGS_TF; + atomic_set(&kgdb_cpu_doing_single_step, + raw_smp_processor_id()); + } + + return 0; + } + + /* this means that we do not want to exit from the handler: */ + return -1; +} + +static inline int +single_step_cont(struct pt_regs *regs, struct die_args *args) +{ + /* + * Single step exception from kernel space to user space so + * eat the exception and continue the process: + */ + printk(KERN_ERR "KGDB: trap/step from kernel to user space, " + "resuming...\n"); + kgdb_arch_handle_exception(args->trapnr, args->signr, + args->err, "c", "", regs); + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; + + return NOTIFY_STOP; +} + +static DECLARE_BITMAP(was_in_debug_nmi, NR_CPUS); + +static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + int cpu; + + switch (cmd) { + case NMI_LOCAL: + if (atomic_read(&kgdb_active) != -1) { + /* KGDB CPU roundup */ + cpu = raw_smp_processor_id(); + kgdb_nmicallback(cpu, regs); + set_bit(cpu, was_in_debug_nmi); + touch_nmi_watchdog(); + + return NMI_HANDLED; + } + break; + + case NMI_UNKNOWN: + cpu = raw_smp_processor_id(); + + if (__test_and_clear_bit(cpu, was_in_debug_nmi)) + return NMI_HANDLED; + + break; + default: + /* do nothing */ + break; + } + return NMI_DONE; +} + +static int __kgdb_notify(struct die_args *args, unsigned long cmd) +{ + struct pt_regs *regs = args->regs; + + switch (cmd) { + case DIE_DEBUG: + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { + if (user_mode(regs)) + return single_step_cont(regs, args); + break; + } else if (test_thread_flag(TIF_SINGLESTEP)) + /* This means a user thread is single stepping + * a system call which should be ignored + */ + return NOTIFY_DONE; + fallthrough; + default: + if (user_mode(regs)) + return NOTIFY_DONE; + } + + if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs)) + return NOTIFY_DONE; + + /* Must touch watchdog before return to normal operation */ + touch_nmi_watchdog(); + return NOTIFY_STOP; +} + +int kgdb_ll_trap(int cmd, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig, + + }; + + if (!kgdb_io_module_registered) + return NOTIFY_DONE; + + return __kgdb_notify(&args, cmd); +} + +static int +kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = __kgdb_notify(ptr, cmd); + local_irq_restore(flags); + + return ret; +} + +static struct notifier_block kgdb_notifier = { + .notifier_call = kgdb_notify, +}; + +/** + * kgdb_arch_init - Perform any architecture specific initialization. + * + * This function will handle the initialization of any architecture + * specific callbacks. + */ +int kgdb_arch_init(void) +{ + int retval; + + retval = register_die_notifier(&kgdb_notifier); + if (retval) + goto out; + + retval = register_nmi_handler(NMI_LOCAL, kgdb_nmi_handler, + 0, "kgdb"); + if (retval) + goto out1; + + retval = register_nmi_handler(NMI_UNKNOWN, kgdb_nmi_handler, + 0, "kgdb"); + + if (retval) + goto out2; + + return retval; + +out2: + unregister_nmi_handler(NMI_LOCAL, "kgdb"); +out1: + unregister_die_notifier(&kgdb_notifier); +out: + return retval; +} + +static void kgdb_hw_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, struct pt_regs *regs) +{ + struct task_struct *tsk = current; + int i; + + for (i = 0; i < 4; i++) { + if (breakinfo[i].enabled) + tsk->thread.virtual_dr6 |= (DR_TRAP0 << i); + } +} + +void kgdb_arch_late(void) +{ + int i, cpu; + struct perf_event_attr attr; + struct perf_event **pevent; + + /* + * Pre-allocate the hw breakpoint structions in the non-atomic + * portion of kgdb because this operation requires mutexs to + * complete. + */ + hw_breakpoint_init(&attr); + attr.bp_addr = (unsigned long)kgdb_arch_init; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + for (i = 0; i < HBP_NUM; i++) { + if (breakinfo[i].pev) + continue; + breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL); + if (IS_ERR((void * __force)breakinfo[i].pev)) { + printk(KERN_ERR "kgdb: Could not allocate hw" + "breakpoints\nDisabling the kernel debugger\n"); + breakinfo[i].pev = NULL; + kgdb_arch_exit(); + return; + } + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[i].pev, cpu); + pevent[0]->hw.sample_period = 1; + pevent[0]->overflow_handler = kgdb_hw_overflow_handler; + if (pevent[0]->destroy != NULL) { + pevent[0]->destroy = NULL; + release_bp_slot(*pevent); + } + } + } +} + +/** + * kgdb_arch_exit - Perform any architecture specific uninitalization. + * + * This function will handle the uninitalization of any architecture + * specific callbacks, for dynamic registration and unregistration. + */ +void kgdb_arch_exit(void) +{ + int i; + for (i = 0; i < 4; i++) { + if (breakinfo[i].pev) { + unregister_wide_hw_breakpoint(breakinfo[i].pev); + breakinfo[i].pev = NULL; + } + } + unregister_nmi_handler(NMI_UNKNOWN, "kgdb"); + unregister_nmi_handler(NMI_LOCAL, "kgdb"); + unregister_die_notifier(&kgdb_notifier); +} + +/** + * + * kgdb_skipexception - Bail out of KGDB when we've been triggered. + * @exception: Exception vector number + * @regs: Current &struct pt_regs. + * + * On some architectures we need to skip a breakpoint exception when + * it occurs after a breakpoint has been removed. + * + * Skip an int3 exception when it occurs after a breakpoint has been + * removed. Backtrack eip by 1 since the int3 would have caused it to + * increment by 1. + */ +int kgdb_skipexception(int exception, struct pt_regs *regs) +{ + if (exception == 3 && kgdb_isremovedbreak(regs->ip - 1)) { + regs->ip -= 1; + return 1; + } + return 0; +} + +unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs) +{ + if (exception == 3) + return instruction_pointer(regs) - 1; + return instruction_pointer(regs); +} + +void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} + +int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + + bpt->type = BP_BREAKPOINT; + err = copy_from_kernel_nofault(bpt->saved_instr, (char *)bpt->bpt_addr, + BREAK_INSTR_SIZE); + if (err) + return err; + err = copy_to_kernel_nofault((char *)bpt->bpt_addr, + arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); + if (!err) + return err; + /* + * It is safe to call text_poke_kgdb() because normal kernel execution + * is stopped on all cores, so long as the text_mutex is not locked. + */ + if (mutex_is_locked(&text_mutex)) + return -EBUSY; + text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); + bpt->type = BP_POKE_BREAKPOINT; + + return 0; +} + +int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) +{ + if (bpt->type != BP_POKE_BREAKPOINT) + goto knl_write; + /* + * It is safe to call text_poke_kgdb() because normal kernel execution + * is stopped on all cores, so long as the text_mutex is not locked. + */ + if (mutex_is_locked(&text_mutex)) + goto knl_write; + text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr, + BREAK_INSTR_SIZE); + return 0; + +knl_write: + return copy_to_kernel_nofault((char *)bpt->bpt_addr, + (char *)bpt->saved_instr, BREAK_INSTR_SIZE); +} + +const struct kgdb_arch arch_kgdb_ops = { + /* Breakpoint instruction: */ + .gdb_bpt_instr = { 0xcc }, + .flags = KGDB_HW_BREAKPOINT, + .set_hw_breakpoint = kgdb_set_hw_break, + .remove_hw_breakpoint = kgdb_remove_hw_break, + .disable_hw_break = kgdb_disable_hw_debug, + .remove_all_hw_break = kgdb_remove_all_hw_break, + .correct_hw_break = kgdb_correct_hw_break, +}; diff --git a/arch/x86/kernel/kprobes/Makefile b/arch/x86/kernel/kprobes/Makefile new file mode 100644 index 000000000..8a753432b --- /dev/null +++ b/arch/x86/kernel/kprobes/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for kernel probes +# + +obj-$(CONFIG_KPROBES) += core.o +obj-$(CONFIG_OPTPROBES) += opt.o +obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h new file mode 100644 index 000000000..7d3a2e2da --- /dev/null +++ b/arch/x86/kernel/kprobes/common.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_KPROBES_COMMON_H +#define __X86_KERNEL_KPROBES_COMMON_H + +/* Kprobes and Optprobes common header */ + +#include +#include + +#ifdef CONFIG_X86_64 + +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. */ \ + " subq $24, %rsp\n" \ + " pushq %rdi\n" \ + " pushq %rsi\n" \ + " pushq %rdx\n" \ + " pushq %rcx\n" \ + " pushq %rax\n" \ + " pushq %r8\n" \ + " pushq %r9\n" \ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ + " pushq %rbp\n" \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ + " pushq %r15\n" \ + ENCODE_FRAME_POINTER + +#define RESTORE_REGS_STRING \ + " popq %r15\n" \ + " popq %r14\n" \ + " popq %r13\n" \ + " popq %r12\n" \ + " popq %rbp\n" \ + " popq %rbx\n" \ + " popq %r11\n" \ + " popq %r10\n" \ + " popq %r9\n" \ + " popq %r8\n" \ + " popq %rax\n" \ + " popq %rcx\n" \ + " popq %rdx\n" \ + " popq %rsi\n" \ + " popq %rdi\n" \ + /* Skip orig_ax, ip, cs */ \ + " addq $24, %rsp\n" +#else + +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax and gs. */ \ + " subl $4*4, %esp\n" \ + " pushl %fs\n" \ + " pushl %es\n" \ + " pushl %ds\n" \ + " pushl %eax\n" \ + " pushl %ebp\n" \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ + " pushl %ecx\n" \ + " pushl %ebx\n" \ + ENCODE_FRAME_POINTER + +#define RESTORE_REGS_STRING \ + " popl %ebx\n" \ + " popl %ecx\n" \ + " popl %edx\n" \ + " popl %esi\n" \ + " popl %edi\n" \ + " popl %ebp\n" \ + " popl %eax\n" \ + /* Skip ds, es, fs, gs, orig_ax, ip, and cs. */\ + " addl $7*4, %esp\n" +#endif + +/* Ensure if the instruction can be boostable */ +extern int can_boost(struct insn *insn, void *orig_addr); +/* Recover instruction if given address is probed */ +extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, + unsigned long addr); +/* + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. + */ +extern int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn); + +/* Generate a relative-jump/call instruction */ +extern void synthesize_reljump(void *dest, void *from, void *to); +extern void synthesize_relcall(void *dest, void *from, void *to); + +#ifdef CONFIG_OPTPROBES +extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); +extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr); +#else /* !CONFIG_OPTPROBES */ +static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +{ + return 0; +} +static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ + return addr; +} +#endif + +#endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c new file mode 100644 index 000000000..e7edc9e4c --- /dev/null +++ b/arch/x86/kernel/kprobes/core.c @@ -0,0 +1,1152 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Kernel Probes (KProbes) + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya added jumper probes + * interface to access function arguments. + * 2004-Oct Jim Keniston and Prasanna S Panchamukhi + * adapted for x86_64 from i386. + * 2005-Mar Roland McGrath + * Fixed to handle %rip-relative addressing mode correctly. + * 2005-May Hien Nguyen , Jim Keniston + * and Prasanna S Panchamukhi + * added function-return probes. + * 2005-May Rusty Lynch + * Added function return probes functionality + * 2006-Feb Masami Hiramatsu added + * kprobe-booster and kretprobe-booster for i386. + * 2007-Dec Masami Hiramatsu added kprobe-booster + * and kretprobe-booster for x86-64 + * 2007-Dec Masami Hiramatsu , Arjan van de Ven + * and Jim Keniston + * unified x86 kprobes code. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +#define stack_addr(regs) ((unsigned long *)regs->sp) + +#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + /* + * Undefined/reserved opcodes, conditional jump, Opcode Extension + * Groups, and some special opcodes can not boost. + * This is non-const and volatile to keep gcc from statically + * optimizing it out, as variable_test_bit makes gcc think only + * *(unsigned long*) is used. + */ +static volatile u32 twobyte_is_boostable[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ + W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */ + W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */ + W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ + W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */ + W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */ + W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */ + /* ----------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#undef W + +struct kretprobe_blackpoint kretprobe_blacklist[] = { + {"__switch_to", }, /* This function switches only current task, but + doesn't switch kernel stack.*/ + {NULL, NULL} /* Terminator */ +}; + +const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); + +static nokprobe_inline void +__synthesize_relative_insn(void *dest, void *from, void *to, u8 op) +{ + struct __arch_relative_insn { + u8 op; + s32 raddr; + } __packed *insn; + + insn = (struct __arch_relative_insn *)dest; + insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); + insn->op = op; +} + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +void synthesize_reljump(void *dest, void *from, void *to) +{ + __synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE); +} +NOKPROBE_SYMBOL(synthesize_reljump); + +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +void synthesize_relcall(void *dest, void *from, void *to) +{ + __synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE); +} +NOKPROBE_SYMBOL(synthesize_relcall); + +/* + * Returns non-zero if INSN is boostable. + * RIP relative instructions are adjusted at copying time in 64 bits mode + */ +int can_boost(struct insn *insn, void *addr) +{ + kprobe_opcode_t opcode; + insn_byte_t prefix; + int i; + + if (search_exception_tables((unsigned long)addr)) + return 0; /* Page fault may occur on this address. */ + + /* 2nd-byte opcode */ + if (insn->opcode.nbytes == 2) + return test_bit(insn->opcode.bytes[1], + (unsigned long *)twobyte_is_boostable); + + if (insn->opcode.nbytes != 1) + return 0; + + for_each_insn_prefix(insn, i, prefix) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute(prefix); + /* Can't boost Address-size override prefix and CS override prefix */ + if (prefix == 0x2e || inat_is_address_size_prefix(attr)) + return 0; + } + + opcode = insn->opcode.bytes[0]; + + switch (opcode) { + case 0x62: /* bound */ + case 0x70 ... 0x7f: /* Conditional jumps */ + case 0x9a: /* Call far */ + case 0xc0 ... 0xc1: /* Grp2 */ + case 0xcc ... 0xce: /* software exceptions */ + case 0xd0 ... 0xd3: /* Grp2 */ + case 0xd6: /* (UD) */ + case 0xd8 ... 0xdf: /* ESC */ + case 0xe0 ... 0xe3: /* LOOP*, JCXZ */ + case 0xe8 ... 0xe9: /* near Call, JMP */ + case 0xeb: /* Short JMP */ + case 0xf0 ... 0xf4: /* LOCK/REP, HLT */ + case 0xf6 ... 0xf7: /* Grp3 */ + case 0xfe: /* Grp4 */ + /* ... are not boostable */ + return 0; + case 0xff: /* Grp5 */ + /* Only indirect jmp is boostable */ + return X86_MODRM_REG(insn->modrm.bytes[0]) == 4; + default: + return 1; + } +} + +static unsigned long +__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ + struct kprobe *kp; + unsigned long faddr; + + kp = get_kprobe((void *)addr); + faddr = ftrace_location(addr); + /* + * Addresses inside the ftrace location are refused by + * arch_check_ftrace_location(). Something went terribly wrong + * if such an address is checked here. + */ + if (WARN_ON(faddr && faddr != addr)) + return 0UL; + /* + * Use the current code if it is not modified by Kprobe + * and it cannot be modified by ftrace. + */ + if (!kp && !faddr) + return addr; + + /* + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, __copy_instruction() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. + * + * On the other hand, in case on normal Kprobe, kp->opcode has a copy + * of the first byte of the probed instruction, which is overwritten + * by int3. And the instruction at kp->addr is not modified by kprobes + * except for the first byte, we can recover the original instruction + * from it and kp->opcode. + * + * In case of Kprobes using ftrace, we do not have a copy of + * the original instruction. In fact, the ftrace location might + * be modified at anytime and even could be in an inconsistent state. + * Fortunately, we know that the original code is the ideal 5-byte + * long NOP. + */ + if (copy_from_kernel_nofault(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + + if (faddr) + memcpy(buf, ideal_nops[NOP_ATOMIC5], 5); + else + buf[0] = kp->opcode; + return (unsigned long)buf; +} + +/* + * Recover the probed instruction at addr for further analysis. + * Caller must lock kprobes by kprobe_mutex, or disable preemption + * for preventing to release referencing kprobes. + * Returns zero if the instruction can not get recovered (or access failed). + */ +unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +{ + unsigned long __addr; + + __addr = __recover_optprobed_insn(buf, addr); + if (__addr != addr) + return __addr; + + return __recover_probed_insn(buf, addr); +} + +/* Check if paddr is at an instruction boundary */ +static int can_probe(unsigned long paddr) +{ + unsigned long addr, __addr, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + int ret; + + /* + * Check if the instruction has been modified by another + * kprobe, in which case we replace the breakpoint by the + * original instruction in our buffer. + * Also, jump optimization will change the breakpoint to + * relative-jump. Since the relative-jump itself is + * normally used, we just go through if there is no kprobe. + */ + __addr = recover_probed_instruction(buf, addr); + if (!__addr) + return 0; + + ret = insn_decode(&insn, (void *)__addr, MAX_INSN_SIZE, INSN_MODE_KERN); + if (ret < 0) + return 0; + +#ifdef CONFIG_KGDB + /* + * If there is a dynamically installed kgdb sw breakpoint, + * this function should not be probed. + */ + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && + kgdb_has_hit_break(addr)) + return 0; +#endif + addr += insn.length; + } + + return (addr == paddr); +} + +/* + * Copy an instruction with recovering modified instruction by kprobes + * and adjust the displacement if the instruction uses the %rip-relative + * addressing mode. Note that since @real will be the final place of copied + * instruction, displacement must be adjust by @real, not @dest. + * This returns the length of copied instruction, or 0 if it has an error. + */ +int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) +{ + kprobe_opcode_t buf[MAX_INSN_SIZE]; + unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); + int ret; + + if (!recovered_insn || !insn) + return 0; + + /* This can access kernel text if given address is not recovered */ + if (copy_from_kernel_nofault(dest, (void *)recovered_insn, + MAX_INSN_SIZE)) + return 0; + + ret = insn_decode(insn, dest, MAX_INSN_SIZE, INSN_MODE_KERN); + if (ret < 0) + return 0; + + /* We can not probe force emulate prefixed instruction */ + if (insn_has_emulate_prefix(insn)) + return 0; + + /* Another subsystem puts a breakpoint, failed to recover */ + if (insn->opcode.bytes[0] == INT3_INSN_OPCODE) + return 0; + + /* We should not singlestep on the exception masking instructions */ + if (insn_masking_exception(insn)) + return 0; + +#ifdef CONFIG_X86_64 + /* Only x86_64 has RIP relative instructions */ + if (insn_rip_relative(insn)) { + s64 newdisp; + u8 *disp; + /* + * The copied instruction uses the %rip-relative addressing + * mode. Adjust the displacement for the difference between + * the original location of this instruction and the location + * of the copy that will actually be run. The tricky bit here + * is making sure that the sign extension happens correctly in + * this calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the %rip + * value and yield the same 64-bit result that the sign- + * extension of the original signed 32-bit displacement would + * have given. + */ + newdisp = (u8 *) src + (s64) insn->displacement.value + - (u8 *) real; + if ((s64) (s32) newdisp != newdisp) { + pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); + return 0; + } + disp = (u8 *) dest + insn_offset_displacement(insn); + *(s32 *) disp = (s32) newdisp; + } +#endif + return insn->length; +} + +/* Prepare reljump or int3 right after instruction */ +static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p, + struct insn *insn) +{ + int len = insn->length; + + if (!IS_ENABLED(CONFIG_PREEMPTION) && + !p->post_handler && can_boost(insn, p->addr) && + MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) { + /* + * These instructions can be executed directly if it + * jumps back to correct address. + */ + synthesize_reljump(buf + len, p->ainsn.insn + len, + p->addr + insn->length); + len += JMP32_INSN_SIZE; + p->ainsn.boostable = 1; + } else { + /* Otherwise, put an int3 for trapping singlestep */ + if (MAX_INSN_SIZE - len < INT3_INSN_SIZE) + return -ENOSPC; + + buf[len] = INT3_INSN_OPCODE; + len += INT3_INSN_SIZE; + } + + return len; +} + +/* Make page to RO mode when allocate it */ +void *alloc_insn_page(void) +{ + void *page; + + page = module_alloc(PAGE_SIZE); + if (!page) + return NULL; + + set_vm_flush_reset_perms(page); + /* + * First make the page read-only, and only then make it executable to + * prevent it from being W+X in between. + */ + set_memory_ro((unsigned long)page, 1); + + /* + * TODO: Once additional kernel code protection mechanisms are set, ensure + * that the page was not maliciously altered and it is still zeroed. + */ + set_memory_x((unsigned long)page, 1); + + return page; +} + +/* Recover page to RW mode before releasing it */ +void free_insn_page(void *page) +{ + module_memfree(page); +} + +/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */ + +static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs) +{ + switch (p->ainsn.opcode) { + case 0xfa: /* cli */ + regs->flags &= ~(X86_EFLAGS_IF); + break; + case 0xfb: /* sti */ + regs->flags |= X86_EFLAGS_IF; + break; + case 0x9c: /* pushf */ + int3_emulate_push(regs, regs->flags); + break; + case 0x9d: /* popf */ + regs->flags = int3_emulate_pop(regs); + break; + } + regs->ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; +} +NOKPROBE_SYMBOL(kprobe_emulate_ifmodifiers); + +static void kprobe_emulate_ret(struct kprobe *p, struct pt_regs *regs) +{ + int3_emulate_ret(regs); +} +NOKPROBE_SYMBOL(kprobe_emulate_ret); + +static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long func = regs->ip - INT3_INSN_SIZE + p->ainsn.size; + + func += p->ainsn.rel32; + int3_emulate_call(regs, func); +} +NOKPROBE_SYMBOL(kprobe_emulate_call); + +static nokprobe_inline +void __kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs, bool cond) +{ + unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; + + if (cond) + ip += p->ainsn.rel32; + int3_emulate_jmp(regs, ip); +} + +static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs) +{ + __kprobe_emulate_jmp(p, regs, true); +} +NOKPROBE_SYMBOL(kprobe_emulate_jmp); + +static const unsigned long jcc_mask[6] = { + [0] = X86_EFLAGS_OF, + [1] = X86_EFLAGS_CF, + [2] = X86_EFLAGS_ZF, + [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF, + [4] = X86_EFLAGS_SF, + [5] = X86_EFLAGS_PF, +}; + +static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs) +{ + bool invert = p->ainsn.jcc.type & 1; + bool match; + + if (p->ainsn.jcc.type < 0xc) { + match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1]; + } else { + match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^ + ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT); + if (p->ainsn.jcc.type >= 0xe) + match = match || (regs->flags & X86_EFLAGS_ZF); + } + __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert)); +} +NOKPROBE_SYMBOL(kprobe_emulate_jcc); + +static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs) +{ + bool match; + + if (p->ainsn.loop.type != 3) { /* LOOP* */ + if (p->ainsn.loop.asize == 32) + match = ((*(u32 *)®s->cx)--) != 0; +#ifdef CONFIG_X86_64 + else if (p->ainsn.loop.asize == 64) + match = ((*(u64 *)®s->cx)--) != 0; +#endif + else + match = ((*(u16 *)®s->cx)--) != 0; + } else { /* JCXZ */ + if (p->ainsn.loop.asize == 32) + match = *(u32 *)(®s->cx) == 0; +#ifdef CONFIG_X86_64 + else if (p->ainsn.loop.asize == 64) + match = *(u64 *)(®s->cx) == 0; +#endif + else + match = *(u16 *)(®s->cx) == 0; + } + + if (p->ainsn.loop.type == 0) /* LOOPNE */ + match = match && !(regs->flags & X86_EFLAGS_ZF); + else if (p->ainsn.loop.type == 1) /* LOOPE */ + match = match && (regs->flags & X86_EFLAGS_ZF); + + __kprobe_emulate_jmp(p, regs, match); +} +NOKPROBE_SYMBOL(kprobe_emulate_loop); + +static const int addrmode_regoffs[] = { + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), +#ifdef CONFIG_X86_64 + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), +#endif +}; + +static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg]; + + int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + p->ainsn.size); + int3_emulate_jmp(regs, regs_get_register(regs, offs)); +} +NOKPROBE_SYMBOL(kprobe_emulate_call_indirect); + +static void kprobe_emulate_jmp_indirect(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg]; + + int3_emulate_jmp(regs, regs_get_register(regs, offs)); +} +NOKPROBE_SYMBOL(kprobe_emulate_jmp_indirect); + +static int prepare_emulation(struct kprobe *p, struct insn *insn) +{ + insn_byte_t opcode = insn->opcode.bytes[0]; + + switch (opcode) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0x9c: /* pushfl */ + case 0x9d: /* popf/popfd */ + /* + * IF modifiers must be emulated since it will enable interrupt while + * int3 single stepping. + */ + p->ainsn.emulate_op = kprobe_emulate_ifmodifiers; + p->ainsn.opcode = opcode; + break; + case 0xc2: /* ret/lret */ + case 0xc3: + case 0xca: + case 0xcb: + p->ainsn.emulate_op = kprobe_emulate_ret; + break; + case 0x9a: /* far call absolute -- segment is not supported */ + case 0xea: /* far jmp absolute -- segment is not supported */ + case 0xcc: /* int3 */ + case 0xcf: /* iret -- in-kernel IRET is not supported */ + return -EOPNOTSUPP; + break; + case 0xe8: /* near call relative */ + p->ainsn.emulate_op = kprobe_emulate_call; + if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; + break; + case 0xeb: /* short jump relative */ + case 0xe9: /* near jump relative */ + p->ainsn.emulate_op = kprobe_emulate_jmp; + if (insn->immediate.nbytes == 1) + p->ainsn.rel32 = *(s8 *)&insn->immediate.value; + else if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; + break; + case 0x70 ... 0x7f: + /* 1 byte conditional jump */ + p->ainsn.emulate_op = kprobe_emulate_jcc; + p->ainsn.jcc.type = opcode & 0xf; + p->ainsn.rel32 = *(char *)insn->immediate.bytes; + break; + case 0x0f: + opcode = insn->opcode.bytes[1]; + if ((opcode & 0xf0) == 0x80) { + /* 2 bytes Conditional Jump */ + p->ainsn.emulate_op = kprobe_emulate_jcc; + p->ainsn.jcc.type = opcode & 0xf; + if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; + } else if (opcode == 0x01 && + X86_MODRM_REG(insn->modrm.bytes[0]) == 0 && + X86_MODRM_MOD(insn->modrm.bytes[0]) == 3) { + /* VM extensions - not supported */ + return -EOPNOTSUPP; + } + break; + case 0xe0: /* Loop NZ */ + case 0xe1: /* Loop */ + case 0xe2: /* Loop */ + case 0xe3: /* J*CXZ */ + p->ainsn.emulate_op = kprobe_emulate_loop; + p->ainsn.loop.type = opcode & 0x3; + p->ainsn.loop.asize = insn->addr_bytes * 8; + p->ainsn.rel32 = *(s8 *)&insn->immediate.value; + break; + case 0xff: + /* + * Since the 0xff is an extended group opcode, the instruction + * is determined by the MOD/RM byte. + */ + opcode = insn->modrm.bytes[0]; + if ((opcode & 0x30) == 0x10) { + if ((opcode & 0x8) == 0x8) + return -EOPNOTSUPP; /* far call */ + /* call absolute, indirect */ + p->ainsn.emulate_op = kprobe_emulate_call_indirect; + } else if ((opcode & 0x30) == 0x20) { + if ((opcode & 0x8) == 0x8) + return -EOPNOTSUPP; /* far jmp */ + /* jmp near absolute indirect */ + p->ainsn.emulate_op = kprobe_emulate_jmp_indirect; + } else + break; + + if (insn->addr_bytes != sizeof(unsigned long)) + return -EOPNOTSUPP; /* Don't support differnt size */ + if (X86_MODRM_MOD(opcode) != 3) + return -EOPNOTSUPP; /* TODO: support memory addressing */ + + p->ainsn.indirect.reg = X86_MODRM_RM(opcode); +#ifdef CONFIG_X86_64 + if (X86_REX_B(insn->rex_prefix.value)) + p->ainsn.indirect.reg += 8; +#endif + break; + default: + break; + } + p->ainsn.size = insn->length; + + return 0; +} + +static int arch_copy_kprobe(struct kprobe *p) +{ + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + int ret, len; + + /* Copy an instruction with recovering if other optprobe modifies it.*/ + len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn); + if (!len) + return -EINVAL; + + /* Analyze the opcode and setup emulate functions */ + ret = prepare_emulation(p, &insn); + if (ret < 0) + return ret; + + /* Add int3 for single-step or booster jmp */ + len = prepare_singlestep(buf, p, &insn); + if (len < 0) + return len; + + /* Also, displacement change doesn't affect the first byte */ + p->opcode = buf[0]; + + p->ainsn.tp_len = len; + perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len); + + /* OK, write back the instruction(s) into ROX insn buffer */ + text_poke(p->ainsn.insn, buf, len); + + return 0; +} + +int arch_prepare_kprobe(struct kprobe *p) +{ + int ret; + + if (alternatives_text_reserved(p->addr, p->addr)) + return -EINVAL; + + if (!can_probe((unsigned long)p->addr)) + return -EILSEQ; + + memset(&p->ainsn, 0, sizeof(p->ainsn)); + + /* insn: must be on special executable page on x86. */ + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + return -ENOMEM; + + ret = arch_copy_kprobe(p); + if (ret) { + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; + } + + return ret; +} + +void arch_arm_kprobe(struct kprobe *p) +{ + u8 int3 = INT3_INSN_OPCODE; + + text_poke(p->addr, &int3, 1); + text_poke_sync(); + perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); +} + +void arch_disarm_kprobe(struct kprobe *p) +{ + u8 int3 = INT3_INSN_OPCODE; + + perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); + text_poke(p->addr, &p->opcode, 1); + text_poke_sync(); +} + +void arch_remove_kprobe(struct kprobe *p) +{ + if (p->ainsn.insn) { + /* Record the perf event before freeing the slot */ + perf_event_text_poke(p->ainsn.insn, p->ainsn.insn, + p->ainsn.tp_len, NULL, 0); + free_insn_slot(p->ainsn.insn, p->ainsn.boostable); + p->ainsn.insn = NULL; + } +} + +static nokprobe_inline void +save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags; + kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags; +} + +static nokprobe_inline void +restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; + kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; +} + +static nokprobe_inline void +set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, p); + kcb->kprobe_saved_flags = kcb->kprobe_old_flags + = (regs->flags & X86_EFLAGS_IF); +} + +void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long *sara = stack_addr(regs); + + ri->ret_addr = (kprobe_opcode_t *) *sara; + ri->fp = sara; + + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; +} +NOKPROBE_SYMBOL(arch_prepare_kretprobe); + +static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + /* Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + /* This will restore both kcb and current_kprobe */ + restore_previous_kprobe(kcb); + } else { + /* + * Always update the kcb status because + * reset_curent_kprobe() doesn't update kcb. + */ + kcb->kprobe_status = KPROBE_HIT_SSDONE; + if (cur->post_handler) + cur->post_handler(cur, regs, 0); + reset_current_kprobe(); + } +} +NOKPROBE_SYMBOL(kprobe_post_process); + +static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb, int reenter) +{ + if (setup_detour_execution(p, regs, reenter)) + return; + +#if !defined(CONFIG_PREEMPTION) + if (p->ainsn.boostable) { + /* Boost up -- we can execute copied instructions directly */ + if (!reenter) + reset_current_kprobe(); + /* + * Reentering boosted probe doesn't reset current_kprobe, + * nor set current_kprobe, because it doesn't use single + * stepping. + */ + regs->ip = (unsigned long)p->ainsn.insn; + return; + } +#endif + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_REENTER; + } else + kcb->kprobe_status = KPROBE_HIT_SS; + + if (p->ainsn.emulate_op) { + p->ainsn.emulate_op(p, regs); + kprobe_post_process(p, regs, kcb); + return; + } + + /* Disable interrupt, and set ip register on trampoline */ + regs->flags &= ~X86_EFLAGS_IF; + regs->ip = (unsigned long)p->ainsn.insn; +} +NOKPROBE_SYMBOL(setup_singlestep); + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "int3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. We also doesn't use trap, but "int3" again + * right after the copied instruction. + * Different from the trap single-step, "int3" single-step can not + * handle the instruction which changes the ip register, e.g. jmp, + * call, conditional jmp, and the instructions which changes the IF + * flags because interrupt must be disabled around the single-stepping. + * Such instructions are software emulated, but others are single-stepped + * using "int3". + * + * When the 2nd "int3" handled, the regs->ip and regs->flags needs to + * be adjusted, so that we can resume execution on correct code. + */ +static void resume_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + unsigned long copy_ip = (unsigned long)p->ainsn.insn; + unsigned long orig_ip = (unsigned long)p->addr; + + /* Restore saved interrupt flag and ip register */ + regs->flags |= kcb->kprobe_saved_flags; + /* Note that regs->ip is executed int3 so must be a step back */ + regs->ip += (orig_ip - copy_ip) - INT3_INSN_SIZE; +} +NOKPROBE_SYMBOL(resume_singlestep); + +/* + * We have reentered the kprobe_handler(), since another probe was hit while + * within the handler. We save the original kprobes variables and just single + * step on the instruction of the new probe without calling any user handlers. + */ +static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + switch (kcb->kprobe_status) { + case KPROBE_HIT_SSDONE: + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SS: + kprobes_inc_nmissed_count(p); + setup_singlestep(p, regs, kcb, 1); + break; + case KPROBE_REENTER: + /* A probe has been hit in the codepath leading up to, or just + * after, single-stepping of a probed instruction. This entire + * codepath should strictly reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless reentering loop + * and eventually a stack overflow. + */ + pr_err("Unrecoverable kprobe detected.\n"); + dump_kprobe(p); + BUG(); + default: + /* impossible cases */ + WARN_ON(1); + return 0; + } + + return 1; +} +NOKPROBE_SYMBOL(reenter_kprobe); + +static nokprobe_inline int kprobe_is_ss(struct kprobe_ctlblk *kcb) +{ + return (kcb->kprobe_status == KPROBE_HIT_SS || + kcb->kprobe_status == KPROBE_REENTER); +} + +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate and they + * remain disabled throughout this function. + */ +int kprobe_int3_handler(struct pt_regs *regs) +{ + kprobe_opcode_t *addr; + struct kprobe *p; + struct kprobe_ctlblk *kcb; + + if (user_mode(regs)) + return 0; + + addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); + /* + * We don't want to be preempted for the entire duration of kprobe + * processing. Since int3 and debug trap disables irqs and we clear + * IF while singlestepping, it must be no preemptible. + */ + + kcb = get_kprobe_ctlblk(); + p = get_kprobe(addr); + + if (p) { + if (kprobe_running()) { + if (reenter_kprobe(p, regs, kcb)) + return 1; + } else { + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + /* + * If we have no pre-handler or it returned 0, we + * continue with normal processing. If we have a + * pre-handler and it returned non-zero, that means + * user handler setup registers to exit to another + * instruction, we must skip the single stepping. + */ + if (!p->pre_handler || !p->pre_handler(p, regs)) + setup_singlestep(p, regs, kcb, 0); + else + reset_current_kprobe(); + return 1; + } + } else if (kprobe_is_ss(kcb)) { + p = kprobe_running(); + if ((unsigned long)p->ainsn.insn < regs->ip && + (unsigned long)p->ainsn.insn + MAX_INSN_SIZE > regs->ip) { + /* Most provably this is the second int3 for singlestep */ + resume_singlestep(p, regs, kcb); + kprobe_post_process(p, regs, kcb); + return 1; + } + } + + if (*addr != INT3_INSN_OPCODE) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + * Back up over the (now missing) int3 and run + * the original instruction. + */ + regs->ip = (unsigned long)addr; + return 1; + } /* else: not a kprobe fault; let the kernel handle it */ + + return 0; +} +NOKPROBE_SYMBOL(kprobe_int3_handler); + +/* + * When a retprobed function returns, this code saves registers and + * calls trampoline_handler() runs, which calls the kretprobe's handler. + */ +asm( + ".text\n" + ".global kretprobe_trampoline\n" + ".type kretprobe_trampoline, @function\n" + "kretprobe_trampoline:\n" + /* We don't bother saving the ss register */ +#ifdef CONFIG_X86_64 + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rdi\n" + " call trampoline_handler\n" + /* Replace saved sp with true return address. */ + " movq %rax, 19*8(%rsp)\n" + RESTORE_REGS_STRING + " popfq\n" +#else + " pushl %esp\n" + " pushfl\n" + SAVE_REGS_STRING + " movl %esp, %eax\n" + " call trampoline_handler\n" + /* Replace saved sp with true return address. */ + " movl %eax, 15*4(%esp)\n" + RESTORE_REGS_STRING + " popfl\n" +#endif + ASM_RET + ".size kretprobe_trampoline, .-kretprobe_trampoline\n" +); +NOKPROBE_SYMBOL(kretprobe_trampoline); +STACK_FRAME_NON_STANDARD(kretprobe_trampoline); + + +/* + * Called from kretprobe_trampoline + */ +__used __visible void *trampoline_handler(struct pt_regs *regs) +{ + /* fixup registers */ + regs->cs = __KERNEL_CS; +#ifdef CONFIG_X86_32 + regs->gs = 0; +#endif + regs->ip = (unsigned long)&kretprobe_trampoline; + regs->orig_ax = ~0UL; + + return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, ®s->sp); +} +NOKPROBE_SYMBOL(trampoline_handler); + +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) { + /* This must happen on single-stepping */ + WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS && + kcb->kprobe_status != KPROBE_REENTER); + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the ip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. + */ + regs->ip = (unsigned long)cur->addr; + + /* + * If the IF flag was set before the kprobe hit, + * don't touch it: + */ + regs->flags |= kcb->kprobe_old_flags; + + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); + } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || + kcb->kprobe_status == KPROBE_HIT_SSDONE) { + /* + * We increment the nmissed count for accounting, + * we can also use npre/npostfault count for accounting + * these specific fault cases. + */ + kprobes_inc_nmissed_count(cur); + + /* + * We come here because instructions in the pre/post + * handler caused the page_fault, this could happen + * if handler tries to access user space by + * copy_from_user(), get_user() etc. Let the + * user-specified handler try to fix it first. + */ + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + } + + return 0; +} +NOKPROBE_SYMBOL(kprobe_fault_handler); + +int __init arch_populate_kprobe_blacklist(void) +{ + return kprobe_add_area_blacklist((unsigned long)__entry_text_start, + (unsigned long)__entry_text_end); +} + +int __init arch_init_kprobes(void) +{ + return 0; +} + +int arch_trampoline_kprobe(struct kprobe *p) +{ + return 0; +} diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c new file mode 100644 index 000000000..681a4b36e --- /dev/null +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Dynamic Ftrace based Kprobes Optimization + * + * Copyright (C) Hitachi Ltd., 2012 + */ +#include +#include +#include +#include +#include + +#include "common.h" + +/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + struct kprobe *p; + struct kprobe_ctlblk *kcb; + + /* Preempt is disabled by ftrace */ + p = get_kprobe((kprobe_opcode_t *)ip); + if (unlikely(!p) || kprobe_disabled(p)) + return; + + kcb = get_kprobe_ctlblk(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + } else { + unsigned long orig_ip = regs->ip; + /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ + regs->ip = ip + sizeof(kprobe_opcode_t); + + __this_cpu_write(current_kprobe, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (!p->pre_handler || !p->pre_handler(p, regs)) { + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + regs->ip = orig_ip; + } + /* + * If pre_handler returns !0, it changes regs->ip. We have to + * skip emulating post_handler. + */ + __this_cpu_write(current_kprobe, NULL); + } +} +NOKPROBE_SYMBOL(kprobe_ftrace_handler); + +int arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + p->ainsn.boostable = false; + return 0; +} diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c new file mode 100644 index 000000000..e37e5e824 --- /dev/null +++ b/arch/x86/kernel/kprobes/opt.c @@ -0,0 +1,556 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Kernel Probes Jump Optimization (Optprobes) + * + * Copyright (C) IBM Corporation, 2002, 2004 + * Copyright (C) Hitachi Ltd., 2012 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ + struct optimized_kprobe *op; + struct kprobe *kp; + long offs; + int i; + + for (i = 0; i < JMP32_INSN_SIZE; i++) { + kp = get_kprobe((void *)addr - i); + /* This function only handles jump-optimized kprobe */ + if (kp && kprobe_optimized(kp)) { + op = container_of(kp, struct optimized_kprobe, kp); + /* If op is optimized or under unoptimizing */ + if (list_empty(&op->list) || optprobe_queued_unopt(op)) + goto found; + } + } + + return addr; +found: + /* + * If the kprobe can be optimized, original bytes which can be + * overwritten by jump destination address. In this case, original + * bytes must be recovered from op->optinsn.copied_insn buffer. + */ + if (copy_from_kernel_nofault(buf, (void *)addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) + return 0UL; + + if (addr == (unsigned long)kp->addr) { + buf[0] = kp->opcode; + memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE); + } else { + offs = addr - (unsigned long)kp->addr - 1; + memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs); + } + + return (unsigned long)buf; +} + +static void synthesize_clac(kprobe_opcode_t *addr) +{ + /* + * Can't be static_cpu_has() due to how objtool treats this feature bit. + * This isn't a fast path anyway. + */ + if (!boot_cpu_has(X86_FEATURE_SMAP)) + return; + + /* Replace the NOP3 with CLAC */ + addr[0] = 0x0f; + addr[1] = 0x01; + addr[2] = 0xca; +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ +static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) +{ +#ifdef CONFIG_X86_64 + *addr++ = 0x48; + *addr++ = 0xbf; +#else + *addr++ = 0xb8; +#endif + *(unsigned long *)addr = val; +} + +asm ( + ".pushsection .rodata\n" + "optprobe_template_func:\n" + ".global optprobe_template_entry\n" + "optprobe_template_entry:\n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + ".global optprobe_template_clac\n" + "optprobe_template_clac:\n" + ASM_NOP3 + SAVE_REGS_STRING + " movq %rsp, %rsi\n" + ".global optprobe_template_val\n" + "optprobe_template_val:\n" + ASM_NOP5 + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call:\n" + ASM_NOP5 + /* Move flags to rsp */ + " movq 18*8(%rsp), %rdx\n" + " movq %rdx, 19*8(%rsp)\n" + RESTORE_REGS_STRING + /* Skip flags entry */ + " addq $8, %rsp\n" + " popfq\n" +#else /* CONFIG_X86_32 */ + " pushl %esp\n" + " pushfl\n" + ".global optprobe_template_clac\n" + "optprobe_template_clac:\n" + ASM_NOP3 + SAVE_REGS_STRING + " movl %esp, %edx\n" + ".global optprobe_template_val\n" + "optprobe_template_val:\n" + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call:\n" + ASM_NOP5 + /* Move flags into esp */ + " movl 14*4(%esp), %edx\n" + " movl %edx, 15*4(%esp)\n" + RESTORE_REGS_STRING + /* Skip flags entry */ + " addl $4, %esp\n" + " popfl\n" +#endif + ".global optprobe_template_end\n" + "optprobe_template_end:\n" + ".popsection\n"); + +void optprobe_template_func(void); +STACK_FRAME_NON_STANDARD(optprobe_template_func); + +#define TMPL_CLAC_IDX \ + ((long)optprobe_template_clac - (long)optprobe_template_entry) +#define TMPL_MOVE_IDX \ + ((long)optprobe_template_val - (long)optprobe_template_entry) +#define TMPL_CALL_IDX \ + ((long)optprobe_template_call - (long)optprobe_template_entry) +#define TMPL_END_IDX \ + ((long)optprobe_template_end - (long)optprobe_template_entry) + +/* Optimized kprobe call back function: called from optinsn */ +static void +optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) +{ + /* This is possible if op is under delayed unoptimizing */ + if (kprobe_disabled(&op->kp)) + return; + + preempt_disable(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + /* Save skipped registers */ + regs->cs = __KERNEL_CS; +#ifdef CONFIG_X86_32 + regs->gs = 0; +#endif + regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE; + regs->orig_ax = ~0UL; + + __this_cpu_write(current_kprobe, &op->kp); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __this_cpu_write(current_kprobe, NULL); + } + preempt_enable(); +} +NOKPROBE_SYMBOL(optimized_callback); + +static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real) +{ + struct insn insn; + int len = 0, ret; + + while (len < JMP32_INSN_SIZE) { + ret = __copy_instruction(dest + len, src + len, real + len, &insn); + if (!ret || !can_boost(&insn, src + len)) + return -EINVAL; + len += ret; + } + /* Check whether the address range is reserved */ + if (ftrace_text_reserved(src, src + len - 1) || + alternatives_text_reserved(src, src + len - 1) || + jump_label_text_reserved(src, src + len - 1) || + static_call_text_reserved(src, src + len - 1)) + return -EBUSY; + + return len; +} + +/* Check whether insn is indirect jump */ +static int __insn_is_indirect_jump(struct insn *insn) +{ + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ + insn->opcode.bytes[0] == 0xea); /* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ + unsigned long target = 0; + + switch (insn->opcode.bytes[0]) { + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + case 0xe3: /* jcxz */ + case 0xe9: /* near relative jump */ + case 0xeb: /* short relative jump */ + break; + case 0x0f: + if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ + break; + return 0; + default: + if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ + break; + return 0; + } + target = (unsigned long)insn->next_byte + insn->immediate.value; + + return (start <= target && target <= start + len); +} + +static int insn_is_indirect_jump(struct insn *insn) +{ + int ret = __insn_is_indirect_jump(insn); + +#ifdef CONFIG_RETPOLINE + /* + * Jump to x86_indirect_thunk_* is treated as an indirect jump. + * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with + * older gcc may use indirect jump. So we add this check instead of + * replace indirect-jump check. + */ + if (!ret) + ret = insn_jump_into_range(insn, + (unsigned long)__indirect_thunk_start, + (unsigned long)__indirect_thunk_end - + (unsigned long)__indirect_thunk_start); +#endif + return ret; +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int can_optimize(unsigned long paddr) +{ + unsigned long addr, size = 0, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + /* Lookup symbol including addr */ + if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) + return 0; + + /* + * Do not optimize in the entry code due to the unstable + * stack handling and registers setup. + */ + if (((paddr >= (unsigned long)__entry_text_start) && + (paddr < (unsigned long)__entry_text_end))) + return 0; + + /* Check there is enough space for a relative jump. */ + if (size - offset < JMP32_INSN_SIZE) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr - offset + size) { /* Decode until function end */ + unsigned long recovered_insn; + int ret; + + if (search_exception_tables(addr)) + /* + * Since some fixup code will jumps into this function, + * we can't optimize kprobe in this function. + */ + return 0; + recovered_insn = recover_probed_instruction(buf, addr); + if (!recovered_insn) + return 0; + + ret = insn_decode(&insn, (void *)recovered_insn, MAX_INSN_SIZE, INSN_MODE_KERN); + if (ret < 0) + return 0; +#ifdef CONFIG_KGDB + /* + * If there is a dynamically installed kgdb sw breakpoint, + * this function should not be probed. + */ + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && + kgdb_has_hit_break(addr)) + return 0; +#endif + /* Recover address */ + insn.kaddr = (void *)addr; + insn.next_byte = (void *)(addr + insn.length); + /* Check any instructions don't jump into target */ + if (insn_is_indirect_jump(&insn) || + insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE, + DISP32_SIZE)) + return 0; + addr += insn.length; + } + + return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + int i; + struct kprobe *p; + + for (i = 1; i < op->optinsn.size; i++) { + p = get_kprobe(op->kp.addr + i); + if (p && !kprobe_disarmed(p)) + return -EEXIST; + } + + return 0; +} + +/* Check the addr is within the optimized instructions. */ +int arch_within_optimized_kprobe(struct optimized_kprobe *op, + unsigned long addr) +{ + return ((unsigned long)op->kp.addr <= addr && + (unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ + u8 *slot = op->optinsn.insn; + if (slot) { + int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE; + + /* Record the perf event before freeing the slot */ + if (dirty) + perf_event_text_poke(slot, slot, len, NULL, 0); + + free_optinsn_slot(slot, dirty); + op->optinsn.insn = NULL; + op->optinsn.size = 0; + } +} + +void arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + __arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + * This is called when new aggr(opt)probe is allocated or reused. + */ +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, + struct kprobe *__unused) +{ + u8 *buf = NULL, *slot; + int ret, len; + long rel; + + if (!can_optimize((unsigned long)op->kp.addr)) + return -EILSEQ; + + buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + op->optinsn.insn = slot = get_optinsn_slot(); + if (!slot) { + ret = -ENOMEM; + goto out; + } + + /* + * Verify if the address gap is in 2GB range, because this uses + * a relative jump. + */ + rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE; + if (abs(rel) > 0x7fffffff) { + ret = -ERANGE; + goto err; + } + + /* Copy arch-dep-instance from template */ + memcpy(buf, optprobe_template_entry, TMPL_END_IDX); + + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr, + slot + TMPL_END_IDX); + if (ret < 0) + goto err; + op->optinsn.size = ret; + len = TMPL_END_IDX + op->optinsn.size; + + synthesize_clac(buf + TMPL_CLAC_IDX); + + /* Set probe information */ + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + + /* Set probe function call */ + synthesize_relcall(buf + TMPL_CALL_IDX, + slot + TMPL_CALL_IDX, optimized_callback); + + /* Set returning jmp instruction at the tail of out-of-line buffer */ + synthesize_reljump(buf + len, slot + len, + (u8 *)op->kp.addr + op->optinsn.size); + len += JMP32_INSN_SIZE; + + /* + * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also + * used in __arch_remove_optimized_kprobe(). + */ + + /* We have to use text_poke() for instruction buffer because it is RO */ + perf_event_text_poke(slot, NULL, 0, buf, len); + text_poke(slot, buf, len); + + ret = 0; +out: + kfree(buf); + return ret; + +err: + __arch_remove_optimized_kprobe(op, 0); + goto out; +} + +/* + * Replace breakpoints (INT3) with relative jumps (JMP.d32). + * Caller must call with locking kprobe_mutex and text_mutex. + * + * The caller will have installed a regular kprobe and after that issued + * syncrhonize_rcu_tasks(), this ensures that the instruction(s) that live in + * the 4 bytes after the INT3 are unused and can now be overwritten. + */ +void arch_optimize_kprobes(struct list_head *oplist) +{ + struct optimized_kprobe *op, *tmp; + u8 insn_buff[JMP32_INSN_SIZE]; + + list_for_each_entry_safe(op, tmp, oplist, list) { + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + JMP32_INSN_SIZE)); + + WARN_ON(kprobe_disabled(&op->kp)); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE, + DISP32_SIZE); + + insn_buff[0] = JMP32_INSN_OPCODE; + *(s32 *)(&insn_buff[1]) = rel; + + text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); + + list_del_init(&op->list); + } +} + +/* + * Replace a relative jump (JMP.d32) with a breakpoint (INT3). + * + * After that, we can restore the 4 bytes after the INT3 to undo what + * arch_optimize_kprobes() scribbled. This is safe since those bytes will be + * unused once the INT3 lands. + */ +void arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, }; + u8 old[JMP32_INSN_SIZE]; + u8 *addr = op->kp.addr; + + memcpy(old, op->kp.addr, JMP32_INSN_SIZE); + memcpy(new + INT3_INSN_SIZE, + op->optinsn.copied_insn, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + + text_poke(addr, new, INT3_INSN_SIZE); + text_poke_sync(); + text_poke(addr + INT3_INSN_SIZE, + new + INT3_INSN_SIZE, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + text_poke_sync(); + + perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); +} + +/* + * Recover original instructions and breakpoints from relative jumps. + * Caller must call with locking kprobe_mutex. + */ +extern void arch_unoptimize_kprobes(struct list_head *oplist, + struct list_head *done_list) +{ + struct optimized_kprobe *op, *tmp; + + list_for_each_entry_safe(op, tmp, oplist, list) { + arch_unoptimize_kprobe(op); + list_move(&op->list, done_list); + } +} + +int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +{ + struct optimized_kprobe *op; + + if (p->flags & KPROBE_FLAG_OPTIMIZED) { + /* This kprobe is really able to run optimized path. */ + op = container_of(p, struct optimized_kprobe, kp); + /* Detour through copied instructions */ + regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; + if (!reenter) + reset_current_kprobe(); + return 1; + } + return 0; +} +NOKPROBE_SYMBOL(setup_detour_execution); diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c new file mode 100644 index 000000000..257892fce --- /dev/null +++ b/arch/x86/kernel/ksysfs.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Architecture specific sysfs attributes in /sys/kernel + * + * Copyright (C) 2007, Intel Corp. + * Huang Ying + * Copyright (C) 2013, 2013 Red Hat, Inc. + * Dave Young + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static ssize_t version_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "0x%04x\n", boot_params.hdr.version); +} + +static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version); + +static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + memcpy(buf, (void *)&boot_params + off, count); + return count; +} + +static struct bin_attribute boot_params_data_attr = { + .attr = { + .name = "data", + .mode = S_IRUGO, + }, + .read = boot_params_data_read, + .size = sizeof(boot_params), +}; + +static struct attribute *boot_params_version_attrs[] = { + &boot_params_version_attr.attr, + NULL, +}; + +static struct bin_attribute *boot_params_data_attrs[] = { + &boot_params_data_attr, + NULL, +}; + +static const struct attribute_group boot_params_attr_group = { + .attrs = boot_params_version_attrs, + .bin_attrs = boot_params_data_attrs, +}; + +static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr) +{ + const char *name; + + name = kobject_name(kobj); + return kstrtoint(name, 10, nr); +} + +static int get_setup_data_paddr(int nr, u64 *paddr) +{ + int i = 0; + struct setup_data *data; + u64 pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + if (nr == i) { + *paddr = pa_data; + return 0; + } + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) + return -ENOMEM; + + pa_data = data->next; + memunmap(data); + i++; + } + return -EINVAL; +} + +static int __init get_setup_data_size(int nr, size_t *size) +{ + u64 pa_data = boot_params.hdr.setup_data, pa_next; + struct setup_indirect *indirect; + struct setup_data *data; + int i = 0; + u32 len; + + while (pa_data) { + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) + return -ENOMEM; + pa_next = data->next; + + if (nr == i) { + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(pa_data, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) + *size = indirect->len; + else + *size = data->len; + } else { + *size = data->len; + } + + memunmap(data); + return 0; + } + + pa_data = pa_next; + memunmap(data); + i++; + } + return -EINVAL; +} + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct setup_indirect *indirect; + struct setup_data *data; + int nr, ret; + u64 paddr; + u32 len; + + ret = kobj_to_setup_data_nr(kobj, &nr); + if (ret) + return ret; + + ret = get_setup_data_paddr(nr, &paddr); + if (ret) + return ret; + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); + if (!data) + return -ENOMEM; + + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(paddr, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + ret = sprintf(buf, "0x%x\n", indirect->type); + } else { + ret = sprintf(buf, "0x%x\n", data->type); + } + + memunmap(data); + return ret; +} + +static ssize_t setup_data_data_read(struct file *fp, + struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, + loff_t off, size_t count) +{ + struct setup_indirect *indirect; + struct setup_data *data; + int nr, ret = 0; + u64 paddr, len; + void *p; + + ret = kobj_to_setup_data_nr(kobj, &nr); + if (ret) + return ret; + + ret = get_setup_data_paddr(nr, &paddr); + if (ret) + return ret; + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); + if (!data) + return -ENOMEM; + + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(paddr, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + paddr = indirect->addr; + len = indirect->len; + } else { + /* + * Even though this is technically undefined, return + * the data as though it is a normal setup_data struct. + * This will at least allow it to be inspected. + */ + paddr += sizeof(*data); + len = data->len; + } + } else { + paddr += sizeof(*data); + len = data->len; + } + + if (off > len) { + ret = -EINVAL; + goto out; + } + + if (count > len - off) + count = len - off; + + if (!count) + goto out; + + ret = count; + p = memremap(paddr, len, MEMREMAP_WB); + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(buf, p + off, count); + memunmap(p); +out: + memunmap(data); + return ret; +} + +static struct kobj_attribute type_attr = __ATTR_RO(type); + +static struct bin_attribute data_attr __ro_after_init = { + .attr = { + .name = "data", + .mode = S_IRUGO, + }, + .read = setup_data_data_read, +}; + +static struct attribute *setup_data_type_attrs[] = { + &type_attr.attr, + NULL, +}; + +static struct bin_attribute *setup_data_data_attrs[] = { + &data_attr, + NULL, +}; + +static const struct attribute_group setup_data_attr_group = { + .attrs = setup_data_type_attrs, + .bin_attrs = setup_data_data_attrs, +}; + +static int __init create_setup_data_node(struct kobject *parent, + struct kobject **kobjp, int nr) +{ + int ret = 0; + size_t size; + struct kobject *kobj; + char name[16]; /* should be enough for setup_data nodes numbers */ + snprintf(name, 16, "%d", nr); + + kobj = kobject_create_and_add(name, parent); + if (!kobj) + return -ENOMEM; + + ret = get_setup_data_size(nr, &size); + if (ret) + goto out_kobj; + + data_attr.size = size; + ret = sysfs_create_group(kobj, &setup_data_attr_group); + if (ret) + goto out_kobj; + *kobjp = kobj; + + return 0; +out_kobj: + kobject_put(kobj); + return ret; +} + +static void __init cleanup_setup_data_node(struct kobject *kobj) +{ + sysfs_remove_group(kobj, &setup_data_attr_group); + kobject_put(kobj); +} + +static int __init get_setup_data_total_num(u64 pa_data, int *nr) +{ + int ret = 0; + struct setup_data *data; + + *nr = 0; + while (pa_data) { + *nr += 1; + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) { + ret = -ENOMEM; + goto out; + } + pa_data = data->next; + memunmap(data); + } + +out: + return ret; +} + +static int __init create_setup_data_nodes(struct kobject *parent) +{ + struct kobject *setup_data_kobj, **kobjp; + u64 pa_data; + int i, j, nr, ret = 0; + + pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return 0; + + setup_data_kobj = kobject_create_and_add("setup_data", parent); + if (!setup_data_kobj) { + ret = -ENOMEM; + goto out; + } + + ret = get_setup_data_total_num(pa_data, &nr); + if (ret) + goto out_setup_data_kobj; + + kobjp = kmalloc_array(nr, sizeof(*kobjp), GFP_KERNEL); + if (!kobjp) { + ret = -ENOMEM; + goto out_setup_data_kobj; + } + + for (i = 0; i < nr; i++) { + ret = create_setup_data_node(setup_data_kobj, kobjp + i, i); + if (ret) + goto out_clean_nodes; + } + + kfree(kobjp); + return 0; + +out_clean_nodes: + for (j = i - 1; j >= 0; j--) + cleanup_setup_data_node(*(kobjp + j)); + kfree(kobjp); +out_setup_data_kobj: + kobject_put(setup_data_kobj); +out: + return ret; +} + +static int __init boot_params_ksysfs_init(void) +{ + int ret; + struct kobject *boot_params_kobj; + + boot_params_kobj = kobject_create_and_add("boot_params", + kernel_kobj); + if (!boot_params_kobj) { + ret = -ENOMEM; + goto out; + } + + ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group); + if (ret) + goto out_boot_params_kobj; + + ret = create_setup_data_nodes(boot_params_kobj); + if (ret) + goto out_create_group; + + return 0; +out_create_group: + sysfs_remove_group(boot_params_kobj, &boot_params_attr_group); +out_boot_params_kobj: + kobject_put(boot_params_kobj); +out: + return ret; +} + +arch_initcall(boot_params_ksysfs_init); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c new file mode 100644 index 000000000..fe9babe94 --- /dev/null +++ b/arch/x86/kernel/kvm.c @@ -0,0 +1,1054 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * KVM paravirt_ops implementation + * + * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar + * Copyright IBM Corporation, 2007 + * Authors: Anthony Liguori + */ + +#define pr_fmt(fmt) "kvm-guest: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); + +static int kvmapf = 1; + +static int __init parse_no_kvmapf(char *arg) +{ + kvmapf = 0; + return 0; +} + +early_param("no-kvmapf", parse_no_kvmapf); + +static int steal_acc = 1; +static int __init parse_no_stealacc(char *arg) +{ + steal_acc = 0; + return 0; +} + +early_param("no-steal-acc", parse_no_stealacc); + +static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); +DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; +static int has_steal_clock = 0; + +static int has_guest_poll = 0; +/* + * No need for any "IO delay" on KVM + */ +static void kvm_io_delay(void) +{ +} + +#define KVM_TASK_SLEEP_HASHBITS 8 +#define KVM_TASK_SLEEP_HASHSIZE (1<list) { + struct kvm_task_sleep_node *n = + hlist_entry(p, typeof(*n), link); + if (n->token == token) + return n; + } + + return NULL; +} + +static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n) +{ + u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); + struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; + struct kvm_task_sleep_node *e; + + raw_spin_lock(&b->lock); + e = _find_apf_task(b, token); + if (e) { + /* dummy entry exist -> wake up was delivered ahead of PF */ + hlist_del(&e->link); + raw_spin_unlock(&b->lock); + kfree(e); + return false; + } + + n->token = token; + n->cpu = smp_processor_id(); + init_swait_queue_head(&n->wq); + hlist_add_head(&n->link, &b->list); + raw_spin_unlock(&b->lock); + return true; +} + +/* + * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled + * @token: Token to identify the sleep node entry + * + * Invoked from the async pagefault handling code or from the VM exit page + * fault handler. In both cases RCU is watching. + */ +void kvm_async_pf_task_wait_schedule(u32 token) +{ + struct kvm_task_sleep_node n; + DECLARE_SWAITQUEUE(wait); + + lockdep_assert_irqs_disabled(); + + if (!kvm_async_pf_queue_task(token, &n)) + return; + + for (;;) { + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + if (hlist_unhashed(&n.link)) + break; + + local_irq_enable(); + schedule(); + local_irq_disable(); + } + finish_swait(&n.wq, &wait); +} +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); + +static void apf_task_wake_one(struct kvm_task_sleep_node *n) +{ + hlist_del_init(&n->link); + if (swq_has_sleeper(&n->wq)) + swake_up_one(&n->wq); +} + +static void apf_task_wake_all(void) +{ + int i; + + for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { + struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; + struct kvm_task_sleep_node *n; + struct hlist_node *p, *next; + + raw_spin_lock(&b->lock); + hlist_for_each_safe(p, next, &b->list) { + n = hlist_entry(p, typeof(*n), link); + if (n->cpu == smp_processor_id()) + apf_task_wake_one(n); + } + raw_spin_unlock(&b->lock); + } +} + +void kvm_async_pf_task_wake(u32 token) +{ + u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); + struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; + struct kvm_task_sleep_node *n, *dummy = NULL; + + if (token == ~0) { + apf_task_wake_all(); + return; + } + +again: + raw_spin_lock(&b->lock); + n = _find_apf_task(b, token); + if (!n) { + /* + * Async #PF not yet handled, add a dummy entry for the token. + * Allocating the token must be down outside of the raw lock + * as the allocator is preemptible on PREEMPT_RT kernels. + */ + if (!dummy) { + raw_spin_unlock(&b->lock); + dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC); + + /* + * Continue looping on allocation failure, eventually + * the async #PF will be handled and allocating a new + * node will be unnecessary. + */ + if (!dummy) + cpu_relax(); + + /* + * Recheck for async #PF completion before enqueueing + * the dummy token to avoid duplicate list entries. + */ + goto again; + } + dummy->token = token; + dummy->cpu = smp_processor_id(); + init_swait_queue_head(&dummy->wq); + hlist_add_head(&dummy->link, &b->list); + dummy = NULL; + } else { + apf_task_wake_one(n); + } + raw_spin_unlock(&b->lock); + + /* A dummy token might be allocated and ultimately not used. */ + if (dummy) + kfree(dummy); +} +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); + +noinstr u32 kvm_read_and_reset_apf_flags(void) +{ + u32 flags = 0; + + if (__this_cpu_read(apf_reason.enabled)) { + flags = __this_cpu_read(apf_reason.flags); + __this_cpu_write(apf_reason.flags, 0); + } + + return flags; +} +EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); + +noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) +{ + u32 flags = kvm_read_and_reset_apf_flags(); + irqentry_state_t state; + + if (!flags) + return false; + + state = irqentry_enter(regs); + instrumentation_begin(); + + /* + * If the host managed to inject an async #PF into an interrupt + * disabled region, then die hard as this is not going to end well + * and the host side is seriously broken. + */ + if (unlikely(!(regs->flags & X86_EFLAGS_IF))) + panic("Host injected async #PF in interrupt disabled region\n"); + + if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { + if (unlikely(!(user_mode(regs)))) + panic("Host injected async #PF in kernel mode\n"); + /* Page is swapped out by the host. */ + kvm_async_pf_task_wait_schedule(token); + } else { + WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags); + } + + instrumentation_end(); + irqentry_exit(regs, state); + return true; +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + u32 token; + + ack_APIC_irq(); + + inc_irq_stat(irq_hv_callback_count); + + if (__this_cpu_read(apf_reason.enabled)) { + token = __this_cpu_read(apf_reason.token); + kvm_async_pf_task_wake(token); + __this_cpu_write(apf_reason.token, 0); + wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1); + } + + set_irq_regs(old_regs); +} + +static void __init paravirt_ops_setup(void) +{ + pv_info.name = "KVM"; + + if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) + pv_ops.cpu.io_delay = kvm_io_delay; + +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif +} + +static void kvm_register_steal_time(void) +{ + int cpu = smp_processor_id(); + struct kvm_steal_time *st = &per_cpu(steal_time, cpu); + + if (!has_steal_clock) + return; + + wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); + pr_info("stealtime: cpu %d, msr %llx\n", cpu, + (unsigned long long) slow_virt_to_phys(st)); +} + +static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; + +static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) +{ + /** + * This relies on __test_and_clear_bit to modify the memory + * in a way that is atomic with respect to the local CPU. + * The hypervisor only accesses this memory from the local CPU so + * there's no need for lock or memory barriers. + * An optimization barrier is implied in apic write. + */ + if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) + return; + apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK); +} + +static void kvm_guest_cpu_init(void) +{ + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { + u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); + + WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); + + pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); + pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; + + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) + pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + + wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); + + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); + __this_cpu_write(apf_reason.enabled, 1); + pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); + } + + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { + unsigned long pa; + + /* Size alignment is implied but just to make it explicit. */ + BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); + __this_cpu_write(kvm_apic_eoi, 0); + pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi)) + | KVM_MSR_ENABLED; + wrmsrl(MSR_KVM_PV_EOI_EN, pa); + } + + if (has_steal_clock) + kvm_register_steal_time(); +} + +static void kvm_pv_disable_apf(void) +{ + if (!__this_cpu_read(apf_reason.enabled)) + return; + + wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); + __this_cpu_write(apf_reason.enabled, 0); + + pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id()); +} + +static void kvm_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + wrmsr(MSR_KVM_STEAL_TIME, 0, 0); +} + +static void kvm_pv_guest_cpu_reboot(void *unused) +{ + /* + * We disable PV EOI before we load a new kernel by kexec, + * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. + * New kernel can re-enable when it boots. + */ + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); + kvm_disable_steal_time(); +} + +static int kvm_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); + return NOTIFY_DONE; +} + +static struct notifier_block kvm_pv_reboot_nb = { + .notifier_call = kvm_pv_reboot_notify, +}; + +static u64 kvm_steal_clock(int cpu) +{ + u64 steal; + struct kvm_steal_time *src; + int version; + + src = &per_cpu(steal_time, cpu); + do { + version = src->version; + virt_rmb(); + steal = src->steal; + virt_rmb(); + } while ((version & 1) || (version != src->version)); + + return steal; +} + +static inline void __set_percpu_decrypted(void *ptr, unsigned long size) +{ + early_set_memory_decrypted((unsigned long) ptr, size); +} + +/* + * Iterate through all possible CPUs and map the memory region pointed + * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once. + * + * Note: we iterate through all possible CPUs to ensure that CPUs + * hotplugged will have their per-cpu variable already mapped as + * decrypted. + */ +static void __init sev_map_percpu_data(void) +{ + int cpu; + + if (!sev_active()) + return; + + for_each_possible_cpu(cpu) { + __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason)); + __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time)); + __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi)); + } +} + +static bool pv_tlb_flush_supported(void) +{ + return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); +} + +static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); + +static void kvm_guest_cpu_offline(bool shutdown) +{ + kvm_disable_steal_time(); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); + if (!shutdown) + apf_task_wake_all(); + kvmclock_disable(); +} + +static int kvm_cpu_online(unsigned int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + kvm_guest_cpu_init(); + local_irq_restore(flags); + return 0; +} + +#ifdef CONFIG_SMP + +static bool pv_ipi_supported(void) +{ + return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI); +} + +static bool pv_sched_yield_supported(void) +{ + return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); +} + +#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) + +static void __send_ipi_mask(const struct cpumask *mask, int vector) +{ + unsigned long flags; + int cpu, apic_id, icr; + int min = 0, max = 0; +#ifdef CONFIG_X86_64 + __uint128_t ipi_bitmap = 0; +#else + u64 ipi_bitmap = 0; +#endif + long ret; + + if (cpumask_empty(mask)) + return; + + local_irq_save(flags); + + switch (vector) { + default: + icr = APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr = APIC_DM_NMI; + break; + } + + for_each_cpu(cpu, mask) { + apic_id = per_cpu(x86_cpu_to_apicid, cpu); + if (!ipi_bitmap) { + min = max = apic_id; + } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { + ipi_bitmap <<= min - apic_id; + min = apic_id; + } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) { + max = apic_id < max ? max : apic_id; + } else { + ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); + min = max = apic_id; + ipi_bitmap = 0; + } + __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap); + } + + if (ipi_bitmap) { + ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); + } + + local_irq_restore(flags); +} + +static void kvm_send_ipi_mask(const struct cpumask *mask, int vector) +{ + __send_ipi_mask(mask, vector); +} + +static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int this_cpu = smp_processor_id(); + struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); + const struct cpumask *local_mask; + + cpumask_copy(new_mask, mask); + cpumask_clear_cpu(this_cpu, new_mask); + local_mask = new_mask; + __send_ipi_mask(local_mask, vector); +} + +/* + * Set the IPI entry points + */ +static void kvm_setup_pv_ipi(void) +{ + apic->send_IPI_mask = kvm_send_ipi_mask; + apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; + pr_info("setup PV IPIs\n"); +} + +static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) +{ + int cpu; + + native_send_call_func_ipi(mask); + + /* Make sure other vCPUs get a chance to run if they need to. */ + for_each_cpu(cpu, mask) { + if (vcpu_is_preempted(cpu)) { + kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); + break; + } + } +} + +static void __init kvm_smp_prepare_boot_cpu(void) +{ + /* + * Map the per-cpu variables as decrypted before kvm_guest_cpu_init() + * shares the guest physical address with the hypervisor. + */ + sev_map_percpu_data(); + + kvm_guest_cpu_init(); + native_smp_prepare_boot_cpu(); + kvm_spinlock_init(); +} + +static int kvm_cpu_down_prepare(unsigned int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + kvm_guest_cpu_offline(false); + local_irq_restore(flags); + return 0; +} + +#endif + +static int kvm_suspend(void) +{ + u64 val = 0; + + kvm_guest_cpu_offline(false); + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) + rdmsrl(MSR_KVM_POLL_CONTROL, val); + has_guest_poll = !(val & 1); +#endif + return 0; +} + +static void kvm_resume(void) +{ + kvm_cpu_online(raw_smp_processor_id()); + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) + wrmsrl(MSR_KVM_POLL_CONTROL, 0); +#endif +} + +static struct syscore_ops kvm_syscore_ops = { + .suspend = kvm_suspend, + .resume = kvm_resume, +}; + +/* + * After a PV feature is registered, the host will keep writing to the + * registered memory location. If the guest happens to shutdown, this memory + * won't be valid. In cases like kexec, in which you install a new kernel, this + * means a random memory location will be kept being written. + */ +#ifdef CONFIG_KEXEC_CORE +static void kvm_crash_shutdown(struct pt_regs *regs) +{ + kvm_guest_cpu_offline(true); + native_machine_crash_shutdown(regs); +} +#endif + +static void kvm_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + u8 state; + int cpu; + struct kvm_steal_time *src; + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); + + cpumask_copy(flushmask, cpumask); + /* + * We have to call flush only on online vCPUs. And + * queue flush_on_enter for pre-empted vCPUs + */ + for_each_cpu(cpu, flushmask) { + src = &per_cpu(steal_time, cpu); + state = READ_ONCE(src->preempted); + if ((state & KVM_VCPU_PREEMPTED)) { + if (try_cmpxchg(&src->preempted, &state, + state | KVM_VCPU_FLUSH_TLB)) + __cpumask_clear_cpu(cpu, flushmask); + } + } + + native_flush_tlb_others(flushmask, info); +} + +static void __init kvm_guest_init(void) +{ + int i; + + paravirt_ops_setup(); + register_reboot_notifier(&kvm_pv_reboot_nb); + for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) + raw_spin_lock_init(&async_pf_sleepers[i].lock); + + if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + has_steal_clock = 1; + pv_ops.time.steal_clock = kvm_steal_clock; + } + + if (pv_tlb_flush_supported()) { + pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others; + pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); + } + + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + apic_set_eoi_write(kvm_guest_apic_eoi_write); + + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { + static_branch_enable(&kvm_async_pf_enabled); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt); + } + +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; + if (pv_sched_yield_supported()) { + smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; + pr_info("setup PV sched yield\n"); + } + if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", + kvm_cpu_online, kvm_cpu_down_prepare) < 0) + pr_err("failed to install cpu hotplug callbacks\n"); +#else + sev_map_percpu_data(); + kvm_guest_cpu_init(); +#endif + +#ifdef CONFIG_KEXEC_CORE + machine_ops.crash_shutdown = kvm_crash_shutdown; +#endif + + register_syscore_ops(&kvm_syscore_ops); + + /* + * Hard lockup detection is enabled by default. Disable it, as guests + * can get false positives too easily, for example if the host is + * overcommitted. + */ + hardlockup_detector_disable(); +} + +static noinline uint32_t __kvm_cpuid_base(void) +{ + if (boot_cpu_data.cpuid_level < 0) + return 0; /* So we don't blow up on old processors */ + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); + + return 0; +} + +static inline uint32_t kvm_cpuid_base(void) +{ + static int kvm_cpuid_base = -1; + + if (kvm_cpuid_base == -1) + kvm_cpuid_base = __kvm_cpuid_base(); + + return kvm_cpuid_base; +} + +bool kvm_para_available(void) +{ + return kvm_cpuid_base() != 0; +} +EXPORT_SYMBOL_GPL(kvm_para_available); + +unsigned int kvm_arch_para_features(void) +{ + return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES); +} + +unsigned int kvm_arch_para_hints(void) +{ + return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES); +} +EXPORT_SYMBOL_GPL(kvm_arch_para_hints); + +static uint32_t __init kvm_detect(void) +{ + return kvm_cpuid_base(); +} + +static void __init kvm_apic_init(void) +{ +#if defined(CONFIG_SMP) + if (pv_ipi_supported()) + kvm_setup_pv_ipi(); +#endif +} + +static void __init kvm_init_platform(void) +{ + kvmclock_init(); + x86_platform.apic_post_init = kvm_apic_init; +} + +#if defined(CONFIG_AMD_MEM_ENCRYPT) +static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* RAX and CPL are already in the GHCB */ + ghcb_set_rbx(ghcb, regs->bx); + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_rsi(ghcb, regs->si); +} + +static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* No checking of the return state needed */ + return true; +} +#endif + +const __initconst struct hypervisor_x86 x86_hyper_kvm = { + .name = "KVM", + .detect = kvm_detect, + .type = X86_HYPER_KVM, + .init.guest_late_init = kvm_guest_init, + .init.x2apic_available = kvm_para_available, + .init.init_platform = kvm_init_platform, +#if defined(CONFIG_AMD_MEM_ENCRYPT) + .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish, +#endif +}; + +static __init int activate_jump_labels(void) +{ + if (has_steal_clock) { + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); + } + + return 0; +} +arch_initcall(activate_jump_labels); + +static __init int kvm_alloc_cpumask(void) +{ + int cpu; + bool alloc = false; + + if (!kvm_para_available() || nopv) + return 0; + + if (pv_tlb_flush_supported()) + alloc = true; + +#if defined(CONFIG_SMP) + if (pv_ipi_supported()) + alloc = true; +#endif + + if (alloc) + for_each_possible_cpu(cpu) { + zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + } + + return 0; +} +arch_initcall(kvm_alloc_cpumask); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +/* Kick a cpu by its apicid. Used to wake up a halted vcpu */ +static void kvm_kick_cpu(int cpu) +{ + int apicid; + unsigned long flags = 0; + + apicid = per_cpu(x86_cpu_to_apicid, cpu); + kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); +} + +#include + +static void kvm_wait(u8 *ptr, u8 val) +{ + unsigned long flags; + + if (in_nmi()) + return; + + local_irq_save(flags); + + if (READ_ONCE(*ptr) != val) + goto out; + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. + */ + if (arch_irqs_disabled_flags(flags)) + halt(); + else + safe_halt(); + +out: + local_irq_restore(flags); +} + +#ifdef CONFIG_X86_32 +__visible bool __kvm_vcpu_is_preempted(long cpu) +{ + struct kvm_steal_time *src = &per_cpu(steal_time, cpu); + + return !!(src->preempted & KVM_VCPU_PREEMPTED); +} +PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); + +#else + +#include + +extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); + +/* + * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and + * restoring to/from the stack. + */ +asm( +".pushsection .text;" +".global __raw_callee_save___kvm_vcpu_is_preempted;" +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" +"__raw_callee_save___kvm_vcpu_is_preempted:" +"movq __per_cpu_offset(,%rdi,8), %rax;" +"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" +"setne %al;" +ASM_RET +".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" +".popsection"); + +#endif + +/* + * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. + */ +void __init kvm_spinlock_init(void) +{ + /* + * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an + * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is + * preferred over native qspinlock when vCPU is preempted. + */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { + pr_info("PV spinlocks disabled, no host support\n"); + return; + } + + /* + * Disable PV spinlocks and use native qspinlock when dedicated pCPUs + * are available. + */ + if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { + pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n"); + goto out; + } + + if (num_possible_cpus() == 1) { + pr_info("PV spinlocks disabled, single CPU\n"); + goto out; + } + + if (nopvspin) { + pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n"); + goto out; + } + + pr_info("PV spinlocks enabled\n"); + + __pv_init_lock_hash(); + pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_ops.lock.queued_spin_unlock = + PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_ops.lock.wait = kvm_wait; + pv_ops.lock.kick = kvm_kick_cpu; + + if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + pv_ops.lock.vcpu_is_preempted = + PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); + } + /* + * When PV spinlock is enabled which is preferred over + * virt_spin_lock(), virt_spin_lock_key's value is meaningless. + * Just disable it anyway. + */ +out: + static_branch_disable(&virt_spin_lock_key); +} + +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + +static void kvm_disable_host_haltpoll(void *i) +{ + wrmsrl(MSR_KVM_POLL_CONTROL, 0); +} + +static void kvm_enable_host_haltpoll(void *i) +{ + wrmsrl(MSR_KVM_POLL_CONTROL, 1); +} + +void arch_haltpoll_enable(unsigned int cpu) +{ + if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { + pr_err_once("host does not support poll control\n"); + pr_err_once("host upgrade recommended\n"); + return; + } + + /* Enable guest halt poll disables host halt poll */ + smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1); +} +EXPORT_SYMBOL_GPL(arch_haltpoll_enable); + +void arch_haltpoll_disable(unsigned int cpu) +{ + if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) + return; + + /* Disable guest halt poll enables host halt poll */ + smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1); +} +EXPORT_SYMBOL_GPL(arch_haltpoll_disable); +#endif diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c new file mode 100644 index 000000000..b79b4ee49 --- /dev/null +++ b/arch/x86/kernel/kvmclock.c @@ -0,0 +1,347 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* KVM paravirtual clock driver. A clocksource implementation + Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static int kvmclock __initdata = 1; +static int kvmclock_vsyscall __initdata = 1; +static int msr_kvm_system_time __ro_after_init; +static int msr_kvm_wall_clock __ro_after_init; +static u64 kvm_sched_clock_offset __ro_after_init; + +static int __init parse_no_kvmclock(char *arg) +{ + kvmclock = 0; + return 0; +} +early_param("no-kvmclock", parse_no_kvmclock); + +static int __init parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + +/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ +#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) +#define HVC_BOOT_ARRAY_SIZE \ + (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) + +static struct pvclock_vsyscall_time_info + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock __bss_decrypted; +static struct pvclock_vsyscall_time_info *hvclock_mem; +DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu); + +/* + * The wallclock is the time of day when we booted. Since then, some time may + * have elapsed since the hypervisor wrote the data. So we try to account for + * that with system time + */ +static void kvm_get_wallclock(struct timespec64 *now) +{ + wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); + preempt_disable(); + pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now); + preempt_enable(); +} + +static int kvm_set_wallclock(const struct timespec64 *now) +{ + return -ENODEV; +} + +static u64 kvm_clock_read(void) +{ + u64 ret; + + preempt_disable_notrace(); + ret = pvclock_clocksource_read(this_cpu_pvti()); + preempt_enable_notrace(); + return ret; +} + +static u64 kvm_clock_get_cycles(struct clocksource *cs) +{ + return kvm_clock_read(); +} + +static u64 kvm_sched_clock_read(void) +{ + return kvm_clock_read() - kvm_sched_clock_offset; +} + +static inline void kvm_sched_clock_init(bool stable) +{ + if (!stable) + clear_sched_clock_stable(); + kvm_sched_clock_offset = kvm_clock_read(); + pv_ops.time.sched_clock = kvm_sched_clock_read; + + pr_info("kvm-clock: using sched offset of %llu cycles", + kvm_sched_clock_offset); + + BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); +} + +/* + * If we don't do that, there is the possibility that the guest + * will calibrate under heavy load - thus, getting a lower lpj - + * and execute the delays themselves without load. This is wrong, + * because no delay loop can finish beforehand. + * Any heuristics is subject to fail, because ultimately, a large + * poll of guests can be running and trouble each other. So we preset + * lpj here + */ +static unsigned long kvm_get_tsc_khz(void) +{ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + return pvclock_tsc_khz(this_cpu_pvti()); +} + +static void __init kvm_get_preset_lpj(void) +{ + unsigned long khz; + u64 lpj; + + khz = kvm_get_tsc_khz(); + + lpj = ((u64)khz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; +} + +bool kvm_check_and_clear_guest_paused(void) +{ + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); + bool ret = false; + + if (!src) + return ret; + + if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) { + src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED; + pvclock_touch_watchdogs(); + ret = true; + } + return ret; +} + +static int kvm_cs_enable(struct clocksource *cs) +{ + vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK); + return 0; +} + +struct clocksource kvm_clock = { + .name = "kvm-clock", + .read = kvm_clock_get_cycles, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .enable = kvm_cs_enable, +}; +EXPORT_SYMBOL_GPL(kvm_clock); + +static void kvm_register_clock(char *txt) +{ + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); + u64 pa; + + if (!src) + return; + + pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; + wrmsrl(msr_kvm_system_time, pa); + pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); +} + +static void kvm_save_sched_clock_state(void) +{ +} + +static void kvm_restore_sched_clock_state(void) +{ + kvm_register_clock("primary cpu clock, resume"); +} + +#ifdef CONFIG_X86_LOCAL_APIC +static void kvm_setup_secondary_clock(void) +{ + kvm_register_clock("secondary cpu clock"); +} +#endif + +void kvmclock_disable(void) +{ + if (msr_kvm_system_time) + native_write_msr(msr_kvm_system_time, 0, 0); +} + +static void __init kvmclock_init_mem(void) +{ + unsigned long ncpus; + unsigned int order; + struct page *p; + int r; + + if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus()) + return; + + ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE; + order = get_order(ncpus * sizeof(*hvclock_mem)); + + p = alloc_pages(GFP_KERNEL, order); + if (!p) { + pr_warn("%s: failed to alloc %d pages", __func__, (1U << order)); + return; + } + + hvclock_mem = page_address(p); + + /* + * hvclock is shared between the guest and the hypervisor, must + * be mapped decrypted. + */ + if (sev_active()) { + r = set_memory_decrypted((unsigned long) hvclock_mem, + 1UL << order); + if (r) { + __free_pages(p, order); + hvclock_mem = NULL; + pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n"); + return; + } + } + + memset(hvclock_mem, 0, PAGE_SIZE << order); +} + +static int __init kvm_setup_vsyscall_timeinfo(void) +{ + kvmclock_init_mem(); + +#ifdef CONFIG_X86_64 + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { + u8 flags; + + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; + + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + } +#endif + + return 0; +} +early_initcall(kvm_setup_vsyscall_timeinfo); + +static int kvmclock_setup_percpu(unsigned int cpu) +{ + struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu); + + /* + * The per cpu area setup replicates CPU0 data to all cpu + * pointers. So carefully check. CPU0 has been set up in init + * already. + */ + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) + return 0; + + /* Use the static page for the first CPUs, allocate otherwise */ + if (cpu < HVC_BOOT_ARRAY_SIZE) + p = &hv_clock_boot[cpu]; + else if (hvclock_mem) + p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE; + else + return -ENOMEM; + + per_cpu(hv_clock_per_cpu, cpu) = p; + return p ? 0 : -ENOMEM; +} + +void __init kvmclock_init(void) +{ + u8 flags; + + if (!kvm_para_available() || !kvmclock) + return; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; + } else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; + } else { + return; + } + + if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu", + kvmclock_setup_percpu, NULL) < 0) { + return; + } + + pr_info("kvm-clock: Using msrs %x and %x", + msr_kvm_system_time, msr_kvm_wall_clock); + + this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]); + kvm_register_clock("primary cpu clock"); + pvclock_set_pvti_cpu0_va(hv_clock_boot); + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); + + x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.calibrate_cpu = kvm_get_tsc_khz; + x86_platform.get_wallclock = kvm_get_wallclock; + x86_platform.set_wallclock = kvm_set_wallclock; +#ifdef CONFIG_X86_LOCAL_APIC + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; +#endif + x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; + x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; + kvm_get_preset_lpj(); + + /* + * X86_FEATURE_NONSTOP_TSC is TSC runs at constant rate + * with P/T states and does not stop in deep C-states. + * + * Invariant TSC exposed by host means kvmclock is not necessary: + * can use TSC as clocksource. + * + */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && + !check_tsc_unstable()) + kvm_clock.rating = 299; + + clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); + pv_info.name = "KVM"; +} diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c new file mode 100644 index 000000000..b8aee7184 --- /dev/null +++ b/arch/x86/kernel/ldt.c @@ -0,0 +1,688 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. + * + * Lock order: + * contex.ldt_usr_sem + * mmap_lock + * context.lock + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +/* This is a multiple of PAGE_SIZE. */ +#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) + +static inline void *ldt_slot_va(int slot) +{ + return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); +} + +void load_mm_ldt(struct mm_struct *mm) +{ + struct ldt_struct *ldt; + + /* READ_ONCE synchronizes with smp_store_release */ + ldt = READ_ONCE(mm->context.ldt); + + /* + * Any change to mm->context.ldt is followed by an IPI to all + * CPUs with the mm active. The LDT will not be freed until + * after the IPI is handled by all such CPUs. This means that, + * if the ldt_struct changes before we return, the values we see + * will be safe, and the new values will be loaded before we run + * any user code. + * + * NB: don't try to convert this to use RCU without extreme care. + * We would still need IRQs off, because we don't want to change + * the local LDT after an IPI loaded a newer value than the one + * that we can see. + */ + + if (unlikely(ldt)) { + if (static_cpu_has(X86_FEATURE_PTI)) { + if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) { + /* + * Whoops -- either the new LDT isn't mapped + * (if slot == -1) or is mapped into a bogus + * slot (if slot > 1). + */ + clear_LDT(); + return; + } + + /* + * If page table isolation is enabled, ldt->entries + * will not be mapped in the userspace pagetables. + * Tell the CPU to access the LDT through the alias + * at ldt_slot_va(ldt->slot). + */ + set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); + } else { + set_ldt(ldt->entries, ldt->nr_entries); + } + } else { + clear_LDT(); + } +} + +void switch_ldt(struct mm_struct *prev, struct mm_struct *next) +{ + /* + * Load the LDT if either the old or new mm had an LDT. + * + * An mm will never go from having an LDT to not having an LDT. Two + * mms never share an LDT, so we don't gain anything by checking to + * see whether the LDT changed. There's also no guarantee that + * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL, + * then prev->context.ldt will also be non-NULL. + * + * If we really cared, we could optimize the case where prev == next + * and we're exiting lazy mode. Most of the time, if this happens, + * we don't actually need to reload LDTR, but modify_ldt() is mostly + * used by legacy code and emulators where we don't need this level of + * performance. + * + * This uses | instead of || because it generates better code. + */ + if (unlikely((unsigned long)prev->context.ldt | + (unsigned long)next->context.ldt)) + load_mm_ldt(next); + + DEBUG_LOCKS_WARN_ON(preemptible()); +} + +static void refresh_ldt_segments(void) +{ +#ifdef CONFIG_X86_64 + unsigned short sel; + + /* + * Make sure that the cached DS and ES descriptors match the updated + * LDT. + */ + savesegment(ds, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(ds, sel); + + savesegment(es, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(es, sel); +#endif +} + +/* context.lock is held by the task which issued the smp function call */ +static void flush_ldt(void *__mm) +{ + struct mm_struct *mm = __mm; + + if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) + return; + + load_mm_ldt(mm); + + refresh_ldt_segments(); +} + +/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ +static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) +{ + struct ldt_struct *new_ldt; + unsigned int alloc_size; + + if (num_entries > LDT_ENTRIES) + return NULL; + + new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); + if (!new_ldt) + return NULL; + + BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); + alloc_size = num_entries * LDT_ENTRY_SIZE; + + /* + * Xen is very picky: it requires a page-aligned LDT that has no + * trailing nonzero bytes in any page that contains LDT descriptors. + * Keep it simple: zero the whole allocation and never allocate less + * than PAGE_SIZE. + */ + if (alloc_size > PAGE_SIZE) + new_ldt->entries = vzalloc(alloc_size); + else + new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL); + + if (!new_ldt->entries) { + kfree(new_ldt); + return NULL; + } + + /* The new LDT isn't aliased for PTI yet. */ + new_ldt->slot = -1; + + new_ldt->nr_entries = num_entries; + return new_ldt; +} + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +static void do_sanity_check(struct mm_struct *mm, + bool had_kernel_mapping, + bool had_user_mapping) +{ + if (mm->context.ldt) { + /* + * We already had an LDT. The top-level entry should already + * have been allocated and synchronized with the usermode + * tables. + */ + WARN_ON(!had_kernel_mapping); + if (boot_cpu_has(X86_FEATURE_PTI)) + WARN_ON(!had_user_mapping); + } else { + /* + * This is the first time we're mapping an LDT for this process. + * Sync the pgd to the usermode tables. + */ + WARN_ON(had_kernel_mapping); + if (boot_cpu_has(X86_FEATURE_PTI)) + WARN_ON(had_user_mapping); + } +} + +#ifdef CONFIG_X86_PAE + +static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va) +{ + p4d_t *p4d; + pud_t *pud; + + if (pgd->pgd == 0) + return NULL; + + p4d = p4d_offset(pgd, va); + if (p4d_none(*p4d)) + return NULL; + + pud = pud_offset(p4d, va); + if (pud_none(*pud)) + return NULL; + + return pmd_offset(pud, va); +} + +static void map_ldt_struct_to_user(struct mm_struct *mm) +{ + pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + pmd_t *k_pmd, *u_pmd; + + k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); + u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); + + if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + set_pmd(u_pmd, *k_pmd); +} + +static void sanity_check_ldt_mapping(struct mm_struct *mm) +{ + pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + bool had_kernel, had_user; + pmd_t *k_pmd, *u_pmd; + + k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); + u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); + had_kernel = (k_pmd->pmd != 0); + had_user = (u_pmd->pmd != 0); + + do_sanity_check(mm, had_kernel, had_user); +} + +#else /* !CONFIG_X86_PAE */ + +static void map_ldt_struct_to_user(struct mm_struct *mm) +{ + pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); + + if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + set_pgd(kernel_to_user_pgdp(pgd), *pgd); +} + +static void sanity_check_ldt_mapping(struct mm_struct *mm) +{ + pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); + bool had_kernel = (pgd->pgd != 0); + bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0); + + do_sanity_check(mm, had_kernel, had_user); +} + +#endif /* CONFIG_X86_PAE */ + +/* + * If PTI is enabled, this maps the LDT into the kernelmode and + * usermode tables for the given mm. + */ +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ + unsigned long va; + bool is_vmalloc; + spinlock_t *ptl; + int i, nr_pages; + + if (!boot_cpu_has(X86_FEATURE_PTI)) + return 0; + + /* + * Any given ldt_struct should have map_ldt_struct() called at most + * once. + */ + WARN_ON(ldt->slot != -1); + + /* Check if the current mappings are sane */ + sanity_check_ldt_mapping(mm); + + is_vmalloc = is_vmalloc_addr(ldt->entries); + + nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); + + for (i = 0; i < nr_pages; i++) { + unsigned long offset = i << PAGE_SHIFT; + const void *src = (char *)ldt->entries + offset; + unsigned long pfn; + pgprot_t pte_prot; + pte_t pte, *ptep; + + va = (unsigned long)ldt_slot_va(slot) + offset; + pfn = is_vmalloc ? vmalloc_to_pfn(src) : + page_to_pfn(virt_to_page(src)); + /* + * Treat the PTI LDT range as a *userspace* range. + * get_locked_pte() will allocate all needed pagetables + * and account for them in this mm. + */ + ptep = get_locked_pte(mm, va, &ptl); + if (!ptep) + return -ENOMEM; + /* + * Map it RO so the easy to find address is not a primary + * target via some kernel interface which misses a + * permission check. + */ + pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL); + /* Filter out unsuppored __PAGE_KERNEL* bits: */ + pgprot_val(pte_prot) &= __supported_pte_mask; + pte = pfn_pte(pfn, pte_prot); + set_pte_at(mm, va, ptep, pte); + pte_unmap_unlock(ptep, ptl); + } + + /* Propagate LDT mapping to the user page-table */ + map_ldt_struct_to_user(mm); + + ldt->slot = slot; + return 0; +} + +static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) +{ + unsigned long va; + int i, nr_pages; + + if (!ldt) + return; + + /* LDT map/unmap is only required for PTI */ + if (!boot_cpu_has(X86_FEATURE_PTI)) + return; + + nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); + + for (i = 0; i < nr_pages; i++) { + unsigned long offset = i << PAGE_SHIFT; + spinlock_t *ptl; + pte_t *ptep; + + va = (unsigned long)ldt_slot_va(ldt->slot) + offset; + ptep = get_locked_pte(mm, va, &ptl); + pte_clear(mm, va, ptep); + pte_unmap_unlock(ptep, ptl); + } + + va = (unsigned long)ldt_slot_va(ldt->slot); + flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false); +} + +#else /* !CONFIG_PAGE_TABLE_ISOLATION */ + +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ + return 0; +} + +static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) +{ +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + +static void free_ldt_pgtables(struct mm_struct *mm) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + struct mmu_gather tlb; + unsigned long start = LDT_BASE_ADDR; + unsigned long end = LDT_END_ADDR; + + if (!boot_cpu_has(X86_FEATURE_PTI)) + return; + + tlb_gather_mmu(&tlb, mm, start, end); + free_pgd_range(&tlb, start, end, start, end); + tlb_finish_mmu(&tlb, start, end); +#endif +} + +/* After calling this, the LDT is immutable. */ +static void finalize_ldt_struct(struct ldt_struct *ldt) +{ + paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); +} + +static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt) +{ + mutex_lock(&mm->context.lock); + + /* Synchronizes with READ_ONCE in load_mm_ldt. */ + smp_store_release(&mm->context.ldt, ldt); + + /* Activate the LDT for all CPUs using currents mm. */ + on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); + + mutex_unlock(&mm->context.lock); +} + +static void free_ldt_struct(struct ldt_struct *ldt) +{ + if (likely(!ldt)) + return; + + paravirt_free_ldt(ldt->entries, ldt->nr_entries); + if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree_atomic(ldt->entries); + else + free_page((unsigned long)ldt->entries); + kfree(ldt); +} + +/* + * Called on fork from arch_dup_mmap(). Just copy the current LDT state, + * the new task is not running, so nothing can be installed. + */ +int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) +{ + struct ldt_struct *new_ldt; + int retval = 0; + + if (!old_mm) + return 0; + + mutex_lock(&old_mm->context.lock); + if (!old_mm->context.ldt) + goto out_unlock; + + new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); + if (!new_ldt) { + retval = -ENOMEM; + goto out_unlock; + } + + memcpy(new_ldt->entries, old_mm->context.ldt->entries, + new_ldt->nr_entries * LDT_ENTRY_SIZE); + finalize_ldt_struct(new_ldt); + + retval = map_ldt_struct(mm, new_ldt, 0); + if (retval) { + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } + mm->context.ldt = new_ldt; + +out_unlock: + mutex_unlock(&old_mm->context.lock); + return retval; +} + +/* + * No need to lock the MM as we are the last user + * + * 64bit: Don't touch the LDT register - we're already in the next thread. + */ +void destroy_context_ldt(struct mm_struct *mm) +{ + free_ldt_struct(mm->context.ldt); + mm->context.ldt = NULL; +} + +void ldt_arch_exit_mmap(struct mm_struct *mm) +{ + free_ldt_pgtables(mm); +} + +static int read_ldt(void __user *ptr, unsigned long bytecount) +{ + struct mm_struct *mm = current->mm; + unsigned long entries_size; + int retval; + + down_read(&mm->context.ldt_usr_sem); + + if (!mm->context.ldt) { + retval = 0; + goto out_unlock; + } + + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; + + entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE; + if (entries_size > bytecount) + entries_size = bytecount; + + if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) { + retval = -EFAULT; + goto out_unlock; + } + + if (entries_size != bytecount) { + /* Zero-fill the rest and pretend we read bytecount bytes. */ + if (clear_user(ptr + entries_size, bytecount - entries_size)) { + retval = -EFAULT; + goto out_unlock; + } + } + retval = bytecount; + +out_unlock: + up_read(&mm->context.ldt_usr_sem); + return retval; +} + +static int read_default_ldt(void __user *ptr, unsigned long bytecount) +{ + /* CHECKME: Can we use _one_ random number ? */ +#ifdef CONFIG_X86_32 + unsigned long size = 5 * sizeof(struct desc_struct); +#else + unsigned long size = 128; +#endif + if (bytecount > size) + bytecount = size; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; +} + +static bool allow_16bit_segments(void) +{ + if (!IS_ENABLED(CONFIG_X86_16BIT)) + return false; + +#ifdef CONFIG_XEN_PV + /* + * Xen PV does not implement ESPFIX64, which means that 16-bit + * segments will not work correctly. Until either Xen PV implements + * ESPFIX64 and can signal this fact to the guest or unless someone + * provides compelling evidence that allowing broken 16-bit segments + * is worthwhile, disallow 16-bit segments under Xen PV. + */ + if (xen_pv_domain()) { + pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n"); + return false; + } +#endif + + return true; +} + +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) +{ + struct mm_struct *mm = current->mm; + struct ldt_struct *new_ldt, *old_ldt; + unsigned int old_nr_entries, new_nr_entries; + struct user_desc ldt_info; + struct desc_struct ldt; + int error; + + error = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) || + LDT_empty(&ldt_info)) { + /* The user wants to clear the entry. */ + memset(&ldt, 0, sizeof(ldt)); + } else { + if (!ldt_info.seg_32bit && !allow_16bit_segments()) { + error = -EINVAL; + goto out; + } + + fill_ldt(&ldt, &ldt_info); + if (oldmode) + ldt.avl = 0; + } + + if (down_write_killable(&mm->context.ldt_usr_sem)) + return -EINTR; + + old_ldt = mm->context.ldt; + old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; + new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries); + + error = -ENOMEM; + new_ldt = alloc_ldt_struct(new_nr_entries); + if (!new_ldt) + goto out_unlock; + + if (old_ldt) + memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE); + + new_ldt->entries[ldt_info.entry_number] = ldt; + finalize_ldt_struct(new_ldt); + + /* + * If we are using PTI, map the new LDT into the userspace pagetables. + * If there is already an LDT, use the other slot so that other CPUs + * will continue to use the old LDT until install_ldt() switches + * them over to the new LDT. + */ + error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); + if (error) { + /* + * This only can fail for the first LDT setup. If an LDT is + * already installed then the PTE page is already + * populated. Mop up a half populated page table. + */ + if (!WARN_ON_ONCE(old_ldt)) + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } + + install_ldt(mm, new_ldt); + unmap_ldt_struct(mm, old_ldt); + free_ldt_struct(old_ldt); + error = 0; + +out_unlock: + up_write(&mm->context.ldt_usr_sem); +out: + return error; +} + +SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , + unsigned long , bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + /* + * The SYSCALL_DEFINE() macros give us an 'unsigned long' + * return type, but tht ABI for sys_modify_ldt() expects + * 'int'. This cast gives us an int-sized value in %rax + * for the return code. The 'unsigned' is necessary so + * the compiler does not try to sign-extend the negative + * return codes into the high half of the register when + * taking the value from int->long. + */ + return (unsigned int)ret; +} diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c new file mode 100644 index 000000000..64b00b0d7 --- /dev/null +++ b/arch/x86/kernel/machine_kexec_32.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void set_gdt(void *newgdt, __u16 limit) +{ + struct desc_ptr curgdt; + + /* ia32 supports unaligned loads & stores */ + curgdt.size = limit; + curgdt.address = (unsigned long)newgdt; + + load_gdt(&curgdt); +} + +static void load_segments(void) +{ +#define __STR(X) #X +#define STR(X) __STR(X) + + __asm__ __volatile__ ( + "\tljmp $"STR(__KERNEL_CS)",$1f\n" + "\t1:\n" + "\tmovl $"STR(__KERNEL_DS)",%%eax\n" + "\tmovl %%eax,%%ds\n" + "\tmovl %%eax,%%es\n" + "\tmovl %%eax,%%ss\n" + : : : "eax", "memory"); +#undef STR +#undef __STR +} + +static void machine_kexec_free_page_tables(struct kimage *image) +{ + free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER); + image->arch.pgd = NULL; +#ifdef CONFIG_X86_PAE + free_page((unsigned long)image->arch.pmd0); + image->arch.pmd0 = NULL; + free_page((unsigned long)image->arch.pmd1); + image->arch.pmd1 = NULL; +#endif + free_page((unsigned long)image->arch.pte0); + image->arch.pte0 = NULL; + free_page((unsigned long)image->arch.pte1); + image->arch.pte1 = NULL; +} + +static int machine_kexec_alloc_page_tables(struct kimage *image) +{ + image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + PGD_ALLOCATION_ORDER); +#ifdef CONFIG_X86_PAE + image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); +#endif + image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL); + if (!image->arch.pgd || +#ifdef CONFIG_X86_PAE + !image->arch.pmd0 || !image->arch.pmd1 || +#endif + !image->arch.pte0 || !image->arch.pte1) { + return -ENOMEM; + } + return 0; +} + +static void machine_kexec_page_table_set_one( + pgd_t *pgd, pmd_t *pmd, pte_t *pte, + unsigned long vaddr, unsigned long paddr) +{ + p4d_t *p4d; + pud_t *pud; + + pgd += pgd_index(vaddr); +#ifdef CONFIG_X86_PAE + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) + set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); +#endif + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); + pmd = pmd_offset(pud, vaddr); + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); + pte = pte_offset_kernel(pmd, vaddr); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); +} + +static void machine_kexec_prepare_page_tables(struct kimage *image) +{ + void *control_page; + pmd_t *pmd = NULL; + + control_page = page_address(image->control_code_page); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd0; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte0, + (unsigned long)control_page, __pa(control_page)); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd1; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte1, + __pa(control_page), __pa(control_page)); +} + +/* + * A architecture hook called to validate the + * proposed image and prepare the control pages + * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE + * have been allocated, but the segments have yet + * been copied into the kernel. + * + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. + * + * - Make control page executable. + * - Allocate page tables + * - Setup page tables + */ +int machine_kexec_prepare(struct kimage *image) +{ + int error; + + set_memory_x((unsigned long)page_address(image->control_code_page), 1); + error = machine_kexec_alloc_page_tables(image); + if (error) + return error; + machine_kexec_prepare_page_tables(image); + return 0; +} + +/* + * Undo anything leftover by machine_kexec_prepare + * when an image is freed. + */ +void machine_kexec_cleanup(struct kimage *image) +{ + set_memory_nx((unsigned long)page_address(image->control_code_page), 1); + machine_kexec_free_page_tables(image); +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +void machine_kexec(struct kimage *image) +{ + unsigned long page_list[PAGES_NR]; + void *control_page; + int save_ftrace_enabled; + asmlinkage unsigned long + (*relocate_kernel_ptr)(unsigned long indirection_page, + unsigned long control_page, + unsigned long start_address, + unsigned int has_pae, + unsigned int preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (image->preserve_context) + save_processor_state(); +#endif + + save_ftrace_enabled = __ftrace_enabled_save(); + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + hw_breakpoint_disable(); + + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* + * We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to restore_boot_irq_mode() + * in one form or other. kexec jump path also need one. + */ + clear_IO_APIC(); + restore_boot_irq_mode(); +#endif + } + + control_page = page_address(image->control_code_page); + memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + + relocate_kernel_ptr = control_page; + page_list[PA_CONTROL_PAGE] = __pa(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; + page_list[PA_PGD] = __pa(image->arch.pgd); + + if (image->type == KEXEC_TYPE_DEFAULT) + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) + << PAGE_SHIFT); + + /* + * The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the + * descriptor table in memory accessed. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* + * The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. + */ + idt_invalidate(phys_to_virt(0)); + set_gdt(phys_to_virt(0), 0); + + /* now call it */ + image->start = relocate_kernel_ptr((unsigned long)image->head, + (unsigned long)page_list, + image->start, + boot_cpu_has(X86_FEATURE_PAE), + image->preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); +} diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c new file mode 100644 index 000000000..a29a44a98 --- /dev/null +++ b/arch/x86/kernel/machine_kexec_64.c @@ -0,0 +1,622 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman + */ + +#define pr_fmt(fmt) "kexec: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_ACPI +/* + * Used while adding mapping for ACPI tables. + * Can be reused when other iomem regions need be mapped + */ +struct init_pgtable_data { + struct x86_mapping_info *info; + pgd_t *level4p; +}; + +static int mem_region_callback(struct resource *res, void *arg) +{ + struct init_pgtable_data *data = arg; + unsigned long mstart, mend; + + mstart = res->start; + mend = mstart + resource_size(res) - 1; + + return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend); +} + +static int +map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) +{ + struct init_pgtable_data data; + unsigned long flags; + int ret; + + data.info = info; + data.level4p = level4p; + flags = IORESOURCE_MEM | IORESOURCE_BUSY; + + ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, + &data, mem_region_callback); + if (ret && ret != -EINVAL) + return ret; + + /* ACPI tables could be located in ACPI Non-volatile Storage region */ + ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, + &data, mem_region_callback); + if (ret && ret != -EINVAL) + return ret; + + return 0; +} +#else +static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; } +#endif + +#ifdef CONFIG_KEXEC_FILE +const struct kexec_file_ops * const kexec_file_loaders[] = { + &kexec_bzImage64_ops, + NULL +}; +#endif + +static int +map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) +{ +#ifdef CONFIG_EFI + unsigned long mstart, mend; + + if (!efi_enabled(EFI_BOOT)) + return 0; + + mstart = (boot_params.efi_info.efi_systab | + ((u64)boot_params.efi_info.efi_systab_hi<<32)); + + if (efi_enabled(EFI_64BIT)) + mend = mstart + sizeof(efi_system_table_64_t); + else + mend = mstart + sizeof(efi_system_table_32_t); + + if (!mstart) + return 0; + + return kernel_ident_mapping_init(info, level4p, mstart, mend); +#endif + return 0; +} + +static void free_transition_pgtable(struct kimage *image) +{ + free_page((unsigned long)image->arch.p4d); + image->arch.p4d = NULL; + free_page((unsigned long)image->arch.pud); + image->arch.pud = NULL; + free_page((unsigned long)image->arch.pmd); + image->arch.pmd = NULL; + free_page((unsigned long)image->arch.pte); + image->arch.pte = NULL; +} + +static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) +{ + pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; + unsigned long vaddr, paddr; + int result = -ENOMEM; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + vaddr = (unsigned long)relocate_kernel; + paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); + pgd += pgd_index(vaddr); + if (!pgd_present(*pgd)) { + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); + if (!p4d) + goto err; + image->arch.p4d = p4d; + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + p4d = p4d_offset(pgd, vaddr); + if (!p4d_present(*p4d)) { + pud = (pud_t *)get_zeroed_page(GFP_KERNEL); + if (!pud) + goto err; + image->arch.pud = pud; + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + } + pud = pud_offset(p4d, vaddr); + if (!pud_present(*pud)) { + pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); + if (!pmd) + goto err; + image->arch.pmd = pmd; + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + pmd = pmd_offset(pud, vaddr); + if (!pmd_present(*pmd)) { + pte = (pte_t *)get_zeroed_page(GFP_KERNEL); + if (!pte) + goto err; + image->arch.pte = pte; + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); + } + pte = pte_offset_kernel(pmd, vaddr); + + if (sev_active()) + prot = PAGE_KERNEL_EXEC; + + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); + return 0; +err: + return result; +} + +static void *alloc_pgt_page(void *data) +{ + struct kimage *image = (struct kimage *)data; + struct page *page; + void *p = NULL; + + page = kimage_alloc_control_pages(image, 0); + if (page) { + p = page_address(page); + clear_page(p); + } + + return p; +} + +static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +{ + struct x86_mapping_info info = { + .alloc_pgt_page = alloc_pgt_page, + .context = image, + .page_flag = __PAGE_KERNEL_LARGE_EXEC, + .kernpg_flag = _KERNPG_TABLE_NOENC, + }; + unsigned long mstart, mend; + pgd_t *level4p; + int result; + int i; + + level4p = (pgd_t *)__va(start_pgtable); + clear_page(level4p); + + if (sev_active()) { + info.page_flag |= _PAGE_ENC; + info.kernpg_flag |= _PAGE_ENC; + } + + if (direct_gbpages) + info.direct_gbpages = true; + + for (i = 0; i < nr_pfn_mapped; i++) { + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + if (result) + return result; + } + + /* + * segments's mem ranges could be outside 0 ~ max_pfn, + * for example when jump back to original kernel from kexeced kernel. + * or first kernel is booted with user mem map, and second kernel + * could be loaded out of that range. + */ + for (i = 0; i < image->nr_segments; i++) { + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + + if (result) + return result; + } + + /* + * Prepare EFI systab and ACPI tables for kexec kernel since they are + * not covered by pfn_mapped. + */ + result = map_efi_systab(&info, level4p); + if (result) + return result; + + result = map_acpi_tables(&info, level4p); + if (result) + return result; + + return init_transition_pgtable(image, level4p); +} + +static void set_idt(void *newidt, u16 limit) +{ + struct desc_ptr curidt; + + /* x86-64 supports unaliged loads & stores */ + curidt.size = limit; + curidt.address = (unsigned long)newidt; + + __asm__ __volatile__ ( + "lidtq %0\n" + : : "m" (curidt) + ); +}; + + +static void set_gdt(void *newgdt, u16 limit) +{ + struct desc_ptr curgdt; + + /* x86-64 supports unaligned loads & stores */ + curgdt.size = limit; + curgdt.address = (unsigned long)newgdt; + + __asm__ __volatile__ ( + "lgdtq %0\n" + : : "m" (curgdt) + ); +}; + +static void load_segments(void) +{ + __asm__ __volatile__ ( + "\tmovl %0,%%ds\n" + "\tmovl %0,%%es\n" + "\tmovl %0,%%ss\n" + "\tmovl %0,%%fs\n" + "\tmovl %0,%%gs\n" + : : "a" (__KERNEL_DS) : "memory" + ); +} + +int machine_kexec_prepare(struct kimage *image) +{ + unsigned long start_pgtable; + int result; + + /* Calculate the offsets */ + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + + /* Setup the identity mapped 64bit page table */ + result = init_pgtable(image, start_pgtable); + if (result) + return result; + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ + free_transition_pgtable(image); +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +void machine_kexec(struct kimage *image) +{ + unsigned long page_list[PAGES_NR]; + void *control_page; + int save_ftrace_enabled; + +#ifdef CONFIG_KEXEC_JUMP + if (image->preserve_context) + save_processor_state(); +#endif + + save_ftrace_enabled = __ftrace_enabled_save(); + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + hw_breakpoint_disable(); + + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* + * We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to restore_boot_irq_mode() + * in one form or other. kexec jump path also need one. + */ + clear_IO_APIC(); + restore_boot_irq_mode(); +#endif + } + + control_page = page_address(image->control_code_page) + PAGE_SIZE; + memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + + page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; + page_list[PA_TABLE_PAGE] = + (unsigned long)__pa(page_address(image->control_code_page)); + + if (image->type == KEXEC_TYPE_DEFAULT) + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) + << PAGE_SHIFT); + + /* + * The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the + * descriptor table in memory accessed. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* + * The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. + */ + set_gdt(phys_to_virt(0), 0); + set_idt(phys_to_virt(0), 0); + + /* now call it */ + image->start = relocate_kernel((unsigned long)image->head, + (unsigned long)page_list, + image->start, + image->preserve_context, + sme_active()); + +#ifdef CONFIG_KEXEC_JUMP + if (image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); +} + +/* arch-dependent functionality related to kexec file-based syscall */ + +#ifdef CONFIG_KEXEC_FILE +void *arch_kexec_kernel_image_load(struct kimage *image) +{ + vfree(image->arch.elf_headers); + image->arch.elf_headers = NULL; + + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); + + return image->fops->load(image, image->kernel_buf, + image->kernel_buf_len, image->initrd_buf, + image->initrd_buf_len, image->cmdline_buf, + image->cmdline_buf_len); +} + +/* + * Apply purgatory relocations. + * + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELAs. + * @symtabsec: Corresponding symtab. + * + * TODO: Some of the code belongs to generic code. Move that in kexec.c. + */ +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, const Elf_Shdr *relsec, + const Elf_Shdr *symtabsec) +{ + unsigned int i; + Elf64_Rela *rel; + Elf64_Sym *sym; + void *location; + unsigned long address, sec_base, value; + const char *strtab, *name, *shstrtab; + const Elf_Shdr *sechdrs; + + /* String & section header string table */ + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset; + shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; + + rel = (void *)pi->ehdr + relsec->sh_offset; + + pr_debug("Applying relocate section %s to %u\n", + shstrtab + relsec->sh_name, relsec->sh_info); + + for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) { + + /* + * rel[i].r_offset contains byte offset from beginning + * of section to the storage unit affected. + * + * This is location to update. This is temporary buffer + * where section is currently loaded. This will finally be + * loaded to a different address later, pointed to by + * ->sh_addr. kexec takes care of moving it + * (kexec_load_segment()). + */ + location = pi->purgatory_buf; + location += section->sh_offset; + location += rel[i].r_offset; + + /* Final address of the location */ + address = section->sh_addr + rel[i].r_offset; + + /* + * rel[i].r_info contains information about symbol table index + * w.r.t which relocation must be made and type of relocation + * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get + * these respectively. + */ + sym = (void *)pi->ehdr + symtabsec->sh_offset; + sym += ELF64_R_SYM(rel[i].r_info); + + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n", + name, sym->st_info, sym->st_shndx, sym->st_value, + sym->st_size); + + if (sym->st_shndx == SHN_UNDEF) { + pr_err("Undefined symbol: %s\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_COMMON) { + pr_err("symbol '%s' in common section\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_ABS) + sec_base = 0; + else if (sym->st_shndx >= pi->ehdr->e_shnum) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); + return -ENOEXEC; + } else + sec_base = pi->sechdrs[sym->st_shndx].sh_addr; + + value = sym->st_value; + value += sec_base; + value += rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)location = value; + break; + case R_X86_64_32: + *(u32 *)location = value; + if (value != *(u32 *)location) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)location = value; + if ((s64)value != *(s32 *)location) + goto overflow; + break; + case R_X86_64_PC32: + case R_X86_64_PLT32: + value -= (u64)address; + *(u32 *)location = value; + break; + default: + pr_err("Unknown rela relocation: %llu\n", + ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + pr_err("Overflow in relocation type %d value 0x%lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), value); + return -ENOEXEC; +} +#endif /* CONFIG_KEXEC_FILE */ + +static int +kexec_mark_range(unsigned long start, unsigned long end, bool protect) +{ + struct page *page; + unsigned int nr_pages; + + /* + * For physical range: [start, end]. We must skip the unassigned + * crashk resource with zero-valued "end" member. + */ + if (!end || start > end) + return 0; + + page = pfn_to_page(start >> PAGE_SHIFT); + nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + if (protect) + return set_pages_ro(page, nr_pages); + else + return set_pages_rw(page, nr_pages); +} + +static void kexec_mark_crashkres(bool protect) +{ + unsigned long control; + + kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect); + + /* Don't touch the control code page used in crash_kexec().*/ + control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page)); + /* Control code page is located in the 2nd page. */ + kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect); + control += KEXEC_CONTROL_PAGE_SIZE; + kexec_mark_range(control, crashk_res.end, protect); +} + +void arch_kexec_protect_crashkres(void) +{ + kexec_mark_crashkres(true); +} + +void arch_kexec_unprotect_crashkres(void) +{ + kexec_mark_crashkres(false); +} + +/* + * During a traditional boot under SME, SME will encrypt the kernel, + * so the SME kexec kernel also needs to be un-encrypted in order to + * replicate a normal SME boot. + * + * During a traditional boot under SEV, the kernel has already been + * loaded encrypted, so the SEV kexec kernel needs to be encrypted in + * order to replicate a normal SEV boot. + */ +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) +{ + if (sev_active()) + return 0; + + /* + * If SME is active we need to be sure that kexec pages are + * not encrypted because when we boot to the new kernel the + * pages won't be accessed encrypted (initially). + */ + return set_memory_decrypted((unsigned long)vaddr, pages); +} + +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) +{ + if (sev_active()) + return; + + /* + * If SME is active we need to reset the pages back to being + * an encrypted mapping before freeing them. + */ + set_memory_encrypted((unsigned long)vaddr, pages); +} diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c new file mode 100644 index 000000000..b5cb49e57 --- /dev/null +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Family 10h mmconfig enablement + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct pci_hostbridge_probe { + u32 bus; + u32 slot; + u32 vendor; + u32 device; +}; + +static u64 fam10h_pci_mmconf_base; + +static struct pci_hostbridge_probe pci_probes[] = { + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, + { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, +}; + +static int cmp_range(const void *x1, const void *x2) +{ + const struct range *r1 = x1; + const struct range *r2 = x2; + int start1, start2; + + start1 = r1->start >> 32; + start2 = r2->start >> 32; + + return start1 - start2; +} + +#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT) +#define MMCONF_MASK (~(MMCONF_UNIT - 1)) +#define MMCONF_SIZE (MMCONF_UNIT << 8) +/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */ +#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) +#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40)) +static void get_fam10h_pci_mmconf_base(void) +{ + int i; + unsigned bus; + unsigned slot; + int found; + + u64 val; + u32 address; + u64 tom2; + u64 base = FAM10H_PCI_MMCONF_BASE; + + int hi_mmio_num; + struct range range[8]; + + /* only try to get setting from BSP */ + if (fam10h_pci_mmconf_base) + return; + + if (!early_pci_allowed()) + return; + + found = 0; + for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { + u32 id; + u16 device; + u16 vendor; + + bus = pci_probes[i].bus; + slot = pci_probes[i].slot; + id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); + + vendor = id & 0xffff; + device = (id>>16) & 0xffff; + if (pci_probes[i].vendor == vendor && + pci_probes[i].device == device) { + found = 1; + break; + } + } + + if (!found) + return; + + /* SYS_CFG */ + address = MSR_K8_SYSCFG; + rdmsrl(address, val); + + /* TOP_MEM2 is not enabled? */ + if (!(val & (1<<21))) { + tom2 = 1ULL << 32; + } else { + /* TOP_MEM2 */ + address = MSR_K8_TOP_MEM2; + rdmsrl(address, val); + tom2 = max(val & 0xffffff800000ULL, 1ULL << 32); + } + + if (base <= tom2) + base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK; + + /* + * need to check if the range is in the high mmio range that is + * above 4G + */ + hi_mmio_num = 0; + for (i = 0; i < 8; i++) { + u32 reg; + u64 start; + u64 end; + reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3)); + if (!(reg & 3)) + continue; + + start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/ + reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); + end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/ + + if (end < tom2) + continue; + + range[hi_mmio_num].start = start; + range[hi_mmio_num].end = end; + hi_mmio_num++; + } + + if (!hi_mmio_num) + goto out; + + /* sort the range */ + sort(range, hi_mmio_num, sizeof(struct range), cmp_range, NULL); + + if (range[hi_mmio_num - 1].end < base) + goto out; + if (range[0].start > base + MMCONF_SIZE) + goto out; + + /* need to find one window */ + base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT; + if ((base > tom2) && BASE_VALID(base)) + goto out; + base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK; + if (BASE_VALID(base)) + goto out; + /* need to find window between ranges */ + for (i = 1; i < hi_mmio_num; i++) { + base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK; + val = range[i].start & MMCONF_MASK; + if (val >= base + MMCONF_SIZE && BASE_VALID(base)) + goto out; + } + return; + +out: + fam10h_pci_mmconf_base = base; +} + +void fam10h_check_enable_mmcfg(void) +{ + u64 val; + u32 address; + + if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) + return; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, val); + + /* try to make sure that AP's setting is identical to BSP setting */ + if (val & FAM10H_MMIO_CONF_ENABLE) { + unsigned busnbits; + busnbits = (val >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + /* only trust the one handle 256 buses, if acpi=off */ + if (!acpi_pci_disabled || busnbits >= 8) { + u64 base = val & MMCONF_MASK; + + if (!fam10h_pci_mmconf_base) { + fam10h_pci_mmconf_base = base; + return; + } else if (fam10h_pci_mmconf_base == base) + return; + } + } + + /* + * if it is not enabled, try to enable it and assume only one segment + * with 256 buses + */ + get_fam10h_pci_mmconf_base(); + if (!fam10h_pci_mmconf_base) { + pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF; + return; + } + + printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); + val &= ~((FAM10H_MMIO_CONF_BASE_MASK< +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if 0 +#define DEBUGP(fmt, ...) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__) +#else +#define DEBUGP(fmt, ...) \ +do { \ + if (0) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) +#endif + +#ifdef CONFIG_RANDOMIZE_BASE +static unsigned long module_load_offset; + +/* Mutex protects the module_load_offset. */ +static DEFINE_MUTEX(module_kaslr_mutex); + +static unsigned long int get_module_load_offset(void) +{ + if (kaslr_enabled()) { + mutex_lock(&module_kaslr_mutex); + /* + * Calculate the module_load_offset the first time this + * code is called. Once calculated it stays the same until + * reboot. + */ + if (module_load_offset == 0) + module_load_offset = + (get_random_int() % 1024 + 1) * PAGE_SIZE; + mutex_unlock(&module_kaslr_mutex); + } + return module_load_offset; +} +#else +static unsigned long int get_module_load_offset(void) +{ + return 0; +} +#endif + +void *module_alloc(unsigned long size) +{ + void *p; + + if (PAGE_ALIGN(size) > MODULES_LEN) + return NULL; + + p = __vmalloc_node_range(size, MODULE_ALIGN, + MODULES_VADDR + get_module_load_offset(), + MODULES_END, GFP_KERNEL, + PAGE_KERNEL, 0, NUMA_NO_NODE, + __builtin_return_address(0)); + if (p && (kasan_module_alloc(p, size) < 0)) { + vfree(p); + return NULL; + } + + return p; +} + +#ifdef CONFIG_X86_32 +int apply_relocate(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; + Elf32_Sym *sym; + uint32_t *location; + + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf32_Sym *)sechdrs[symindex].sh_addr + + ELF32_R_SYM(rel[i].r_info); + + switch (ELF32_R_TYPE(rel[i].r_info)) { + case R_386_32: + /* We add the value into the location given */ + *location += sym->st_value; + break; + case R_386_PC32: + case R_386_PLT32: + /* Add the value, subtract its position */ + *location += sym->st_value - (uint32_t)location; + break; + default: + pr_err("%s: Unknown relocation: %u\n", + me->name, ELF32_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; +} +#else /*X86_64*/ +static int __apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me, + void *(*write)(void *dest, const void *src, size_t len)) +{ + unsigned int i; + Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; + Elf64_Sym *sym; + void *loc; + u64 val; + + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rel[i].r_info); + + DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), + sym->st_value, rel[i].r_addend, (u64)loc); + + val = sym->st_value + rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + if (*(u64 *)loc != 0) + goto invalid_relocation; + write(loc, &val, 8); + break; + case R_X86_64_32: + if (*(u32 *)loc != 0) + goto invalid_relocation; + write(loc, &val, 4); + if (val != *(u32 *)loc) + goto overflow; + break; + case R_X86_64_32S: + if (*(s32 *)loc != 0) + goto invalid_relocation; + write(loc, &val, 4); + if ((s64)val != *(s32 *)loc) + goto overflow; + break; + case R_X86_64_PC32: + case R_X86_64_PLT32: + if (*(u32 *)loc != 0) + goto invalid_relocation; + val -= (u64)loc; + write(loc, &val, 4); +#if 0 + if ((s64)val != *(s32 *)loc) + goto overflow; +#endif + break; + case R_X86_64_PC64: + if (*(u64 *)loc != 0) + goto invalid_relocation; + val -= (u64)loc; + write(loc, &val, 8); + break; + default: + pr_err("%s: Unknown rela relocation: %llu\n", + me->name, ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +invalid_relocation: + pr_err("x86/modules: Skipping invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), loc, val); + return -ENOEXEC; + +overflow: + pr_err("overflow in relocation type %d val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), val); + pr_err("`%s' likely not compiled with -mcmodel=kernel\n", + me->name); + return -ENOEXEC; +} + +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + int ret; + bool early = me->state == MODULE_STATE_UNFORMED; + void *(*write)(void *, const void *, size_t) = memcpy; + + if (!early) { + write = text_poke; + mutex_lock(&text_mutex); + } + + ret = __apply_relocate_add(sechdrs, strtab, symindex, relsec, me, + write); + + if (!early) { + text_poke_sync(); + mutex_unlock(&text_mutex); + } + + return ret; +} + +#endif + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL, *orc = NULL, *orc_ip = NULL, + *retpolines = NULL, *returns = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".text", secstrings + s->sh_name)) + text = s; + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; + if (!strcmp(".orc_unwind", secstrings + s->sh_name)) + orc = s; + if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) + orc_ip = s; + if (!strcmp(".retpoline_sites", secstrings + s->sh_name)) + retpolines = s; + if (!strcmp(".return_sites", secstrings + s->sh_name)) + returns = s; + } + + if (retpolines) { + void *rseg = (void *)retpolines->sh_addr; + apply_retpolines(rseg, rseg + retpolines->sh_size); + } + if (returns) { + void *rseg = (void *)returns->sh_addr; + apply_returns(rseg, rseg + returns->sh_size); + } + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + if (locks && text) { + void *lseg = (void *)locks->sh_addr; + void *tseg = (void *)text->sh_addr; + alternatives_smp_module_add(me, me->name, + lseg, lseg + locks->sh_size, + tseg, tseg + text->sh_size); + } + + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + + /* make jump label nops */ + jump_label_apply_nops(me); + + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + + return 0; +} + +void module_arch_cleanup(struct module *mod) +{ + alternatives_smp_module_del(mod); +} diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c new file mode 100644 index 000000000..8f06449aa --- /dev/null +++ b/arch/x86/kernel/mpparse.c @@ -0,0 +1,944 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 + * (c) 1998, 1999, 2000, 2009 Ingo Molnar + * (c) 2008 Alexey Starikovskiy + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +static void __init MP_processor_info(struct mpc_cpu *m) +{ + int apicid; + char *bootup_cpu = ""; + + if (!(m->cpuflag & CPU_ENABLED)) { + disabled_cpus++; + return; + } + + apicid = m->apicid; + + if (m->cpuflag & CPU_BOOTPROCESSOR) { + bootup_cpu = " (Bootup-CPU)"; + boot_cpu_physical_apicid = m->apicid; + } + + pr_info("Processor #%d%s\n", m->apicid, bootup_cpu); + generic_processor_info(apicid, m->apicver); +} + +#ifdef CONFIG_X86_IO_APIC +static void __init mpc_oem_bus_info(struct mpc_bus *m, char *str) +{ + memcpy(str, m->bustype, 6); + str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} + +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; + + mpc_oem_bus_info(m, str); + +#if MAX_MP_BUSSES < 256 + if (m->busid >= MAX_MP_BUSSES) { + pr_warn("MP table busid value (%d) for bustype %s is too large, max. supported is %d\n", + m->busid, str, MAX_MP_BUSSES - 1); + return; + } +#endif + + set_bit(m->busid, mp_bus_not_pci); + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { +#ifdef CONFIG_EISA + mp_bus_id_to_type[m->busid] = MP_BUS_ISA; +#endif + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { + clear_bit(m->busid, mp_bus_not_pci); +#ifdef CONFIG_EISA + mp_bus_id_to_type[m->busid] = MP_BUS_PCI; + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { + mp_bus_id_to_type[m->busid] = MP_BUS_EISA; +#endif + } else + pr_warn("Unknown bustype %s - ignoring\n", str); +} + +static void __init MP_ioapic_info(struct mpc_ioapic *m) +{ + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_LEGACY, + .ops = &mp_ioapic_irqdomain_ops, + }; + + if (m->flags & MPC_APIC_USABLE) + mp_register_ioapic(m->apicid, m->apicaddr, gsi_top, &cfg); +} + +static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) +{ + apic_printk(APIC_VERBOSE, + "Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n", + mp_irq->irqtype, mp_irq->irqflag & 3, + (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus, + mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); +} + +#else /* CONFIG_X86_IO_APIC */ +static inline void __init MP_bus_info(struct mpc_bus *m) {} +static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} +#endif /* CONFIG_X86_IO_APIC */ + +static void __init MP_lintsrc_info(struct mpc_lintsrc *m) +{ + apic_printk(APIC_VERBOSE, + "Lint: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid, + m->srcbusirq, m->destapic, m->destapiclint); +} + +/* + * Read/parse the MPC + */ +static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) +{ + + if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) { + pr_err("MPTABLE: bad signature [%c%c%c%c]!\n", + mpc->signature[0], mpc->signature[1], + mpc->signature[2], mpc->signature[3]); + return 0; + } + if (mpf_checksum((unsigned char *)mpc, mpc->length)) { + pr_err("MPTABLE: checksum error!\n"); + return 0; + } + if (mpc->spec != 0x01 && mpc->spec != 0x04) { + pr_err("MPTABLE: bad table version (%d)!!\n", mpc->spec); + return 0; + } + if (!mpc->lapic) { + pr_err("MPTABLE: null local APIC address!\n"); + return 0; + } + memcpy(oem, mpc->oem, 8); + oem[8] = 0; + pr_info("MPTABLE: OEM ID: %s\n", oem); + + memcpy(str, mpc->productid, 12); + str[12] = 0; + + pr_info("MPTABLE: Product ID: %s\n", str); + + pr_info("MPTABLE: APIC at: 0x%X\n", mpc->lapic); + + return 1; +} + +static void skip_entry(unsigned char **ptr, int *count, int size) +{ + *ptr += size; + *count += size; +} + +static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) +{ + pr_err("Your mptable is wrong, contact your HW vendor!\n"); + pr_cont("type %x\n", *mpt); + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, + 1, mpc, mpc->length, 1); +} + +static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) +{ + char str[16]; + char oem[10]; + + int count = sizeof(*mpc); + unsigned char *mpt = ((unsigned char *)mpc) + count; + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + + /* Initialize the lapic mapping */ + if (!acpi_lapic) + register_lapic_address(mpc->lapic); + + if (early) + return 1; + + /* Now process the configuration blocks. */ + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info((struct mpc_cpu *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + MP_bus_info((struct mpc_bus *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + MP_ioapic_info((struct mpc_ioapic *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + mp_save_irq((struct mpc_intsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + MP_lintsrc_info((struct mpc_lintsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + count = mpc->length; + break; + } + } + + if (!num_processors) + pr_err("MPTABLE: no processors registered!\n"); + return num_processors; +} + +#ifdef CONFIG_X86_IO_APIC + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.type = MP_INTSRC; + intsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT; + intsrc.srcbus = 0; + intsrc.dstapic = mpc_ioapic_id(0); + + intsrc.irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + pr_info("ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || + ELCR_trigger(13)) + pr_err("ELCR contains invalid data... not using ELCR\n"); + else { + pr_info("Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + fallthrough; + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) { + intsrc.irqflag = MP_IRQTRIG_LEVEL | + MP_IRQPOL_ACTIVE_HIGH; + } else { + intsrc.irqflag = MP_IRQTRIG_DEFAULT | + MP_IRQPOL_DEFAULT; + } + } + + intsrc.srcbusirq = i; + intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + mp_save_irq(&intsrc); + } + + intsrc.irqtype = mp_ExtINT; + intsrc.srcbusirq = 0; + intsrc.dstirq = 0; /* 8259A to INTIN0 */ + mp_save_irq(&intsrc); +} + + +static void __init construct_ioapic_table(int mpc_default_type) +{ + struct mpc_ioapic ioapic; + struct mpc_bus bus; + + bus.type = MP_BUS; + bus.busid = 0; + switch (mpc_default_type) { + default: + pr_err("???\nUnknown standard configuration %d\n", + mpc_default_type); + fallthrough; + case 1: + case 5: + memcpy(bus.bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.bustype, "EISA ", 6); + break; + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.busid = 1; + memcpy(bus.bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); +} +#else +static inline void __init construct_ioapic_table(int mpc_default_type) { } +#endif + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_cpu processor; + struct mpc_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.cpuflag = CPU_ENABLED; + processor.cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_stepping; + processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX]; + processor.reserved[0] = 0; + processor.reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.apicid = i; + MP_processor_info(&processor); + } + + construct_ioapic_table(mpc_default_type); + + lintsrc.type = MP_LINTSRC; + lintsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT; + lintsrc.srcbusid = 0; + lintsrc.srcbusirq = 0; + lintsrc.destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.irqtype = linttypes[i]; + lintsrc.destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static unsigned long mpf_base; +static bool mpf_found; + +static unsigned long __init get_mpc_size(unsigned long physptr) +{ + struct mpc_table *mpc; + unsigned long size; + + mpc = early_memremap(physptr, PAGE_SIZE); + size = mpc->length; + early_memunmap(mpc, PAGE_SIZE); + apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + + return size; +} + +static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) +{ + struct mpc_table *mpc; + unsigned long size; + + size = get_mpc_size(mpf->physptr); + mpc = early_memremap(mpf->physptr, size); + + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(mpc, early)) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 0; +#endif + pr_err("BIOS bug, MP table errors detected!...\n"); + pr_cont("... disabling SMP support. (tell your hw vendor)\n"); + early_memunmap(mpc, size); + return -1; + } + early_memunmap(mpc, size); + + if (early) + return -1; + +#ifdef CONFIG_X86_IO_APIC + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_bus bus; + + pr_err("BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); + + bus.type = MP_BUS; + bus.busid = 0; + memcpy(bus.bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } +#endif + + return 0; +} + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init default_get_smp_config(unsigned int early) +{ + struct mpf_intel *mpf; + + if (!smp_found_config) + return; + + if (!mpf_found) + return; + + if (acpi_lapic && early) + return; + + /* + * MPS doesn't support hyperthreading, aka only have + * thread 0 apic id in MPS table + */ + if (acpi_lapic && acpi_ioapic) + return; + + mpf = early_memremap(mpf_base, sizeof(*mpf)); + if (!mpf) { + pr_err("MPTABLE: error mapping MP table\n"); + return; + } + + pr_info("Intel MultiProcessor Specification v1.%d\n", + mpf->specification); +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) + if (mpf->feature2 & (1 << 7)) { + pr_info(" IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + pr_info(" Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } +#endif + /* + * Now see if we need to read further. + */ + if (mpf->feature1) { + if (early) { + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + goto out; + } + + pr_info("Default MP configuration #%d\n", mpf->feature1); + construct_default_ISA_mptable(mpf->feature1); + + } else if (mpf->physptr) { + if (check_physptr(mpf, early)) + goto out; + } else + BUG(); + + if (!early) + pr_info("Processors: %d\n", num_processors); + /* + * Only use the first configuration found. + */ +out: + early_memunmap(mpf, sizeof(*mpf)); +} + +static void __init smp_reserve_memory(struct mpf_intel *mpf) +{ + memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr)); +} + +static int __init smp_scan_config(unsigned long base, unsigned long length) +{ + unsigned int *bp; + struct mpf_intel *mpf; + int ret = 0; + + apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", + base, base + length - 1); + BUILD_BUG_ON(sizeof(*mpf) != 16); + + while (length > 0) { + bp = early_memremap(base, length); + mpf = (struct mpf_intel *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->specification == 1) + || (mpf->specification == 4))) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 1; +#endif + mpf_base = base; + mpf_found = true; + + pr_info("found SMP MP-table at [mem %#010lx-%#010lx]\n", + base, base + sizeof(*mpf) - 1); + + memblock_reserve(base, sizeof(*mpf)); + if (mpf->physptr) + smp_reserve_memory(mpf); + + ret = 1; + } + early_memunmap(bp, length); + + if (ret) + break; + + base += 16; + length -= 16; + } + return ret; +} + +void __init default_find_smp_config(void) +{ + unsigned int address; + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... + * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0, 0x400) || + smp_scan_config(639 * 0x400, 0x400) || + smp_scan_config(0xF0000, 0x10000)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + * + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. + */ + + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400); +} + +#ifdef CONFIG_X86_IO_APIC +static u8 __initdata irq_used[MAX_IRQ_SOURCES]; + +static int __init get_MP_intsrc_index(struct mpc_intsrc *m) +{ + int i; + + if (m->irqtype != mp_INT) + return 0; + + if (m->irqflag != (MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW)) + return 0; + + /* not legacy */ + + for (i = 0; i < mp_irq_entries; i++) { + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL | + MP_IRQPOL_ACTIVE_LOW)) + continue; + + if (mp_irqs[i].srcbus != m->srcbus) + continue; + if (mp_irqs[i].srcbusirq != m->srcbusirq) + continue; + if (irq_used[i]) { + /* already claimed */ + return -2; + } + irq_used[i] = 1; + return i; + } + + /* not found */ + return -1; +} + +#define SPARE_SLOT_NUM 20 + +static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; + +static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) +{ + int i; + + apic_printk(APIC_VERBOSE, "OLD "); + print_mp_irq_info(m); + + i = get_MP_intsrc_index(m); + if (i > 0) { + memcpy(m, &mp_irqs[i], sizeof(*m)); + apic_printk(APIC_VERBOSE, "NEW "); + print_mp_irq_info(&mp_irqs[i]); + return; + } + if (!i) { + /* legacy, do nothing */ + return; + } + if (*nr_m_spare < SPARE_SLOT_NUM) { + /* + * not found (-1), or duplicated (-2) are invalid entries, + * we need to use the slot later + */ + m_spare[*nr_m_spare] = m; + *nr_m_spare += 1; + } +} + +static int __init +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) +{ + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; + } + + return 0; +} +#else /* CONFIG_X86_IO_APIC */ +static +inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} +#endif /* CONFIG_X86_IO_APIC */ + +static int __init replace_intsrc_all(struct mpc_table *mpc, + unsigned long mpc_new_phys, + unsigned long mpc_new_length) +{ +#ifdef CONFIG_X86_IO_APIC + int i; +#endif + int count = sizeof(*mpc); + int nr_m_spare = 0; + unsigned char *mpt = ((unsigned char *)mpc) + count; + + pr_info("mpc_length %x\n", mpc->length); + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + goto out; + } + } + +#ifdef CONFIG_X86_IO_APIC + for (i = 0; i < mp_irq_entries; i++) { + if (irq_used[i]) + continue; + + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL | + MP_IRQPOL_ACTIVE_LOW)) + continue; + + if (nr_m_spare > 0) { + apic_printk(APIC_VERBOSE, "*NEW* found\n"); + nr_m_spare--; + memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i])); + m_spare[nr_m_spare] = NULL; + } else { + struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; + count += sizeof(struct mpc_intsrc); + if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) + goto out; + memcpy(m, &mp_irqs[i], sizeof(*m)); + mpc->length = count; + mpt += sizeof(struct mpc_intsrc); + } + print_mp_irq_info(&mp_irqs[i]); + } +#endif +out: + /* update checksum */ + mpc->checksum = 0; + mpc->checksum -= mpf_checksum((unsigned char *)mpc, mpc->length); + + return 0; +} + +int enable_update_mptable; + +static int __init update_mptable_setup(char *str) +{ + enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif + return 0; +} +early_param("update_mptable", update_mptable_setup); + +static unsigned long __initdata mpc_new_phys; +static unsigned long mpc_new_length __initdata = 4096; + +/* alloc_mptable or alloc_mptable=4k */ +static int __initdata alloc_mptable; +static int __init parse_alloc_mptable_opt(char *p) +{ + enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif + alloc_mptable = 1; + if (!p) + return 0; + mpc_new_length = memparse(p, &p); + return 0; +} +early_param("alloc_mptable", parse_alloc_mptable_opt); + +void __init e820__memblock_alloc_reserved_mpc_new(void) +{ + if (enable_update_mptable && alloc_mptable) + mpc_new_phys = e820__memblock_alloc_reserved(mpc_new_length, 4); +} + +static int __init update_mp_table(void) +{ + char str[16]; + char oem[10]; + struct mpf_intel *mpf; + struct mpc_table *mpc, *mpc_new; + unsigned long size; + + if (!enable_update_mptable) + return 0; + + if (!mpf_found) + return 0; + + mpf = early_memremap(mpf_base, sizeof(*mpf)); + if (!mpf) { + pr_err("MPTABLE: mpf early_memremap() failed\n"); + return 0; + } + + /* + * Now see if we need to go further. + */ + if (mpf->feature1) + goto do_unmap_mpf; + + if (!mpf->physptr) + goto do_unmap_mpf; + + size = get_mpc_size(mpf->physptr); + mpc = early_memremap(mpf->physptr, size); + if (!mpc) { + pr_err("MPTABLE: mpc early_memremap() failed\n"); + goto do_unmap_mpf; + } + + if (!smp_check_mpc(mpc, oem, str)) + goto do_unmap_mpc; + + pr_info("mpf: %llx\n", (u64)mpf_base); + pr_info("physptr: %x\n", mpf->physptr); + + if (mpc_new_phys && mpc->length > mpc_new_length) { + mpc_new_phys = 0; + pr_info("mpc_new_length is %ld, please use alloc_mptable=8k\n", + mpc_new_length); + } + + if (!mpc_new_phys) { + unsigned char old, new; + /* check if we can change the position */ + mpc->checksum = 0; + old = mpf_checksum((unsigned char *)mpc, mpc->length); + mpc->checksum = 0xff; + new = mpf_checksum((unsigned char *)mpc, mpc->length); + if (old == new) { + pr_info("mpc is readonly, please try alloc_mptable instead\n"); + goto do_unmap_mpc; + } + pr_info("use in-position replacing\n"); + } else { + mpc_new = early_memremap(mpc_new_phys, mpc_new_length); + if (!mpc_new) { + pr_err("MPTABLE: new mpc early_memremap() failed\n"); + goto do_unmap_mpc; + } + mpf->physptr = mpc_new_phys; + memcpy(mpc_new, mpc, mpc->length); + early_memunmap(mpc, size); + mpc = mpc_new; + size = mpc_new_length; + /* check if we can modify that */ + if (mpc_new_phys - mpf->physptr) { + struct mpf_intel *mpf_new; + /* steal 16 bytes from [0, 1k) */ + mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new)); + if (!mpf_new) { + pr_err("MPTABLE: new mpf early_memremap() failed\n"); + goto do_unmap_mpc; + } + pr_info("mpf new: %x\n", 0x400 - 16); + memcpy(mpf_new, mpf, 16); + early_memunmap(mpf, sizeof(*mpf)); + mpf = mpf_new; + mpf->physptr = mpc_new_phys; + } + mpf->checksum = 0; + mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16); + pr_info("physptr new: %x\n", mpf->physptr); + } + + /* + * only replace the one with mp_INT and + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW, + * already in mp_irqs , stored by ... and mp_config_acpi_gsi, + * may need pci=routeirq for all coverage + */ + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); + +do_unmap_mpc: + early_memunmap(mpc, size); + +do_unmap_mpf: + early_memunmap(mpf, sizeof(*mpf)); + + return 0; +} + +late_initcall(update_mp_table); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c new file mode 100644 index 000000000..79f900ffd --- /dev/null +++ b/arch/x86/kernel/msr.c @@ -0,0 +1,335 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* ----------------------------------------------------------------------- * + * + * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved + * Copyright 2009 Intel Corporation; author: H. Peter Anvin + * + * ----------------------------------------------------------------------- */ + +/* + * x86 MSR access device + * + * This device is accessed by lseek() to the appropriate register number + * and then read/write in chunks of 8 bytes. A larger size means multiple + * reads or writes of the same register. + * + * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on + * an SMP box will direct the access to CPU %d. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static struct class *msr_class; +static enum cpuhp_state cpuhp_msr_state; + +enum allow_write_msrs { + MSR_WRITES_ON, + MSR_WRITES_OFF, + MSR_WRITES_DEFAULT, +}; + +static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT; + +static ssize_t msr_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u32 __user *tmp = (u32 __user *) buf; + u32 data[2]; + u32 reg = *ppos; + int cpu = iminor(file_inode(file)); + int err = 0; + ssize_t bytes = 0; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 8) { + err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); + if (err) + break; + if (copy_to_user(tmp, &data, 8)) { + err = -EFAULT; + break; + } + tmp += 2; + bytes += 8; + } + + return bytes ? bytes : err; +} + +static int filter_write(u32 reg) +{ + /* + * MSRs writes usually happen all at once, and can easily saturate kmsg. + * Only allow one message every 30 seconds. + * + * It's possible to be smarter here and do it (for example) per-MSR, but + * it would certainly be more complex, and this is enough at least to + * avoid saturating the ring buffer. + */ + static DEFINE_RATELIMIT_STATE(fw_rs, 30 * HZ, 1); + + switch (allow_writes) { + case MSR_WRITES_ON: return 0; + case MSR_WRITES_OFF: return -EPERM; + default: break; + } + + if (!__ratelimit(&fw_rs)) + return 0; + + if (reg == MSR_IA32_ENERGY_PERF_BIAS) + return 0; + + pr_err("Write to unrecognized MSR 0x%x by %s (pid: %d). Please report to x86@kernel.org.\n", + reg, current->comm, current->pid); + + return 0; +} + +static ssize_t msr_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + const u32 __user *tmp = (const u32 __user *)buf; + u32 data[2]; + u32 reg = *ppos; + int cpu = iminor(file_inode(file)); + int err = 0; + ssize_t bytes = 0; + + err = security_locked_down(LOCKDOWN_MSR); + if (err) + return err; + + err = filter_write(reg); + if (err) + return err; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 8) { + if (copy_from_user(&data, tmp, 8)) { + err = -EFAULT; + break; + } + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + + err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); + if (err) + break; + + tmp += 2; + bytes += 8; + } + + return bytes ? bytes : err; +} + +static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) +{ + u32 __user *uregs = (u32 __user *)arg; + u32 regs[8]; + int cpu = iminor(file_inode(file)); + int err; + + switch (ioc) { + case X86_IOC_RDMSR_REGS: + if (!(file->f_mode & FMODE_READ)) { + err = -EBADF; + break; + } + if (copy_from_user(®s, uregs, sizeof(regs))) { + err = -EFAULT; + break; + } + err = rdmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, ®s, sizeof(regs))) + err = -EFAULT; + break; + + case X86_IOC_WRMSR_REGS: + if (!(file->f_mode & FMODE_WRITE)) { + err = -EBADF; + break; + } + if (copy_from_user(®s, uregs, sizeof(regs))) { + err = -EFAULT; + break; + } + err = security_locked_down(LOCKDOWN_MSR); + if (err) + break; + + err = filter_write(regs[1]); + if (err) + return err; + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + + err = wrmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, ®s, sizeof(regs))) + err = -EFAULT; + break; + + default: + err = -ENOTTY; + break; + } + + return err; +} + +static int msr_open(struct inode *inode, struct file *file) +{ + unsigned int cpu = iminor(file_inode(file)); + struct cpuinfo_x86 *c; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + + c = &cpu_data(cpu); + if (!cpu_has(c, X86_FEATURE_MSR)) + return -EIO; /* MSR not supported */ + + return 0; +} + +/* + * File operations we support + */ +static const struct file_operations msr_fops = { + .owner = THIS_MODULE, + .llseek = no_seek_end_llseek, + .read = msr_read, + .write = msr_write, + .open = msr_open, + .unlocked_ioctl = msr_ioctl, + .compat_ioctl = msr_ioctl, +}; + +static int msr_device_create(unsigned int cpu) +{ + struct device *dev; + + dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, + "msr%d", cpu); + return PTR_ERR_OR_ZERO(dev); +} + +static int msr_device_destroy(unsigned int cpu) +{ + device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + return 0; +} + +static char *msr_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); +} + +static int __init msr_init(void) +{ + int err; + + if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { + pr_err("unable to get major %d for msr\n", MSR_MAJOR); + return -EBUSY; + } + msr_class = class_create(THIS_MODULE, "msr"); + if (IS_ERR(msr_class)) { + err = PTR_ERR(msr_class); + goto out_chrdev; + } + msr_class->devnode = msr_devnode; + + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online", + msr_device_create, msr_device_destroy); + if (err < 0) + goto out_class; + cpuhp_msr_state = err; + return 0; + +out_class: + class_destroy(msr_class); +out_chrdev: + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); + return err; +} +module_init(msr_init); + +static void __exit msr_exit(void) +{ + cpuhp_remove_state(cpuhp_msr_state); + class_destroy(msr_class); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); +} +module_exit(msr_exit) + +static int set_allow_writes(const char *val, const struct kernel_param *cp) +{ + /* val is NUL-terminated, see kernfs_fop_write() */ + char *s = strstrip((char *)val); + + if (!strcmp(s, "on")) + allow_writes = MSR_WRITES_ON; + else if (!strcmp(s, "off")) + allow_writes = MSR_WRITES_OFF; + else + allow_writes = MSR_WRITES_DEFAULT; + + return 0; +} + +static int get_allow_writes(char *buf, const struct kernel_param *kp) +{ + const char *res; + + switch (allow_writes) { + case MSR_WRITES_ON: res = "on"; break; + case MSR_WRITES_OFF: res = "off"; break; + default: res = "default"; break; + } + + return sprintf(buf, "%s\n", res); +} + +static const struct kernel_param_ops allow_writes_ops = { + .set = set_allow_writes, + .get = get_allow_writes +}; + +module_param_cb(allow_writes, &allow_writes_ops, NULL, 0600); + +MODULE_AUTHOR("H. Peter Anvin "); +MODULE_DESCRIPTION("x86 generic MSR driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c new file mode 100644 index 000000000..2ef961cf4 --- /dev/null +++ b/arch/x86/kernel/nmi.c @@ -0,0 +1,552 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + * Copyright (C) 2011 Don Zickus Red Hat, Inc. + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +/* + * Handle hardware traps and faults. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +struct nmi_desc { + raw_spinlock_t lock; + struct list_head head; +}; + +static struct nmi_desc nmi_desc[NMI_MAX] = +{ + { + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), + .head = LIST_HEAD_INIT(nmi_desc[0].head), + }, + { + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), + .head = LIST_HEAD_INIT(nmi_desc[1].head), + }, + { + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), + .head = LIST_HEAD_INIT(nmi_desc[2].head), + }, + { + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), + .head = LIST_HEAD_INIT(nmi_desc[3].head), + }, + +}; + +struct nmi_stats { + unsigned int normal; + unsigned int unknown; + unsigned int external; + unsigned int swallow; +}; + +static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); + +static int ignore_nmis __read_mostly; + +int unknown_nmi_panic; +/* + * Prevent NMI reason port (0x61) being accessed simultaneously, can + * only be used in NMI handler. + */ +static DEFINE_RAW_SPINLOCK(nmi_reason_lock); + +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + +#define nmi_to_desc(type) (&nmi_desc[type]) + +static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC; + +static int __init nmi_warning_debugfs(void) +{ + debugfs_create_u64("nmi_longest_ns", 0644, + arch_debugfs_dir, &nmi_longest_ns); + return 0; +} +fs_initcall(nmi_warning_debugfs); + +static void nmi_check_duration(struct nmiaction *action, u64 duration) +{ + int remainder_ns, decimal_msecs; + + if (duration < nmi_longest_ns || duration < action->max_duration) + return; + + action->max_duration = duration; + + remainder_ns = do_div(duration, (1000 * 1000)); + decimal_msecs = remainder_ns / 1000; + + printk_ratelimited(KERN_INFO + "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", + action->handler, duration, decimal_msecs); +} + +static int nmi_handle(unsigned int type, struct pt_regs *regs) +{ + struct nmi_desc *desc = nmi_to_desc(type); + struct nmiaction *a; + int handled=0; + + rcu_read_lock(); + + /* + * NMIs are edge-triggered, which means if you have enough + * of them concurrently, you can lose some because only one + * can be latched at any given time. Walk the whole list + * to handle those situations. + */ + list_for_each_entry_rcu(a, &desc->head, list) { + int thishandled; + u64 delta; + + delta = sched_clock(); + thishandled = a->handler(type, regs); + handled += thishandled; + delta = sched_clock() - delta; + trace_nmi_handler(a->handler, (int)delta, thishandled); + + nmi_check_duration(a, delta); + } + + rcu_read_unlock(); + + /* return total number of NMI events handled */ + return handled; +} +NOKPROBE_SYMBOL(nmi_handle); + +int __register_nmi_handler(unsigned int type, struct nmiaction *action) +{ + struct nmi_desc *desc = nmi_to_desc(type); + unsigned long flags; + + if (!action->handler) + return -EINVAL; + + raw_spin_lock_irqsave(&desc->lock, flags); + + /* + * Indicate if there are multiple registrations on the + * internal NMI handler call chains (SERR and IO_CHECK). + */ + WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); + WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); + + /* + * some handlers need to be executed first otherwise a fake + * event confuses some handlers (kdump uses this flag) + */ + if (action->flags & NMI_FLAG_FIRST) + list_add_rcu(&action->list, &desc->head); + else + list_add_tail_rcu(&action->list, &desc->head); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} +EXPORT_SYMBOL(__register_nmi_handler); + +void unregister_nmi_handler(unsigned int type, const char *name) +{ + struct nmi_desc *desc = nmi_to_desc(type); + struct nmiaction *n; + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + + list_for_each_entry_rcu(n, &desc->head, list) { + /* + * the name passed in to describe the nmi handler + * is used as the lookup key + */ + if (!strcmp(n->name, name)) { + WARN(in_nmi(), + "Trying to free NMI (%s) from NMI context!\n", n->name); + list_del_rcu(&n->list); + break; + } + } + + raw_spin_unlock_irqrestore(&desc->lock, flags); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(unregister_nmi_handler); + +static void +pci_serr_error(unsigned char reason, struct pt_regs *regs) +{ + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_SERR, regs)) + return; + + pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + if (panic_on_unrecovered_nmi) + nmi_panic(regs, "NMI: Not continuing"); + + pr_emerg("Dazed and confused, but trying to continue\n"); + + /* Clear and disable the PCI SERR error line. */ + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; + outb(reason, NMI_REASON_PORT); +} +NOKPROBE_SYMBOL(pci_serr_error); + +static void +io_check_error(unsigned char reason, struct pt_regs *regs) +{ + unsigned long i; + + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_IO_CHECK, regs)) + return; + + pr_emerg( + "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", + reason, smp_processor_id()); + show_regs(regs); + + if (panic_on_io_nmi) { + nmi_panic(regs, "NMI IOCK error: Not continuing"); + + /* + * If we end up here, it means we have received an NMI while + * processing panic(). Simply return without delaying and + * re-enabling NMIs. + */ + return; + } + + /* Re-enable the IOCK line, wait for a few seconds */ + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); + + i = 20000; + while (--i) { + touch_nmi_watchdog(); + udelay(100); + } + + reason &= ~NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); +} +NOKPROBE_SYMBOL(io_check_error); + +static void +unknown_nmi_error(unsigned char reason, struct pt_regs *regs) +{ + int handled; + + /* + * Use 'false' as back-to-back NMIs are dealt with one level up. + * Of course this makes having multiple 'unknown' handlers useless + * as only the first one is ever run (unless it can actually determine + * if it caused the NMI) + */ + handled = nmi_handle(NMI_UNKNOWN, regs); + if (handled) { + __this_cpu_add(nmi_stats.unknown, handled); + return; + } + + __this_cpu_add(nmi_stats.unknown, 1); + + pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + pr_emerg("Do you have a strange power saving mode enabled?\n"); + if (unknown_nmi_panic || panic_on_unrecovered_nmi) + nmi_panic(regs, "NMI: Not continuing"); + + pr_emerg("Dazed and confused, but trying to continue\n"); +} +NOKPROBE_SYMBOL(unknown_nmi_error); + +static DEFINE_PER_CPU(bool, swallow_nmi); +static DEFINE_PER_CPU(unsigned long, last_nmi_rip); + +static noinstr void default_do_nmi(struct pt_regs *regs) +{ + unsigned char reason = 0; + int handled; + bool b2b = false; + + /* + * CPU-specific NMI must be processed before non-CPU-specific + * NMI, otherwise we may lose it, because the CPU-specific + * NMI can not be detected/processed on other CPUs. + */ + + /* + * Back-to-back NMIs are interesting because they can either + * be two NMI or more than two NMIs (any thing over two is dropped + * due to NMI being edge-triggered). If this is the second half + * of the back-to-back NMI, assume we dropped things and process + * more handlers. Otherwise reset the 'swallow' NMI behaviour + */ + if (regs->ip == __this_cpu_read(last_nmi_rip)) + b2b = true; + else + __this_cpu_write(swallow_nmi, false); + + __this_cpu_write(last_nmi_rip, regs->ip); + + instrumentation_begin(); + + handled = nmi_handle(NMI_LOCAL, regs); + __this_cpu_add(nmi_stats.normal, handled); + if (handled) { + /* + * There are cases when a NMI handler handles multiple + * events in the current NMI. One of these events may + * be queued for in the next NMI. Because the event is + * already handled, the next NMI will result in an unknown + * NMI. Instead lets flag this for a potential NMI to + * swallow. + */ + if (handled > 1) + __this_cpu_write(swallow_nmi, true); + goto out; + } + + /* + * Non-CPU-specific NMI: NMI sources can be processed on any CPU. + * + * Another CPU may be processing panic routines while holding + * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping, + * and if so, call its callback directly. If there is no CPU preparing + * crash dump, we simply loop here. + */ + while (!raw_spin_trylock(&nmi_reason_lock)) { + run_crash_ipi_callback(regs); + cpu_relax(); + } + + reason = x86_platform.get_nmi_reason(); + + if (reason & NMI_REASON_MASK) { + if (reason & NMI_REASON_SERR) + pci_serr_error(reason, regs); + else if (reason & NMI_REASON_IOCHK) + io_check_error(reason, regs); +#ifdef CONFIG_X86_32 + /* + * Reassert NMI in case it became active + * meanwhile as it's edge-triggered: + */ + reassert_nmi(); +#endif + __this_cpu_add(nmi_stats.external, 1); + raw_spin_unlock(&nmi_reason_lock); + goto out; + } + raw_spin_unlock(&nmi_reason_lock); + + /* + * Only one NMI can be latched at a time. To handle + * this we may process multiple nmi handlers at once to + * cover the case where an NMI is dropped. The downside + * to this approach is we may process an NMI prematurely, + * while its real NMI is sitting latched. This will cause + * an unknown NMI on the next run of the NMI processing. + * + * We tried to flag that condition above, by setting the + * swallow_nmi flag when we process more than one event. + * This condition is also only present on the second half + * of a back-to-back NMI, so we flag that condition too. + * + * If both are true, we assume we already processed this + * NMI previously and we swallow it. Otherwise we reset + * the logic. + * + * There are scenarios where we may accidentally swallow + * a 'real' unknown NMI. For example, while processing + * a perf NMI another perf NMI comes in along with a + * 'real' unknown NMI. These two NMIs get combined into + * one (as described above). When the next NMI gets + * processed, it will be flagged by perf as handled, but + * no one will know that there was a 'real' unknown NMI sent + * also. As a result it gets swallowed. Or if the first + * perf NMI returns two events handled then the second + * NMI will get eaten by the logic below, again losing a + * 'real' unknown NMI. But this is the best we can do + * for now. + */ + if (b2b && __this_cpu_read(swallow_nmi)) + __this_cpu_add(nmi_stats.swallow, 1); + else + unknown_nmi_error(reason, regs); + +out: + instrumentation_end(); +} + +/* + * NMIs can page fault or hit breakpoints which will cause it to lose + * its NMI context with the CPU when the breakpoint or page fault does an IRET. + * + * As a result, NMIs can nest if NMIs get unmasked due an IRET during + * NMI processing. On x86_64, the asm glue protects us from nested NMIs + * if the outer NMI came from kernel mode, but we can still nest if the + * outer NMI came from user mode. + * + * To handle these nested NMIs, we have three states: + * + * 1) not running + * 2) executing + * 3) latched + * + * When no NMI is in progress, it is in the "not running" state. + * When an NMI comes in, it goes into the "executing" state. + * Normally, if another NMI is triggered, it does not interrupt + * the running NMI and the HW will simply latch it so that when + * the first NMI finishes, it will restart the second NMI. + * (Note, the latch is binary, thus multiple NMIs triggering, + * when one is running, are ignored. Only one NMI is restarted.) + * + * If an NMI executes an iret, another NMI can preempt it. We do not + * want to allow this new NMI to run, but we want to execute it when the + * first one finishes. We set the state to "latched", and the exit of + * the first NMI will perform a dec_return, if the result is zero + * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the + * dec_return would have set the state to NMI_EXECUTING (what we want it + * to be when we are running). In this case, we simply jump back to + * rerun the NMI handler again, and restart the 'latched' NMI. + * + * No trap (breakpoint or page fault) should be hit before nmi_restart, + * thus there is no race between the first check of state for NOT_RUNNING + * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs + * at this point. + * + * In case the NMI takes a page fault, we need to save off the CR2 + * because the NMI could have preempted another page fault and corrupt + * the CR2 that is about to be read. As nested NMIs must be restarted + * and they can not take breakpoints or page faults, the update of the + * CR2 must be done before converting the nmi state back to NOT_RUNNING. + * Otherwise, there would be a race of another nested NMI coming in + * after setting state to NOT_RUNNING but before updating the nmi_cr2. + */ +enum nmi_states { + NMI_NOT_RUNNING = 0, + NMI_EXECUTING, + NMI_LATCHED, +}; +static DEFINE_PER_CPU(enum nmi_states, nmi_state); +static DEFINE_PER_CPU(unsigned long, nmi_cr2); +static DEFINE_PER_CPU(unsigned long, nmi_dr7); + +DEFINE_IDTENTRY_RAW(exc_nmi) +{ + irqentry_state_t irq_state; + + /* + * Re-enable NMIs right here when running as an SEV-ES guest. This might + * cause nested NMIs, but those can be handled safely. + */ + sev_es_nmi_complete(); + + if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) + return; + + if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { + this_cpu_write(nmi_state, NMI_LATCHED); + return; + } + this_cpu_write(nmi_state, NMI_EXECUTING); + this_cpu_write(nmi_cr2, read_cr2()); +nmi_restart: + + /* + * Needs to happen before DR7 is accessed, because the hypervisor can + * intercept DR7 reads/writes, turning those into #VC exceptions. + */ + sev_es_ist_enter(regs); + + this_cpu_write(nmi_dr7, local_db_save()); + + irq_state = irqentry_nmi_enter(regs); + + inc_irq_stat(__nmi_count); + + if (!ignore_nmis) + default_do_nmi(regs); + + irqentry_nmi_exit(regs, irq_state); + + local_db_restore(this_cpu_read(nmi_dr7)); + + sev_es_ist_exit(); + + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) + write_cr2(this_cpu_read(nmi_cr2)); + if (this_cpu_dec_return(nmi_state)) + goto nmi_restart; + + if (user_mode(regs)) + mds_user_clear_cpu_buffers(); +} + +#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) +DEFINE_IDTENTRY_RAW(exc_nmi_noist) +{ + exc_nmi(regs); +} +#endif +#if IS_MODULE(CONFIG_KVM_INTEL) +EXPORT_SYMBOL_GPL(asm_exc_nmi_noist); +#endif + +void stop_nmi(void) +{ + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; +} + +/* reset the back-to-back NMI logic */ +void local_touch_nmi(void) +{ + __this_cpu_write(last_nmi_rip, 0); +} +EXPORT_SYMBOL_GPL(local_touch_nmi); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c new file mode 100644 index 000000000..a1a96df3d --- /dev/null +++ b/arch/x86/kernel/nmi_selftest.c @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * arch/x86/kernel/nmi-selftest.c + * + * Testsuite for NMI: IPIs + * + * Started by Don Zickus: + * (using lib/locking-selftest.c as a guide) + * + * Copyright (C) 2011 Red Hat, Inc., Don Zickus + */ + +#include +#include +#include +#include +#include + +#include +#include + +#define SUCCESS 0 +#define FAILURE 1 +#define TIMEOUT 2 + +static int __initdata nmi_fail; + +/* check to see if NMI IPIs work on this machine */ +static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata; + +static int __initdata testcase_total; +static int __initdata testcase_successes; +static int __initdata expected_testcase_failures; +static int __initdata unexpected_testcase_failures; +static int __initdata unexpected_testcase_unknowns; + +static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) +{ + unexpected_testcase_unknowns++; + return NMI_HANDLED; +} + +static void __init init_nmi_testsuite(void) +{ + /* trap all the unknown NMIs we may generate */ + register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk", + __initdata); +} + +static void __init cleanup_nmi_testsuite(void) +{ + unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); +} + +static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) +{ + int cpu = raw_smp_processor_id(); + + if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask))) + return NMI_HANDLED; + + return NMI_DONE; +} + +static void __init test_nmi_ipi(struct cpumask *mask) +{ + unsigned long timeout; + + if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + NMI_FLAG_FIRST, "nmi_selftest", __initdata)) { + nmi_fail = FAILURE; + return; + } + + /* sync above data before sending NMI */ + wmb(); + + apic->send_IPI_mask(mask, NMI_VECTOR); + + /* Don't wait longer than a second */ + timeout = USEC_PER_SEC; + while (!cpumask_empty(mask) && --timeout) + udelay(1); + + /* What happens if we timeout, do we still unregister?? */ + unregister_nmi_handler(NMI_LOCAL, "nmi_selftest"); + + if (!timeout) + nmi_fail = TIMEOUT; + return; +} + +static void __init remote_ipi(void) +{ + cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + if (!cpumask_empty(to_cpumask(nmi_ipi_mask))) + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void __init local_ipi(void) +{ + cpumask_clear(to_cpumask(nmi_ipi_mask)); + cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void __init reset_nmi(void) +{ + nmi_fail = 0; +} + +static void __init dotest(void (*testcase_fn)(void), int expected) +{ + testcase_fn(); + /* + * Filter out expected failures: + */ + if (nmi_fail != expected) { + unexpected_testcase_failures++; + + if (nmi_fail == FAILURE) + printk(KERN_CONT "FAILED |"); + else if (nmi_fail == TIMEOUT) + printk(KERN_CONT "TIMEOUT|"); + else + printk(KERN_CONT "ERROR |"); + dump_stack(); + } else { + testcase_successes++; + printk(KERN_CONT " ok |"); + } + testcase_total++; + + reset_nmi(); +} + +static inline void __init print_testname(const char *testname) +{ + printk("%12s:", testname); +} + +void __init nmi_selftest(void) +{ + init_nmi_testsuite(); + + /* + * Run the testsuite: + */ + printk("----------------\n"); + printk("| NMI testsuite:\n"); + printk("--------------------\n"); + + print_testname("remote IPI"); + dotest(remote_ipi, SUCCESS); + printk(KERN_CONT "\n"); + print_testname("local IPI"); + dotest(local_ipi, SUCCESS); + printk(KERN_CONT "\n"); + + cleanup_nmi_testsuite(); + + if (unexpected_testcase_failures) { + printk("--------------------\n"); + printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", + unexpected_testcase_failures, testcase_total); + printk("-----------------------------------------------------------------\n"); + } else if (expected_testcase_failures && testcase_successes) { + printk("--------------------\n"); + printk("%3d out of %3d testcases failed, as expected. |\n", + expected_testcase_failures, testcase_total); + printk("----------------------------------------------------\n"); + } else if (expected_testcase_failures && !testcase_successes) { + printk("--------------------\n"); + printk("All %3d testcases failed, as expected. |\n", + expected_testcase_failures); + printk("----------------------------------------\n"); + } else { + printk("--------------------\n"); + printk("Good, all %3d testcases passed! |\n", + testcase_successes); + printk("---------------------------------\n"); + } +} diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c new file mode 100644 index 000000000..4f75d0cf6 --- /dev/null +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. + */ +#include +#include +#include + +#include + +__visible void __native_queued_spin_unlock(struct qspinlock *lock) +{ + native_queued_spin_unlock(lock); +} +PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock); + +bool pv_is_native_spin_unlock(void) +{ + return pv_ops.lock.queued_spin_unlock.func == + __raw_callee_save___native_queued_spin_unlock; +} + +__visible bool __native_vcpu_is_preempted(long cpu) +{ + return false; +} +PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted); + +bool pv_is_native_vcpu_is_preempted(void) +{ + return pv_ops.lock.vcpu_is_preempted.func == + __raw_callee_save___native_vcpu_is_preempted; +} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c new file mode 100644 index 000000000..e21937680 --- /dev/null +++ b/arch/x86/kernel/paravirt.c @@ -0,0 +1,424 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Paravirtualization interfaces + Copyright (C) 2006 Rusty Russell IBM Corporation + + + 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * nop stub, which must not clobber anything *including the stack* to + * avoid confusing the entry prologues. + */ +extern void _paravirt_nop(void); +asm (".pushsection .entry.text, \"ax\"\n" + ".global _paravirt_nop\n" + "_paravirt_nop:\n\t" + ASM_RET + ".size _paravirt_nop, . - _paravirt_nop\n\t" + ".type _paravirt_nop, @function\n\t" + ".popsection"); + +void __init default_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + pv_info.name); +} + +/* Undefined instruction for dealing with missing ops pointers. */ +static const unsigned char ud2a[] = { 0x0f, 0x0b }; + +struct branch { + unsigned char opcode; + u32 delta; +} __attribute__((packed)); + +static unsigned paravirt_patch_call(void *insn_buff, const void *target, + unsigned long addr, unsigned len) +{ + const int call_len = 5; + struct branch *b = insn_buff; + unsigned long delta = (unsigned long)target - (addr+call_len); + + if (len < call_len) { + pr_warn("paravirt: Failed to patch indirect CALL at %ps\n", (void *)addr); + /* Kernel might not be viable if patching fails, bail out: */ + BUG_ON(1); + } + + b->opcode = 0xe8; /* call */ + b->delta = delta; + BUILD_BUG_ON(sizeof(*b) != call_len); + + return call_len; +} + +#ifdef CONFIG_PARAVIRT_XXL +/* identity function, which can be inlined */ +u64 notrace _paravirt_ident_64(u64 x) +{ + return x; +} + +static unsigned paravirt_patch_jmp(void *insn_buff, const void *target, + unsigned long addr, unsigned len) +{ + struct branch *b = insn_buff; + unsigned long delta = (unsigned long)target - (addr+5); + + if (len < 5) { +#ifdef CONFIG_RETPOLINE + WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr); +#endif + return len; /* call too long for patch site */ + } + + b->opcode = 0xe9; /* jmp */ + b->delta = delta; + + return 5; +} +#endif + +DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); + +void __init native_pv_lock_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) + static_branch_disable(&virt_spin_lock_key); +} + +unsigned paravirt_patch_default(u8 type, void *insn_buff, + unsigned long addr, unsigned len) +{ + /* + * Neat trick to map patch type back to the call within the + * corresponding structure. + */ + void *opfunc = *((void **)&pv_ops + type); + unsigned ret; + + if (opfunc == NULL) + /* If there's no function, patch it with a ud2a (BUG) */ + ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a)); + else if (opfunc == _paravirt_nop) + ret = 0; + +#ifdef CONFIG_PARAVIRT_XXL + /* identity functions just return their single argument */ + else if (opfunc == _paravirt_ident_64) + ret = paravirt_patch_ident_64(insn_buff, len); + + else if (type == PARAVIRT_PATCH(cpu.iret) || + type == PARAVIRT_PATCH(cpu.usergs_sysret64)) + /* If operation requires a jmp, then jmp */ + ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len); +#endif + else + /* Otherwise call the function. */ + ret = paravirt_patch_call(insn_buff, opfunc, addr, len); + + return ret; +} + +unsigned paravirt_patch_insns(void *insn_buff, unsigned len, + const char *start, const char *end) +{ + unsigned insn_len = end - start; + + /* Alternative instruction is too large for the patch site and we cannot continue: */ + BUG_ON(insn_len > len || start == NULL); + + memcpy(insn_buff, start, insn_len); + + return insn_len; +} + +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; + +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +/* These are in entry.S */ +extern void native_iret(void); +extern void native_usergs_sysret64(void); + +static struct resource reserve_ioports = { + .start = 0, + .end = IO_SPACE_LIMIT, + .name = "paravirt-ioport", + .flags = IORESOURCE_IO | IORESOURCE_BUSY, +}; + +/* + * Reserve the whole legacy IO space to prevent any legacy drivers + * from wasting time probing for their hardware. This is a fairly + * brute-force approach to disabling all non-virtual drivers. + * + * Note that this must be called very early to have any effect. + */ +int paravirt_disable_iospace(void) +{ + return request_resource(&ioport_resource, &reserve_ioports); +} + +static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; + +static inline void enter_lazy(enum paravirt_lazy_mode mode) +{ + BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + + this_cpu_write(paravirt_lazy_mode, mode); +} + +static void leave_lazy(enum paravirt_lazy_mode mode) +{ + BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode); + + this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); +} + +void paravirt_enter_lazy_mmu(void) +{ + enter_lazy(PARAVIRT_LAZY_MMU); +} + +void paravirt_leave_lazy_mmu(void) +{ + leave_lazy(PARAVIRT_LAZY_MMU); +} + +void paravirt_flush_lazy_mmu(void) +{ + preempt_disable(); + + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + arch_enter_lazy_mmu_mode(); + } + + preempt_enable(); +} + +#ifdef CONFIG_PARAVIRT_XXL +void paravirt_start_context_switch(struct task_struct *prev) +{ + BUG_ON(preemptible()); + + if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); + } + enter_lazy(PARAVIRT_LAZY_CPU); +} + +void paravirt_end_context_switch(struct task_struct *next) +{ + BUG_ON(preemptible()); + + leave_lazy(PARAVIRT_LAZY_CPU); + + if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) + arch_enter_lazy_mmu_mode(); +} +#endif + +enum paravirt_lazy_mode paravirt_get_lazy_mode(void) +{ + if (in_interrupt()) + return PARAVIRT_LAZY_NONE; + + return this_cpu_read(paravirt_lazy_mode); +} + +struct pv_info pv_info = { + .name = "bare hardware", +#ifdef CONFIG_PARAVIRT_XXL + .extra_user_64bit_cs = __USER_CS, +#endif +}; + +/* 64-bit pagetable entries */ +#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) + +struct paravirt_patch_template pv_ops = { + /* Init ops. */ + .init.patch = native_patch, + + /* Time ops. */ + .time.sched_clock = native_sched_clock, + .time.steal_clock = native_steal_clock, + + /* Cpu ops. */ + .cpu.io_delay = native_io_delay, + +#ifdef CONFIG_PARAVIRT_XXL + .cpu.cpuid = native_cpuid, + .cpu.get_debugreg = native_get_debugreg, + .cpu.set_debugreg = native_set_debugreg, + .cpu.read_cr0 = native_read_cr0, + .cpu.write_cr0 = native_write_cr0, + .cpu.write_cr4 = native_write_cr4, + .cpu.wbinvd = native_wbinvd, + .cpu.read_msr = native_read_msr, + .cpu.write_msr = native_write_msr, + .cpu.read_msr_safe = native_read_msr_safe, + .cpu.write_msr_safe = native_write_msr_safe, + .cpu.read_pmc = native_read_pmc, + .cpu.load_tr_desc = native_load_tr_desc, + .cpu.set_ldt = native_set_ldt, + .cpu.load_gdt = native_load_gdt, + .cpu.load_idt = native_load_idt, + .cpu.store_tr = native_store_tr, + .cpu.load_tls = native_load_tls, + .cpu.load_gs_index = native_load_gs_index, + .cpu.write_ldt_entry = native_write_ldt_entry, + .cpu.write_gdt_entry = native_write_gdt_entry, + .cpu.write_idt_entry = native_write_idt_entry, + + .cpu.alloc_ldt = paravirt_nop, + .cpu.free_ldt = paravirt_nop, + + .cpu.load_sp0 = native_load_sp0, + + .cpu.usergs_sysret64 = native_usergs_sysret64, + .cpu.iret = native_iret, + +#ifdef CONFIG_X86_IOPL_IOPERM + .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, + .cpu.update_io_bitmap = native_tss_update_io_bitmap, +#endif + + .cpu.start_context_switch = paravirt_nop, + .cpu.end_context_switch = paravirt_nop, + + /* Irq ops. */ + .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), + .irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), + .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), + .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), + .irq.safe_halt = native_safe_halt, + .irq.halt = native_halt, +#endif /* CONFIG_PARAVIRT_XXL */ + + /* Mmu ops. */ + .mmu.flush_tlb_user = native_flush_tlb_local, + .mmu.flush_tlb_kernel = native_flush_tlb_global, + .mmu.flush_tlb_one_user = native_flush_tlb_one_user, + .mmu.flush_tlb_others = native_flush_tlb_others, + .mmu.tlb_remove_table = + (void (*)(struct mmu_gather *, void *))tlb_remove_page, + + .mmu.exit_mmap = paravirt_nop, + +#ifdef CONFIG_PARAVIRT_XXL + .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2), + .mmu.write_cr2 = native_write_cr2, + .mmu.read_cr3 = __native_read_cr3, + .mmu.write_cr3 = native_write_cr3, + + .mmu.pgd_alloc = __paravirt_pgd_alloc, + .mmu.pgd_free = paravirt_nop, + + .mmu.alloc_pte = paravirt_nop, + .mmu.alloc_pmd = paravirt_nop, + .mmu.alloc_pud = paravirt_nop, + .mmu.alloc_p4d = paravirt_nop, + .mmu.release_pte = paravirt_nop, + .mmu.release_pmd = paravirt_nop, + .mmu.release_pud = paravirt_nop, + .mmu.release_p4d = paravirt_nop, + + .mmu.set_pte = native_set_pte, + .mmu.set_pmd = native_set_pmd, + + .mmu.ptep_modify_prot_start = __ptep_modify_prot_start, + .mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit, + + .mmu.set_pud = native_set_pud, + + .mmu.pmd_val = PTE_IDENT, + .mmu.make_pmd = PTE_IDENT, + + .mmu.pud_val = PTE_IDENT, + .mmu.make_pud = PTE_IDENT, + + .mmu.set_p4d = native_set_p4d, + +#if CONFIG_PGTABLE_LEVELS >= 5 + .mmu.p4d_val = PTE_IDENT, + .mmu.make_p4d = PTE_IDENT, + + .mmu.set_pgd = native_set_pgd, +#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ + + .mmu.pte_val = PTE_IDENT, + .mmu.pgd_val = PTE_IDENT, + + .mmu.make_pte = PTE_IDENT, + .mmu.make_pgd = PTE_IDENT, + + .mmu.dup_mmap = paravirt_nop, + .mmu.activate_mm = paravirt_nop, + + .mmu.lazy_mode = { + .enter = paravirt_nop, + .leave = paravirt_nop, + .flush = paravirt_nop, + }, + + .mmu.set_fixmap = native_set_fixmap, +#endif /* CONFIG_PARAVIRT_XXL */ + +#if defined(CONFIG_PARAVIRT_SPINLOCKS) + /* Lock ops. */ +#ifdef CONFIG_SMP + .lock.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, + .lock.queued_spin_unlock = + PV_CALLEE_SAVE(__native_queued_spin_unlock), + .lock.wait = paravirt_nop, + .lock.kick = paravirt_nop, + .lock.vcpu_is_preempted = + PV_CALLEE_SAVE(__native_vcpu_is_preempted), +#endif /* SMP */ +#endif +}; + +#ifdef CONFIG_PARAVIRT_XXL +/* At this point, native_get/set_debugreg has real function entries */ +NOKPROBE_SYMBOL(native_get_debugreg); +NOKPROBE_SYMBOL(native_set_debugreg); +NOKPROBE_SYMBOL(native_load_idt); +#endif + +EXPORT_SYMBOL(pv_ops); +EXPORT_SYMBOL_GPL(pv_info); diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c new file mode 100644 index 000000000..7c518b08a --- /dev/null +++ b/arch/x86/kernel/paravirt_patch.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include + +#define PSTART(d, m) \ + patch_data_##d.m + +#define PEND(d, m) \ + (PSTART(d, m) + sizeof(patch_data_##d.m)) + +#define PATCH(d, m, insn_buff, len) \ + paravirt_patch_insns(insn_buff, len, PSTART(d, m), PEND(d, m)) + +#define PATCH_CASE(ops, m, data, insn_buff, len) \ + case PARAVIRT_PATCH(ops.m): \ + return PATCH(data, ops##_##m, insn_buff, len) + +#ifdef CONFIG_PARAVIRT_XXL +struct patch_xxl { + const unsigned char irq_irq_disable[1]; + const unsigned char irq_irq_enable[1]; + const unsigned char irq_save_fl[2]; + const unsigned char mmu_read_cr2[3]; + const unsigned char mmu_read_cr3[3]; + const unsigned char mmu_write_cr3[3]; + const unsigned char irq_restore_fl[2]; + const unsigned char cpu_wbinvd[2]; + const unsigned char cpu_usergs_sysret64[6]; + const unsigned char mov64[3]; +}; + +static const struct patch_xxl patch_data_xxl = { + .irq_irq_disable = { 0xfa }, // cli + .irq_irq_enable = { 0xfb }, // sti + .irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax + .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax + .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax + .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 + .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq + .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd + .cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8, + 0x48, 0x0f, 0x07 }, // swapgs; sysretq + .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax +}; + +unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len) +{ + return PATCH(xxl, mov64, insn_buff, len); +} +# endif /* CONFIG_PARAVIRT_XXL */ + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +struct patch_lock { + unsigned char queued_spin_unlock[3]; + unsigned char vcpu_is_preempted[2]; +}; + +static const struct patch_lock patch_data_lock = { + .vcpu_is_preempted = { 0x31, 0xc0 }, // xor %eax, %eax + +# ifdef CONFIG_X86_64 + .queued_spin_unlock = { 0xc6, 0x07, 0x00 }, // movb $0, (%rdi) +# else + .queued_spin_unlock = { 0xc6, 0x00, 0x00 }, // movb $0, (%eax) +# endif +}; +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, + unsigned int len) +{ + switch (type) { + +#ifdef CONFIG_PARAVIRT_XXL + PATCH_CASE(irq, restore_fl, xxl, insn_buff, len); + PATCH_CASE(irq, save_fl, xxl, insn_buff, len); + PATCH_CASE(irq, irq_enable, xxl, insn_buff, len); + PATCH_CASE(irq, irq_disable, xxl, insn_buff, len); + + PATCH_CASE(mmu, read_cr2, xxl, insn_buff, len); + PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); + PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); + + PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len); + PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); +#endif + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + case PARAVIRT_PATCH(lock.queued_spin_unlock): + if (pv_is_native_spin_unlock()) + return PATCH(lock, queued_spin_unlock, insn_buff, len); + break; + + case PARAVIRT_PATCH(lock.vcpu_is_preempted): + if (pv_is_native_vcpu_is_preempted()) + return PATCH(lock, vcpu_is_preempted, insn_buff, len); + break; +#endif + default: + break; + } + + return paravirt_patch_default(type, insn_buff, addr, len); +} diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c new file mode 100644 index 000000000..de234e7a8 --- /dev/null +++ b/arch/x86/kernel/pci-dma.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static bool disable_dac_quirk __read_mostly; + +const struct dma_map_ops *dma_ops; +EXPORT_SYMBOL(dma_ops); + +#ifdef CONFIG_IOMMU_DEBUG +int panic_on_overflow __read_mostly = 1; +int force_iommu __read_mostly = 1; +#else +int panic_on_overflow __read_mostly = 0; +int force_iommu __read_mostly = 0; +#endif + +int iommu_merge __read_mostly = 0; + +int no_iommu __read_mostly; +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly = 0; + +extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; + +void __init pci_iommu_alloc(void) +{ + struct iommu_table_entry *p; + + sort_iommu_table(__iommu_table, __iommu_table_end); + check_iommu_entries(__iommu_table, __iommu_table_end); + + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && p->detect && p->detect() > 0) { + p->flags |= IOMMU_DETECTED; + if (p->early_init) + p->early_init(); + if (p->flags & IOMMU_FINISH_IF_DETECTED) + break; + } + } +} + +/* + * See for the iommu kernel + * parameter documentation. + */ +static __init int iommu_setup(char *p) +{ + iommu_merge = 1; + + if (!p) + return -EINVAL; + + while (*p) { + if (!strncmp(p, "off", 3)) + no_iommu = 1; + /* gart_parse_options has more force support */ + if (!strncmp(p, "force", 5)) + force_iommu = 1; + if (!strncmp(p, "noforce", 7)) { + iommu_merge = 0; + force_iommu = 0; + } + + if (!strncmp(p, "biomerge", 8)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic", 5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic", 7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge", 5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge", 7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac", 8)) + pr_warn("forcesac option ignored.\n"); + if (!strncmp(p, "allowdac", 8)) + pr_warn("allowdac option ignored.\n"); + if (!strncmp(p, "nodac", 5)) + pr_warn("nodac option ignored.\n"); + if (!strncmp(p, "usedac", 6)) { + disable_dac_quirk = true; + return 1; + } +#ifdef CONFIG_SWIOTLB + if (!strncmp(p, "soft", 4)) + swiotlb = 1; +#endif + if (!strncmp(p, "pt", 2)) + iommu_set_default_passthrough(true); + if (!strncmp(p, "nopt", 4)) + iommu_set_default_translated(true); + + gart_parse_options(p); + + p += strcspn(p, ","); + if (*p == ',') + ++p; + } + return 0; +} +early_param("iommu", iommu_setup); + +static int __init pci_iommu_init(void) +{ + struct iommu_table_entry *p; + + x86_init.iommu.iommu_init(); + + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && (p->flags & IOMMU_DETECTED) && p->late_init) + p->late_init(); + } + + return 0; +} +/* Must execute after PCI subsystem */ +rootfs_initcall(pci_iommu_init); + +#ifdef CONFIG_PCI +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ + +static int via_no_dac_cb(struct pci_dev *pdev, void *data) +{ + pdev->dev.bus_dma_limit = DMA_BIT_MASK(32); + return 0; +} + +static void via_no_dac(struct pci_dev *dev) +{ + if (!disable_dac_quirk) { + dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); + pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL); + } +} +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, 8, via_no_dac); +#endif diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c new file mode 100644 index 000000000..2e9006c1e --- /dev/null +++ b/arch/x86/kernel/pci-iommu_table.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + + +#define DEBUG 1 + +static struct iommu_table_entry * __init +find_dependents_of(struct iommu_table_entry *start, + struct iommu_table_entry *finish, + struct iommu_table_entry *q) +{ + struct iommu_table_entry *p; + + if (!q) + return NULL; + + for (p = start; p < finish; p++) + if (p->detect == q->depend) + return p; + + return NULL; +} + + +void __init sort_iommu_table(struct iommu_table_entry *start, + struct iommu_table_entry *finish) { + + struct iommu_table_entry *p, *q, tmp; + + for (p = start; p < finish; p++) { +again: + q = find_dependents_of(start, finish, p); + /* We are bit sneaky here. We use the memory address to figure + * out if the node we depend on is past our point, if so, swap. + */ + if (q > p) { + tmp = *p; + memmove(p, q, sizeof(*p)); + *q = tmp; + goto again; + } + } + +} + +#ifdef DEBUG +void __init check_iommu_entries(struct iommu_table_entry *start, + struct iommu_table_entry *finish) +{ + struct iommu_table_entry *p, *q, *x; + + /* Simple cyclic dependency checker. */ + for (p = start; p < finish; p++) { + q = find_dependents_of(start, finish, p); + x = find_dependents_of(start, finish, q); + if (p == x) { + printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n", + p->detect, q->detect); + /* Heavy handed way..*/ + x->depend = NULL; + } + } + + for (p = start; p < finish; p++) { + q = find_dependents_of(p, finish, p); + if (q && q > p) { + printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n", + p->detect, q->detect); + } + } +} +#else +void __init check_iommu_entries(struct iommu_table_entry *start, + struct iommu_table_entry *finish) +{ +} +#endif diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c new file mode 100644 index 000000000..c2cfa5e7c --- /dev/null +++ b/arch/x86/kernel/pci-swiotlb.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +int swiotlb __read_mostly; + +/* + * pci_swiotlb_detect_override - set swiotlb to 1 if necessary + * + * This returns non-zero if we are forced to use swiotlb (by the boot + * option). + */ +int __init pci_swiotlb_detect_override(void) +{ + if (swiotlb_force == SWIOTLB_FORCE) + swiotlb = 1; + + return swiotlb; +} +IOMMU_INIT_FINISH(pci_swiotlb_detect_override, + pci_xen_swiotlb_detect, + pci_swiotlb_init, + pci_swiotlb_late_init); + +/* + * If 4GB or more detected (and iommu=off not set) or if SME is active + * then set swiotlb to 1 and return 1. + */ +int __init pci_swiotlb_detect_4gb(void) +{ + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ + if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) + swiotlb = 1; + + /* + * If SME is active then swiotlb will be set to 1 so that bounce + * buffers are allocated and used for devices that do not support + * the addressing range required for the encryption mask. + */ + if (sme_active()) + swiotlb = 1; + + return swiotlb; +} +IOMMU_INIT(pci_swiotlb_detect_4gb, + pci_swiotlb_detect_override, + pci_swiotlb_init, + pci_swiotlb_late_init); + +void __init pci_swiotlb_init(void) +{ + if (swiotlb) + swiotlb_init(0); +} + +void __init pci_swiotlb_late_init(void) +{ + /* An IOMMU turned us off. */ + if (!swiotlb) + swiotlb_exit(); + else { + printk(KERN_INFO "PCI-DMA: " + "Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } +} diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c new file mode 100644 index 000000000..4a710ffff --- /dev/null +++ b/arch/x86/kernel/pcspeaker.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +static __init int add_pcspkr(void) +{ + struct platform_device *pd; + + pd = platform_device_register_simple("pcspkr", -1, NULL, 0); + + return PTR_ERR_OR_ZERO(pd); +} +device_initcall(add_pcspkr); diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c new file mode 100644 index 000000000..f9e5352b3 --- /dev/null +++ b/arch/x86/kernel/perf_regs.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_32 +#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX +#else +#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX +#endif + +#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) + +static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { + PT_REGS_OFFSET(PERF_REG_X86_AX, ax), + PT_REGS_OFFSET(PERF_REG_X86_BX, bx), + PT_REGS_OFFSET(PERF_REG_X86_CX, cx), + PT_REGS_OFFSET(PERF_REG_X86_DX, dx), + PT_REGS_OFFSET(PERF_REG_X86_SI, si), + PT_REGS_OFFSET(PERF_REG_X86_DI, di), + PT_REGS_OFFSET(PERF_REG_X86_BP, bp), + PT_REGS_OFFSET(PERF_REG_X86_SP, sp), + PT_REGS_OFFSET(PERF_REG_X86_IP, ip), + PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags), + PT_REGS_OFFSET(PERF_REG_X86_CS, cs), + PT_REGS_OFFSET(PERF_REG_X86_SS, ss), +#ifdef CONFIG_X86_32 + PT_REGS_OFFSET(PERF_REG_X86_DS, ds), + PT_REGS_OFFSET(PERF_REG_X86_ES, es), + PT_REGS_OFFSET(PERF_REG_X86_FS, fs), + PT_REGS_OFFSET(PERF_REG_X86_GS, gs), +#else + /* + * The pt_regs struct does not store + * ds, es, fs, gs in 64 bit mode. + */ + (unsigned int) -1, + (unsigned int) -1, + (unsigned int) -1, + (unsigned int) -1, +#endif +#ifdef CONFIG_X86_64 + PT_REGS_OFFSET(PERF_REG_X86_R8, r8), + PT_REGS_OFFSET(PERF_REG_X86_R9, r9), + PT_REGS_OFFSET(PERF_REG_X86_R10, r10), + PT_REGS_OFFSET(PERF_REG_X86_R11, r11), + PT_REGS_OFFSET(PERF_REG_X86_R12, r12), + PT_REGS_OFFSET(PERF_REG_X86_R13, r13), + PT_REGS_OFFSET(PERF_REG_X86_R14, r14), + PT_REGS_OFFSET(PERF_REG_X86_R15, r15), +#endif +}; + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + struct x86_perf_regs *perf_regs; + + if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) { + perf_regs = container_of(regs, struct x86_perf_regs, regs); + if (!perf_regs->xmm_regs) + return 0; + return perf_regs->xmm_regs[idx - PERF_REG_X86_XMM0]; + } + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset))) + return 0; + + return regs_get_register(regs, pt_regs_offset[idx]); +} + +#define PERF_REG_X86_RESERVED (((1ULL << PERF_REG_X86_XMM0) - 1) & \ + ~((1ULL << PERF_REG_X86_MAX) - 1)) + +#ifdef CONFIG_X86_32 +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \ + (1ULL << PERF_REG_X86_R9) | \ + (1ULL << PERF_REG_X86_R10) | \ + (1ULL << PERF_REG_X86_R11) | \ + (1ULL << PERF_REG_X86_R12) | \ + (1ULL << PERF_REG_X86_R13) | \ + (1ULL << PERF_REG_X86_R14) | \ + (1ULL << PERF_REG_X86_R15)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED))) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + return PERF_SAMPLE_REGS_ABI_32; +} + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs) +{ + regs_user->regs = task_pt_regs(current); + regs_user->abi = perf_reg_abi(current); +} +#else /* CONFIG_X86_64 */ +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \ + (1ULL << PERF_REG_X86_ES) | \ + (1ULL << PERF_REG_X86_FS) | \ + (1ULL << PERF_REG_X86_GS)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED))) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_IA32)) + return PERF_SAMPLE_REGS_ABI_32; + else + return PERF_SAMPLE_REGS_ABI_64; +} + +static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs); + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs) +{ + struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs); + struct pt_regs *user_regs = task_pt_regs(current); + + if (!in_nmi()) { + regs_user->regs = user_regs; + regs_user->abi = perf_reg_abi(current); + return; + } + + /* + * If we're in an NMI that interrupted task_pt_regs setup, then + * we can't sample user regs at all. This check isn't really + * sufficient, though, as we could be in an NMI inside an interrupt + * that happened during task_pt_regs setup. + */ + if (regs->sp > (unsigned long)&user_regs->r11 && + regs->sp <= (unsigned long)(user_regs + 1)) { + regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; + regs_user->regs = NULL; + return; + } + + /* + * These registers are always saved on 64-bit syscall entry. + * On 32-bit entry points, they are saved too except r8..r11. + */ + regs_user_copy->ip = user_regs->ip; + regs_user_copy->ax = user_regs->ax; + regs_user_copy->cx = user_regs->cx; + regs_user_copy->dx = user_regs->dx; + regs_user_copy->si = user_regs->si; + regs_user_copy->di = user_regs->di; + regs_user_copy->r8 = user_regs->r8; + regs_user_copy->r9 = user_regs->r9; + regs_user_copy->r10 = user_regs->r10; + regs_user_copy->r11 = user_regs->r11; + regs_user_copy->orig_ax = user_regs->orig_ax; + regs_user_copy->flags = user_regs->flags; + regs_user_copy->sp = user_regs->sp; + regs_user_copy->cs = user_regs->cs; + regs_user_copy->ss = user_regs->ss; + /* + * Store user space frame-pointer value on sample + * to facilitate stack unwinding for cases when + * user space executable code has such support + * enabled at compile time: + */ + regs_user_copy->bp = user_regs->bp; + + regs_user_copy->bx = -1; + regs_user_copy->r12 = -1; + regs_user_copy->r13 = -1; + regs_user_copy->r14 = -1; + regs_user_copy->r15 = -1; + /* + * For this to be at all useful, we need a reasonable guess for + * the ABI. Be careful: we're in NMI context, and we're + * considering current to be the current task, so we should + * be careful not to look at any other percpu variables that might + * change during context switches. + */ + regs_user->abi = user_64bit_mode(user_regs) ? + PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; + + regs_user->regs = regs_user_copy; +} +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c new file mode 100644 index 000000000..b348a672f --- /dev/null +++ b/arch/x86/kernel/platform-quirks.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include +#include + +void __init x86_early_init_platform_quirks(void) +{ + x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT; + x86_platform.legacy.rtc = 1; + x86_platform.legacy.warm_reset = 1; + x86_platform.legacy.reserve_bios_regions = 0; + x86_platform.legacy.devices.pnpbios = 1; + + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_PC: + x86_platform.legacy.reserve_bios_regions = 1; + break; + case X86_SUBARCH_XEN: + x86_platform.legacy.devices.pnpbios = 0; + x86_platform.legacy.rtc = 0; + break; + case X86_SUBARCH_INTEL_MID: + case X86_SUBARCH_CE4100: + x86_platform.legacy.devices.pnpbios = 0; + x86_platform.legacy.rtc = 0; + x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; + break; + } + + if (x86_platform.set_legacy_features) + x86_platform.set_legacy_features(); +} + +bool __init x86_pnpbios_disabled(void) +{ + return x86_platform.legacy.devices.pnpbios == 0; +} + +#if defined(CONFIG_PNPBIOS) +bool __init arch_pnpbios_disabled(void) +{ + return x86_pnpbios_disabled(); +} +#endif diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c new file mode 100644 index 000000000..23154d24b --- /dev/null +++ b/arch/x86/kernel/pmem.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2015, Christoph Hellwig. + * Copyright (c) 2015, Intel Corporation. + */ +#include +#include +#include + +static int found(struct resource *res, void *data) +{ + return 1; +} + +static __init int register_e820_pmem(void) +{ + struct platform_device *pdev; + int rc; + + rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY, + IORESOURCE_MEM, 0, -1, NULL, found); + if (rc <= 0) + return 0; + + /* + * See drivers/nvdimm/e820.c for the implementation, this is + * simply here to trigger the module to load on demand. + */ + pdev = platform_device_alloc("e820_pmem", -1); + + rc = platform_device_add(pdev); + if (rc) + platform_device_put(pdev); + + return rc; +} +device_initcall(register_e820_pmem); diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c new file mode 100644 index 000000000..9e1def374 --- /dev/null +++ b/arch/x86/kernel/probe_roms.c @@ -0,0 +1,269 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +/* does this oprom support the given pci device, or any of the devices + * that the driver supports? + */ +static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) +{ + struct pci_driver *drv = pdev->driver; + const struct pci_device_id *id; + + if (pdev->vendor == vendor && pdev->device == device) + return true; + + for (id = drv ? drv->id_table : NULL; id && id->vendor; id++) + if (id->vendor == vendor && id->device == device) + break; + + return id && id->vendor; +} + +static bool probe_list(struct pci_dev *pdev, unsigned short vendor, + const void *rom_list) +{ + unsigned short device; + + do { + if (get_kernel_nofault(device, rom_list) != 0) + device = 0; + + if (device && match_id(pdev, vendor, device)) + break; + + rom_list += 2; + } while (device); + + return !!device; +} + +static struct resource *find_oprom(struct pci_dev *pdev) +{ + struct resource *oprom = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { + struct resource *res = &adapter_rom_resources[i]; + unsigned short offset, vendor, device, list, rev; + const void *rom; + + if (res->end == 0) + break; + + rom = isa_bus_to_virt(res->start); + if (get_kernel_nofault(offset, rom + 0x18) != 0) + continue; + + if (get_kernel_nofault(vendor, rom + offset + 0x4) != 0) + continue; + + if (get_kernel_nofault(device, rom + offset + 0x6) != 0) + continue; + + if (match_id(pdev, vendor, device)) { + oprom = res; + break; + } + + if (get_kernel_nofault(list, rom + offset + 0x8) == 0 && + get_kernel_nofault(rev, rom + offset + 0xc) == 0 && + rev >= 3 && list && + probe_list(pdev, vendor, rom + offset + list)) { + oprom = res; + break; + } + } + + return oprom; +} + +void __iomem *pci_map_biosrom(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + if (!oprom) + return NULL; + + return ioremap(oprom->start, resource_size(oprom)); +} +EXPORT_SYMBOL(pci_map_biosrom); + +void pci_unmap_biosrom(void __iomem *image) +{ + iounmap(image); +} +EXPORT_SYMBOL(pci_unmap_biosrom); + +size_t pci_biosrom_size(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + return oprom ? resource_size(oprom) : 0; +} +EXPORT_SYMBOL(pci_biosrom_size); + +#define ROMSIGNATURE 0xaa55 + +static int __init romsignature(const unsigned char *rom) +{ + const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig; + + return get_kernel_nofault(sig, ptr) == 0 && sig == ROMSIGNATURE; +} + +static int __init romchecksum(const unsigned char *rom, unsigned long length) +{ + unsigned char sum, c; + + for (sum = 0; length && get_kernel_nofault(c, rom++) == 0; length--) + sum += c; + return !length && !sum; +} + +void __init probe_roms(void) +{ + const unsigned char *rom; + unsigned long start, length, upper; + unsigned char c; + int i; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + if (get_kernel_nofault(c, rom + 2) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) */ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = resource_size(&extension_rom_resource); + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (get_kernel_nofault(c, rom + 2) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c new file mode 100644 index 000000000..1cba09a9f --- /dev/null +++ b/arch/x86/kernel/process.c @@ -0,0 +1,1005 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "process.h" + +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data..cacheline_aligned + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { + .x86_tss = { + /* + * .sp0 is only used when entering ring 0 from a lower + * privilege level. Since the init task never runs anything + * but ring 0 code, there is no need for a valid value here. + * Poison it. + */ + .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, + + /* + * .sp1 is cpu_current_top_of_stack. The init task never + * runs user code, but cpu_current_top_of_stack should still + * be well defined before the first context switch. + */ + .sp1 = TOP_OF_INIT_STACK, + +#ifdef CONFIG_X86_32 + .ss0 = __KERNEL_DS, + .ss1 = __KERNEL_CS, +#endif + .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, + }, +}; +EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); + +DEFINE_PER_CPU(bool, __tss_limit_invalid); +EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); + +/* + * this gets called so that we can store lazy state into memory and copy the + * current task into the new thread. + */ +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) +{ + memcpy(dst, src, arch_task_struct_size); +#ifdef CONFIG_VM86 + dst->thread.vm86 = NULL; +#endif + + return fpu__copy(dst, src); +} + +/* + * Free thread data structures etc.. + */ +void exit_thread(struct task_struct *tsk) +{ + struct thread_struct *t = &tsk->thread; + struct fpu *fpu = &t->fpu; + + if (test_thread_flag(TIF_IO_BITMAP)) + io_bitmap_exit(tsk); + + free_vm86(t); + + fpu__drop(fpu); +} + +static int set_new_tls(struct task_struct *p, unsigned long tls) +{ + struct user_desc __user *utls = (struct user_desc __user *)tls; + + if (in_ia32_syscall()) + return do_set_thread_area(p, -1, utls, 0); + else + return do_set_thread_area_64(p, ARCH_SET_FS, tls); +} + +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) +{ + struct inactive_task_frame *frame; + struct fork_frame *fork_frame; + struct pt_regs *childregs; + int ret = 0; + + childregs = task_pt_regs(p); + fork_frame = container_of(childregs, struct fork_frame, regs); + frame = &fork_frame->frame; + + frame->bp = encode_frame_pointer(childregs); + frame->ret_addr = (unsigned long) ret_from_fork; + p->thread.sp = (unsigned long) fork_frame; + p->thread.io_bitmap = NULL; + p->thread.iopl_warn = 0; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + +#ifdef CONFIG_X86_64 + current_save_fsgs(); + p->thread.fsindex = current->thread.fsindex; + p->thread.fsbase = current->thread.fsbase; + p->thread.gsindex = current->thread.gsindex; + p->thread.gsbase = current->thread.gsbase; + + savesegment(es, p->thread.es); + savesegment(ds, p->thread.ds); +#else + p->thread.sp0 = (unsigned long) (childregs + 1); + /* + * Clear all status flags including IF and set fixed bit. 64bit + * does not have this initialization as the frame does not contain + * flags. The flags consistency (especially vs. AC) is there + * ensured via objtool, which lacks 32bit support. + */ + frame->flags = X86_EFLAGS_FIXED; +#endif + + /* Kernel thread ? */ + if (unlikely(p->flags & PF_KTHREAD)) { + memset(childregs, 0, sizeof(struct pt_regs)); + kthread_frame_init(frame, sp, arg); + return 0; + } + + frame->bx = 0; + *childregs = *current_pt_regs(); + childregs->ax = 0; + if (sp) + childregs->sp = sp; + +#ifdef CONFIG_X86_32 + task_user_gs(p) = get_user_gs(current_pt_regs()); +#endif + + if (unlikely(p->flags & PF_IO_WORKER)) { + /* + * An IO thread is a user space thread, but it doesn't + * return to ret_after_fork(). + * + * In order to indicate that to tools like gdb, + * we reset the stack and instruction pointers. + * + * It does the same kernel frame setup to return to a kernel + * function that a kernel thread does. + */ + childregs->sp = 0; + childregs->ip = 0; + kthread_frame_init(frame, sp, arg); + return 0; + } + + /* Set a new TLS for the child thread? */ + if (clone_flags & CLONE_SETTLS) + ret = set_new_tls(p, tls); + + if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) + io_bitmap_share(p); + + return ret; +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + + flush_ptrace_hw_breakpoint(tsk); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + + fpu__clear_all(&tsk->thread.fpu); +} + +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + cr4_set_bits(X86_CR4_TSD); + preempt_enable(); +} + +static void enable_TSC(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + cr4_clear_bits(X86_CR4_TSD); + preempt_enable(); +} + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (test_thread_flag(TIF_NOTSC)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (val == PR_TSC_SIGSEGV) + disable_TSC(); + else if (val == PR_TSC_ENABLE) + enable_TSC(); + else + return -EINVAL; + + return 0; +} + +DEFINE_PER_CPU(u64, msr_misc_features_shadow); + +static void set_cpuid_faulting(bool on) +{ + u64 msrval; + + msrval = this_cpu_read(msr_misc_features_shadow); + msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; + msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); + this_cpu_write(msr_misc_features_shadow, msrval); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); +} + +static void disable_cpuid(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(true); + } + preempt_enable(); +} + +static void enable_cpuid(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(false); + } + preempt_enable(); +} + +static int get_cpuid_mode(void) +{ + return !test_thread_flag(TIF_NOCPUID); +} + +static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +{ + if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) + return -ENODEV; + + if (cpuid_enabled) + enable_cpuid(); + else + disable_cpuid(); + + return 0; +} + +/* + * Called immediately after a successful exec. + */ +void arch_setup_new_exec(void) +{ + /* If cpuid was previously disabled for this task, re-enable it. */ + if (test_thread_flag(TIF_NOCPUID)) + enable_cpuid(); + + /* + * Don't inherit TIF_SSBD across exec boundary when + * PR_SPEC_DISABLE_NOEXEC is used. + */ + if (test_thread_flag(TIF_SSBD) && + task_spec_ssb_noexec(current)) { + clear_thread_flag(TIF_SSBD); + task_clear_spec_ssb_disable(current); + task_clear_spec_ssb_noexec(current); + speculation_ctrl_update(task_thread_info(current)->flags); + } +} + +#ifdef CONFIG_X86_IOPL_IOPERM +static inline void switch_to_bitmap(unsigned long tifp) +{ + /* + * Invalidate I/O bitmap if the previous task used it. This prevents + * any possible leakage of an active I/O bitmap. + * + * If the next task has an I/O bitmap it will handle it on exit to + * user mode. + */ + if (tifp & _TIF_IO_BITMAP) + tss_invalidate_io_bitmap(); +} + +static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) +{ + /* + * Copy at least the byte range of the incoming tasks bitmap which + * covers the permitted I/O ports. + * + * If the previous task which used an I/O bitmap had more bits + * permitted, then the copy needs to cover those as well so they + * get turned off. + */ + memcpy(tss->io_bitmap.bitmap, iobm->bitmap, + max(tss->io_bitmap.prev_max, iobm->max)); + + /* + * Store the new max and the sequence number of this bitmap + * and a pointer to the bitmap itself. + */ + tss->io_bitmap.prev_max = iobm->max; + tss->io_bitmap.prev_sequence = iobm->sequence; +} + +/** + * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode + */ +void native_tss_update_io_bitmap(void) +{ + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + struct thread_struct *t = ¤t->thread; + u16 *base = &tss->x86_tss.io_bitmap_base; + + if (!test_thread_flag(TIF_IO_BITMAP)) { + native_tss_invalidate_io_bitmap(); + return; + } + + if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { + *base = IO_BITMAP_OFFSET_VALID_ALL; + } else { + struct io_bitmap *iobm = t->io_bitmap; + + /* + * Only copy bitmap data when the sequence number differs. The + * update time is accounted to the incoming task. + */ + if (tss->io_bitmap.prev_sequence != iobm->sequence) + tss_copy_io_bitmap(tss, iobm); + + /* Enable the bitmap */ + *base = IO_BITMAP_OFFSET_VALID_MAP; + } + + /* + * Make sure that the TSS limit is covering the IO bitmap. It might have + * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O + * access from user space to trigger a #GP because tbe bitmap is outside + * the TSS limit. + */ + refresh_tss_limit(); +} +#else /* CONFIG_X86_IOPL_IOPERM */ +static inline void switch_to_bitmap(unsigned long tifp) { } +#endif + +#ifdef CONFIG_SMP + +struct ssb_state { + struct ssb_state *shared_state; + raw_spinlock_t lock; + unsigned int disable_state; + unsigned long local_state; +}; + +#define LSTATE_SSB 0 + +static DEFINE_PER_CPU(struct ssb_state, ssb_state); + +void speculative_store_bypass_ht_init(void) +{ + struct ssb_state *st = this_cpu_ptr(&ssb_state); + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + st->local_state = 0; + + /* + * Shared state setup happens once on the first bringup + * of the CPU. It's not destroyed on CPU hotunplug. + */ + if (st->shared_state) + return; + + raw_spin_lock_init(&st->lock); + + /* + * Go over HT siblings and check whether one of them has set up the + * shared state pointer already. + */ + for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) { + if (cpu == this_cpu) + continue; + + if (!per_cpu(ssb_state, cpu).shared_state) + continue; + + /* Link it to the state of the sibling: */ + st->shared_state = per_cpu(ssb_state, cpu).shared_state; + return; + } + + /* + * First HT sibling to come up on the core. Link shared state of + * the first HT sibling to itself. The siblings on the same core + * which come up later will see the shared state pointer and link + * themself to the state of this CPU. + */ + st->shared_state = st; +} + +/* + * Logic is: First HT sibling enables SSBD for both siblings in the core + * and last sibling to disable it, disables it for the whole core. This how + * MSR_SPEC_CTRL works in "hardware": + * + * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL + */ +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ + struct ssb_state *st = this_cpu_ptr(&ssb_state); + u64 msr = x86_amd_ls_cfg_base; + + if (!static_cpu_has(X86_FEATURE_ZEN)) { + msr |= ssbd_tif_to_amd_ls_cfg(tifn); + wrmsrl(MSR_AMD64_LS_CFG, msr); + return; + } + + if (tifn & _TIF_SSBD) { + /* + * Since this can race with prctl(), block reentry on the + * same CPU. + */ + if (__test_and_set_bit(LSTATE_SSB, &st->local_state)) + return; + + msr |= x86_amd_ls_cfg_ssbd_mask; + + raw_spin_lock(&st->shared_state->lock); + /* First sibling enables SSBD: */ + if (!st->shared_state->disable_state) + wrmsrl(MSR_AMD64_LS_CFG, msr); + st->shared_state->disable_state++; + raw_spin_unlock(&st->shared_state->lock); + } else { + if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state)) + return; + + raw_spin_lock(&st->shared_state->lock); + st->shared_state->disable_state--; + if (!st->shared_state->disable_state) + wrmsrl(MSR_AMD64_LS_CFG, msr); + raw_spin_unlock(&st->shared_state->lock); + } +} +#else +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ + u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); + + wrmsrl(MSR_AMD64_LS_CFG, msr); +} +#endif + +static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) +{ + /* + * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, + * so ssbd_tif_to_spec_ctrl() just works. + */ + wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); +} + +/* + * Update the MSRs managing speculation control, during context switch. + * + * tifp: Previous task's thread flags + * tifn: Next task's thread flags + */ +static __always_inline void __speculation_ctrl_update(unsigned long tifp, + unsigned long tifn) +{ + unsigned long tif_diff = tifp ^ tifn; + u64 msr = x86_spec_ctrl_base; + bool updmsr = false; + + lockdep_assert_irqs_disabled(); + + /* Handle change of TIF_SSBD depending on the mitigation method. */ + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { + if (tif_diff & _TIF_SSBD) + amd_set_ssb_virt_state(tifn); + } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { + if (tif_diff & _TIF_SSBD) + amd_set_core_ssb_state(tifn); + } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + static_cpu_has(X86_FEATURE_AMD_SSBD)) { + updmsr |= !!(tif_diff & _TIF_SSBD); + msr |= ssbd_tif_to_spec_ctrl(tifn); + } + + /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */ + if (IS_ENABLED(CONFIG_SMP) && + static_branch_unlikely(&switch_to_cond_stibp)) { + updmsr |= !!(tif_diff & _TIF_SPEC_IB); + msr |= stibp_tif_to_spec_ctrl(tifn); + } + + if (updmsr) + update_spec_ctrl_cond(msr); +} + +static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) +{ + if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) { + if (task_spec_ssb_disable(tsk)) + set_tsk_thread_flag(tsk, TIF_SSBD); + else + clear_tsk_thread_flag(tsk, TIF_SSBD); + + if (task_spec_ib_disable(tsk)) + set_tsk_thread_flag(tsk, TIF_SPEC_IB); + else + clear_tsk_thread_flag(tsk, TIF_SPEC_IB); + } + /* Return the updated threadinfo flags*/ + return task_thread_info(tsk)->flags; +} + +void speculation_ctrl_update(unsigned long tif) +{ + unsigned long flags; + + /* Forced update. Make sure all relevant TIF flags are different */ + local_irq_save(flags); + __speculation_ctrl_update(~tif, tif); + local_irq_restore(flags); +} + +/* Called from seccomp/prctl update */ +void speculation_ctrl_update_current(void) +{ + preempt_disable(); + speculation_ctrl_update(speculation_ctrl_update_tif(current)); + preempt_enable(); +} + +static inline void cr4_toggle_bits_irqsoff(unsigned long mask) +{ + unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + + newval = cr4 ^ mask; + if (newval != cr4) { + this_cpu_write(cpu_tlbstate.cr4, newval); + __write_cr4(newval); + } +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) +{ + unsigned long tifp, tifn; + + tifn = READ_ONCE(task_thread_info(next_p)->flags); + tifp = READ_ONCE(task_thread_info(prev_p)->flags); + + switch_to_bitmap(tifp); + + propagate_user_return_notify(prev_p, next_p); + + if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && + arch_has_block_step()) { + unsigned long debugctl, msk; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~DEBUGCTLMSR_BTF; + msk = tifn & _TIF_BLOCKSTEP; + debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + } + + if ((tifp ^ tifn) & _TIF_NOTSC) + cr4_toggle_bits_irqsoff(X86_CR4_TSD); + + if ((tifp ^ tifn) & _TIF_NOCPUID) + set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); + + if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) { + __speculation_ctrl_update(tifp, tifn); + } else { + speculation_ctrl_update_tif(prev_p); + tifn = speculation_ctrl_update_tif(next_p); + + /* Enforce MSR update to ensure consistent state */ + __speculation_ctrl_update(~tifn, tifn); + } + + if ((tifp ^ tifn) & _TIF_SLD) + switch_to_sld(tifn); +} + +/* + * Idle related variables and functions + */ +unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; +EXPORT_SYMBOL(boot_option_idle_override); + +static void (*x86_idle)(void); + +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + +void arch_cpu_idle_enter(void) +{ + tsc_verify_tsc_adjust(false); + local_touch_nmi(); +} + +void arch_cpu_idle_dead(void) +{ + play_dead(); +} + +/* + * Called from the generic idle code. + */ +void arch_cpu_idle(void) +{ + x86_idle(); +} + +/* + * We use this if we don't have any better idle routine.. + */ +void __cpuidle default_idle(void) +{ + raw_safe_halt(); +} +#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) +EXPORT_SYMBOL(default_idle); +#endif + +#ifdef CONFIG_XEN +bool xen_set_default_idle(void) +{ + bool ret = !!x86_idle; + + x86_idle = default_idle; + + return ret; +} +#endif + +void __noreturn stop_this_cpu(void *dummy) +{ + local_irq_disable(); + /* + * Remove this CPU: + */ + set_cpu_online(smp_processor_id(), false); + disable_local_APIC(); + mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); + + /* + * Use wbinvd on processors that support SME. This provides support + * for performing a successful kexec when going from SME inactive + * to SME active (or vice-versa). The cache must be cleared so that + * if there are entries with the same physical address, both with and + * without the encryption bit, they don't race each other when flushed + * and potentially end up with the wrong entry being committed to + * memory. + */ + if (boot_cpu_has(X86_FEATURE_SME)) + native_wbinvd(); + for (;;) { + /* + * Use native_halt() so that memory contents don't change + * (stack usage and variables) after possibly issuing the + * native_wbinvd() above. + */ + native_halt(); + } +} + +/* + * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power + * states (local apic timer and TSC stop). + * + * XXX this function is completely buggered vs RCU and tracing. + */ +static void amd_e400_idle(void) +{ + /* + * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E + * gets set after static_cpu_has() places have been converted via + * alternatives. + */ + if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { + default_idle(); + return; + } + + tick_broadcast_enter(); + + default_idle(); + + /* + * The switch back from broadcast mode needs to be called with + * interrupts disabled. + */ + raw_local_irq_disable(); + tick_broadcast_exit(); + raw_local_irq_enable(); +} + +/* + * Intel Core2 and older machines prefer MWAIT over HALT for C1. + * We can't rely on cpuidle installing MWAIT, because it will not load + * on systems that support only C1 -- so the boot default must be MWAIT. + * + * Some AMD machines are the opposite, they depend on using HALT. + * + * So for default C1, which is used during boot until cpuidle loads, + * use MWAIT-C1 on Intel HW that has it, else use HALT. + */ +static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) +{ + /* User has disallowed the use of MWAIT. Fallback to HALT */ + if (boot_option_idle_override == IDLE_NOMWAIT) + return 0; + + if (c->x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR)) + return 0; + + return 1; +} + +/* + * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT + * with interrupts enabled and no flags, which is backwards compatible with the + * original MWAIT implementation. + */ +static __cpuidle void mwait_idle(void) +{ + if (!current_set_polling_and_test()) { + if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { + mb(); /* quirk */ + clflush((void *)¤t_thread_info()->flags); + mb(); /* quirk */ + } + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + if (!need_resched()) + __sti_mwait(0, 0); + else + raw_local_irq_enable(); + } else { + raw_local_irq_enable(); + } + __current_clr_polling(); +} + +void select_idle_routine(const struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1) + pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); +#endif + if (x86_idle || boot_option_idle_override == IDLE_POLL) + return; + + if (boot_cpu_has_bug(X86_BUG_AMD_E400)) { + pr_info("using AMD E400 aware idle routine\n"); + x86_idle = amd_e400_idle; + } else if (prefer_mwait_c1_over_halt(c)) { + pr_info("using mwait in idle threads\n"); + x86_idle = mwait_idle; + } else + x86_idle = default_idle; +} + +void amd_e400_c1e_apic_setup(void) +{ + if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { + pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id()); + local_irq_disable(); + tick_broadcast_force(); + local_irq_enable(); + } +} + +void __init arch_post_acpi_subsys_init(void) +{ + u32 lo, hi; + + if (!boot_cpu_has_bug(X86_BUG_AMD_E400)) + return; + + /* + * AMD E400 detection needs to happen after ACPI has been enabled. If + * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in + * MSR_K8_INT_PENDING_MSG. + */ + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (!(lo & K8_INTP_C1E_ACTIVE_MASK)) + return; + + boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E); + + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + pr_info("System has AMD C1E enabled\n"); +} + +static int __init idle_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "poll")) { + pr_info("using polling idle threads\n"); + boot_option_idle_override = IDLE_POLL; + cpu_idle_poll_ctrl(true); + } else if (!strcmp(str, "halt")) { + /* + * When the boot option of idle=halt is added, halt is + * forced to be used for CPU idle. In such case CPU C2/C3 + * won't be used again. + * To continue to load the CPU idle driver, don't touch + * the boot_option_idle_override. + */ + x86_idle = default_idle; + boot_option_idle_override = IDLE_HALT; + } else if (!strcmp(str, "nomwait")) { + /* + * If the boot option of "idle=nomwait" is added, + * it means that mwait will be disabled for CPU C1/C2/C3 + * states. + */ + boot_option_idle_override = IDLE_NOMWAIT; + } else + return -1; + + return 0; +} +early_param("idle", idle_setup); + +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + return randomize_page(mm->brk, 0x02000000); +} + +/* + * Called from fs/proc with a reference on @p to find the function + * which called into schedule(). This needs to be done carefully + * because the task might wake up and we might look at a stack + * changing under us. + */ +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long start, bottom, top, sp, fp, ip, ret = 0; + int count = 0; + + if (p == current || p->state == TASK_RUNNING) + return 0; + + if (!try_get_task_stack(p)) + return 0; + + start = (unsigned long)task_stack_page(p); + if (!start) + goto out; + + /* + * Layout of the stack page: + * + * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long) + * PADDING + * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING + * stack + * ----------- bottom = start + * + * The tasks stack pointer points at the location where the + * framepointer is stored. The data on the stack is: + * ... IP FP ... IP FP + * + * We need to read FP and IP, so we need to adjust the upper + * bound by another unsigned long. + */ + top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; + top -= 2 * sizeof(unsigned long); + bottom = start; + + sp = READ_ONCE(p->thread.sp); + if (sp < bottom || sp > top) + goto out; + + fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp); + do { + if (fp < bottom || fp > top) + goto out; + ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); + if (!in_sched_functions(ip)) { + ret = ip; + goto out; + } + fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); + } while (count++ < 16 && p->state != TASK_RUNNING); + +out: + put_task_stack(p); + return ret; +} + +long do_arch_prctl_common(struct task_struct *task, int option, + unsigned long cpuid_enabled) +{ + switch (option) { + case ARCH_GET_CPUID: + return get_cpuid_mode(); + case ARCH_SET_CPUID: + return set_cpuid_mode(task, cpuid_enabled); + } + + return -EINVAL; +} diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h new file mode 100644 index 000000000..1d0797b23 --- /dev/null +++ b/arch/x86/kernel/process.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// +// Code shared between 32 and 64 bit + +#include + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); + +/* + * This needs to be inline to optimize for the common case where no extra + * work needs to be done. + */ +static inline void switch_to_extra(struct task_struct *prev, + struct task_struct *next) +{ + unsigned long next_tif = task_thread_info(next)->flags; + unsigned long prev_tif = task_thread_info(prev)->flags; + + if (IS_ENABLED(CONFIG_SMP)) { + /* + * Avoid __switch_to_xtra() invocation when conditional + * STIBP is disabled and the only different bit is + * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not + * in the TIF_WORK_CTXSW masks. + */ + if (!static_branch_likely(&switch_to_cond_stibp)) { + prev_tif &= ~_TIF_SPEC_IB; + next_tif &= ~_TIF_SPEC_IB; + } + } + + /* + * __switch_to_xtra() handles debug registers, i/o bitmaps, + * speculation mitigations etc. + */ + if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT || + prev_tif & _TIF_WORK_CTXSW_PREV)) + __switch_to_xtra(prev, next); +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c new file mode 100644 index 000000000..3b4c394a1 --- /dev/null +++ b/arch/x86/kernel/process_32.c @@ -0,0 +1,225 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "process.h" + +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, + const char *log_lvl) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + unsigned long d0, d1, d2, d3, d6, d7; + unsigned short gs; + + if (user_mode(regs)) + gs = get_user_gs(regs); + else + savesegment(gs, gs); + + show_ip(regs, log_lvl); + + printk("%sEAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + log_lvl, regs->ax, regs->bx, regs->cx, regs->dx); + printk("%sESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + log_lvl, regs->si, regs->di, regs->bp, regs->sp); + printk("%sDS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", + log_lvl, (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags); + + if (mode != SHOW_REGS_ALL) + return; + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = __read_cr3(); + cr4 = __read_cr4(); + printk("%sCR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + log_lvl, cr0, cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + + /* Only print out debug registers if they are in their non-default state. */ + if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && + (d6 == DR6_RESERVED) && (d7 == 0x400)) + return; + + printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + log_lvl, d0, d1, d2, d3); + printk("%sDR6: %08lx DR7: %08lx\n", + log_lvl, d6, d7); +} + +void release_thread(struct task_struct *dead_task) +{ + BUG_ON(dead_task->mm); + release_vm86_irqs(dead_task); +} + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + set_user_gs(regs, 0); + regs->fs = 0; + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; + regs->ip = new_ip; + regs->sp = new_sp; + regs->flags = X86_EFLAGS_IF; +} +EXPORT_SYMBOL_GPL(start_thread); + + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * We fsave/fwait so that an exception goes off at the right time + * (as a call from the fsave or fwait in effect) rather than to + * the wrong process. Lazy FP saving no longer makes any sense + * with modern CPU's, and this simplifies a lot of things (SMP + * and UP become the same). + * + * NOTE! We used to use the x86 hardware context switching. The + * reason for not using it any more becomes apparent when you + * try to recover gracefully from saved state that is no longer + * valid (stale segment register values in particular). With the + * hardware task-switch, there is no way to fix up bad state in + * a reasonable manner. + * + * The fact that Intel documents the hardware task-switching to + * be slow is a fairly red herring - this code is not noticeably + * faster. However, there _is_ some room for improvement here, + * so the performance issues may eventually be a valid point. + * More important, however, is the fact that this allows us much + * more flexibility. + * + * The return value (in %ax) will be the "prev" task after + * the task-switch, and shows up in ret_from_fork in entry.S, + * for example. + */ +__visible __notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + int cpu = smp_processor_id(); + + /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_p, cpu); + + /* + * Save away %gs. No need to save %fs, as it was saved on the + * stack on entry. No need to save %es and %ds, as those are + * always kernel segments while inside the kernel. Doing this + * before setting the new TLS descriptors avoids the situation + * where we temporarily have non-reloadable segments in %fs + * and %gs. This could be an issue if the NMI handler ever + * used %fs or %gs (it does not today), or if the kernel is + * running inside of a hypervisor layer. + */ + lazy_save_gs(prev->gs); + + /* + * Load the per-thread Thread-Local Storage descriptor. + */ + load_TLS(next, cpu); + + switch_to_extra(prev_p, next_p); + + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated. + */ + arch_end_context_switch(next_p); + + /* + * Reload esp0 and cpu_current_top_of_stack. This changes + * current_thread_info(). Refresh the SYSENTER configuration in + * case prev or next is vm86. + */ + update_task_stack(next_p); + refresh_sysenter_cs(next); + this_cpu_write(cpu_current_top_of_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE); + + /* + * Restore %gs if needed (which is common) + */ + if (prev->gs | next->gs) + lazy_load_gs(next->gs); + + this_cpu_write(current_task, next_p); + + switch_fpu_finish(next_p); + + /* Load the Intel cache allocation PQR MSR. */ + resctrl_sched_in(next_p); + + return prev_p; +} + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(current, option, arg2); +} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c new file mode 100644 index 000000000..1d8bc4736 --- /dev/null +++ b/arch/x86/kernel/process_64.c @@ -0,0 +1,837 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + * + * X86-64 port + * Andi Kleen. + * + * CPU hotplug support - ashok.raj@intel.com + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_IA32_EMULATION +/* Not included via unistd.h */ +#include +#endif + +#include "process.h" + +/* Prints also some state that isn't saved in the pt_regs */ +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, + const char *log_lvl) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; + unsigned long d0, d1, d2, d3, d6, d7; + unsigned int fsindex, gsindex; + unsigned int ds, es; + + show_iret_regs(regs, log_lvl); + + if (regs->orig_ax != -1) + pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); + else + pr_cont("\n"); + + printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n", + log_lvl, regs->ax, regs->bx, regs->cx); + printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n", + log_lvl, regs->dx, regs->si, regs->di); + printk("%sRBP: %016lx R08: %016lx R09: %016lx\n", + log_lvl, regs->bp, regs->r8, regs->r9); + printk("%sR10: %016lx R11: %016lx R12: %016lx\n", + log_lvl, regs->r10, regs->r11, regs->r12); + printk("%sR13: %016lx R14: %016lx R15: %016lx\n", + log_lvl, regs->r13, regs->r14, regs->r15); + + if (mode == SHOW_REGS_SHORT) + return; + + if (mode == SHOW_REGS_USER) { + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + printk("%sFS: %016lx GS: %016lx\n", + log_lvl, fs, shadowgs); + return; + } + + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%es,%0" : "=r" (es)); + asm("movl %%fs,%0" : "=r" (fsindex)); + asm("movl %%gs,%0" : "=r" (gsindex)); + + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = __read_cr3(); + cr4 = __read_cr4(); + + printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + log_lvl, fs, fsindex, gs, gsindex, shadowgs); + printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n", + log_lvl, regs->cs, ds, es, cr0); + printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", + log_lvl, cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + + /* Only print out debug registers if they are in their non-default state. */ + if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && + (d6 == DR6_RESERVED) && (d7 == 0x400))) { + printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", + log_lvl, d0, d1, d2); + printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", + log_lvl, d3, d6, d7); + } + + if (boot_cpu_has(X86_FEATURE_OSPKE)) + printk("%sPKRU: %08x\n", log_lvl, read_pkru()); +} + +void release_thread(struct task_struct *dead_task) +{ + WARN_ON(dead_task->mm); +} + +enum which_selector { + FS, + GS +}; + +/* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. When paravirt support is needed, it + * needs to be renamed with native_ prefix. + */ +static noinstr unsigned long __rdgsbase_inactive(void) +{ + unsigned long gsbase; + + lockdep_assert_irqs_disabled(); + + if (!static_cpu_has(X86_FEATURE_XENPV)) { + native_swapgs(); + gsbase = rdgsbase(); + native_swapgs(); + } else { + instrumentation_begin(); + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } + + return gsbase; +} + +/* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. When paravirt support is needed, it + * needs to be renamed with native_ prefix. + */ +static noinstr void __wrgsbase_inactive(unsigned long gsbase) +{ + lockdep_assert_irqs_disabled(); + + if (!static_cpu_has(X86_FEATURE_XENPV)) { + native_swapgs(); + wrgsbase(gsbase); + native_swapgs(); + } else { + instrumentation_begin(); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } +} + +/* + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are + * not available. The goal is to be reasonably fast on non-FSGSBASE systems. + * It's forcibly inlined because it'll generate better code and this function + * is hot. + */ +static __always_inline void save_base_legacy(struct task_struct *prev_p, + unsigned short selector, + enum which_selector which) +{ + if (likely(selector == 0)) { + /* + * On Intel (without X86_BUG_NULL_SEG), the segment base could + * be the pre-existing saved base or it could be zero. On AMD + * (with X86_BUG_NULL_SEG), the segment base could be almost + * anything. + * + * This branch is very hot (it's hit twice on almost every + * context switch between 64-bit programs), and avoiding + * the RDMSR helps a lot, so we just assume that whatever + * value is already saved is correct. This matches historical + * Linux behavior, so it won't break existing applications. + * + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we + * report that the base is zero, it needs to actually be zero: + * see the corresponding logic in load_seg_legacy. + */ + } else { + /* + * If the selector is 1, 2, or 3, then the base is zero on + * !X86_BUG_NULL_SEG CPUs and could be anything on + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux + * has never attempted to preserve the base across context + * switches. + * + * If selector > 3, then it refers to a real segment, and + * saving the base isn't necessary. + */ + if (which == FS) + prev_p->thread.fsbase = 0; + else + prev_p->thread.gsbase = 0; + } +} + +static __always_inline void save_fsgs(struct task_struct *task) +{ + savesegment(fs, task->thread.fsindex); + savesegment(gs, task->thread.gsindex); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* + * If FSGSBASE is enabled, we can't make any useful guesses + * about the base, and user code expects us to save the current + * value. Fortunately, reading the base directly is efficient. + */ + task->thread.fsbase = rdfsbase(); + task->thread.gsbase = __rdgsbase_inactive(); + } else { + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); + } +} + +/* + * While a process is running,current->thread.fsbase and current->thread.gsbase + * may not match the corresponding CPU registers (see save_base_legacy()). + */ +void current_save_fsgs(void) +{ + unsigned long flags; + + /* Interrupts need to be off for FSGSBASE */ + local_irq_save(flags); + save_fsgs(current); + local_irq_restore(flags); +} +#if IS_ENABLED(CONFIG_KVM) +EXPORT_SYMBOL_GPL(current_save_fsgs); +#endif + +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) +{ + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) +{ + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. + */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + +static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, + struct thread_struct *next) +{ + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* Update the FS and GS selectors if they could have changed. */ + if (unlikely(prev->fsindex || next->fsindex)) + loadseg(FS, next->fsindex); + if (unlikely(prev->gsindex || next->gsindex)) + loadseg(GS, next->gsindex); + + /* Update the bases. */ + wrfsbase(next->fsbase); + __wrgsbase_inactive(next->gsbase); + } else { + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); + } +} + +unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector) +{ + unsigned short idx = selector >> 3; + unsigned long base; + + if (likely((selector & SEGMENT_TI_MASK) == 0)) { + if (unlikely(idx >= GDT_ENTRIES)) + return 0; + + /* + * There are no user segments in the GDT with nonzero bases + * other than the TLS segments. + */ + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return 0; + + idx -= GDT_ENTRY_TLS_MIN; + base = get_desc_base(&task->thread.tls_array[idx]); + } else { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + + /* + * If performance here mattered, we could protect the LDT + * with RCU. This is a slow path, though, so we can just + * take the mutex. + */ + mutex_lock(&task->mm->context.lock); + ldt = task->mm->context.ldt; + if (unlikely(!ldt || idx >= ldt->nr_entries)) + base = 0; + else + base = get_desc_base(ldt->entries + idx); + mutex_unlock(&task->mm->context.lock); +#else + base = 0; +#endif + } + + return base; +} + +unsigned long x86_gsbase_read_cpu_inactive(void) +{ + unsigned long gsbase; + + if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + gsbase = __rdgsbase_inactive(); + local_irq_restore(flags); + } else { + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + } + + return gsbase; +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + __wrgsbase_inactive(gsbase); + local_irq_restore(flags); + } else { + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + } +} + +unsigned long x86_fsbase_read_task(struct task_struct *task) +{ + unsigned long fsbase; + + if (task == current) + fsbase = x86_fsbase_read_cpu(); + else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.fsindex == 0)) + fsbase = task->thread.fsbase; + else + fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); + + return fsbase; +} + +unsigned long x86_gsbase_read_task(struct task_struct *task) +{ + unsigned long gsbase; + + if (task == current) + gsbase = x86_gsbase_read_cpu_inactive(); + else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.gsindex == 0)) + gsbase = task->thread.gsbase; + else + gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); + + return gsbase; +} + +void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase) +{ + WARN_ON_ONCE(task == current); + + task->thread.fsbase = fsbase; +} + +void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) +{ + WARN_ON_ONCE(task == current); + + task->thread.gsbase = gsbase; +} + +static void +start_thread_common(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp, + unsigned int _cs, unsigned int _ss, unsigned int _ds) +{ + WARN_ON_ONCE(regs != current_pt_regs()); + + if (static_cpu_has(X86_BUG_NULL_SEG)) { + /* Loading zero below won't clear the base. */ + loadsegment(fs, __USER_DS); + load_gs_index(__USER_DS); + } + + loadsegment(fs, 0); + loadsegment(es, _ds); + loadsegment(ds, _ds); + load_gs_index(0); + + regs->ip = new_ip; + regs->sp = new_sp; + regs->cs = _cs; + regs->ss = _ss; + regs->flags = X86_EFLAGS_IF; +} + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER_CS, __USER_DS, 0); +} +EXPORT_SYMBOL_GPL(start_thread); + +#ifdef CONFIG_COMPAT +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + test_thread_flag(TIF_X32) + ? __USER_CS : __USER32_CS, + __USER_DS, __USER_DS); +} +#endif + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * This could still be optimized: + * - fold all the options into a flag word and test it with a single test. + * - could test fs/gs bitsliced + * + * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. + */ +__visible __notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; + int cpu = smp_processor_id(); + + WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && + this_cpu_read(irq_count) != -1); + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_p, cpu); + + /* We must save %fs and %gs before load_TLS() because + * %fs and %gs may be cleared by load_TLS(). + * + * (e.g. xen_load_tls()) + */ + save_fsgs(prev_p); + + /* + * Load TLS before restoring any segments so that segment loads + * reference the correct GDT entries. + */ + load_TLS(next, cpu); + + /* + * Leave lazy mode, flushing any hypercalls made here. This + * must be done after loading TLS entries in the GDT but before + * loading segments that might reference them. + */ + arch_end_context_switch(next_p); + + /* Switch DS and ES. + * + * Reading them only returns the selectors, but writing them (if + * nonzero) loads the full descriptor from the GDT or LDT. The + * LDT for next is loaded in switch_mm, and the GDT is loaded + * above. + * + * We therefore need to write new values to the segment + * registers on every context switch unless both the new and old + * values are zero. + * + * Note that we don't need to do anything for CS and SS, as + * those are saved and restored as part of pt_regs. + */ + savesegment(es, prev->es); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + savesegment(ds, prev->ds); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + + x86_fsgsbase_load(prev, next); + + /* + * Switch the PDA and FPU contexts. + */ + this_cpu_write(current_task, next_p); + this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); + + switch_fpu_finish(next_p); + + /* Reload sp0. */ + update_task_stack(next_p); + + switch_to_extra(prev_p, next_p); + + if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { + /* + * AMD CPUs have a misfeature: SYSRET sets the SS selector but + * does not update the cached descriptor. As a result, if we + * do SYSRET while SS is NULL, we'll end up in user mode with + * SS apparently equal to __USER_DS but actually unusable. + * + * The straightforward workaround would be to fix it up just + * before SYSRET, but that would slow down the system call + * fast paths. Instead, we ensure that SS is never NULL in + * system call context. We do this by replacing NULL SS + * selectors at every context switch. SYSCALL sets up a valid + * SS, so the only way to get NULL is to re-enter the kernel + * from CPL 3 through an interrupt. Since that can't happen + * in the same task as a running syscall, we are guaranteed to + * context switch between every interrupt vector entry and a + * subsequent SYSRET. + * + * We read SS first because SS reads are much faster than + * writes. Out of caution, we force SS to __KERNEL_DS even if + * it previously had a different non-NULL value. + */ + unsigned short ss_sel; + savesegment(ss, ss_sel); + if (ss_sel != __KERNEL_DS) + loadsegment(ss, __KERNEL_DS); + } + + /* Load the Intel cache allocation PQR MSR. */ + resctrl_sched_in(next_p); + + return prev_p; +} + +void set_personality_64bit(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 64bit mode */ + clear_thread_flag(TIF_IA32); + clear_thread_flag(TIF_ADDR32); + clear_thread_flag(TIF_X32); + /* Pretend that this comes from a 64bit execve */ + task_pt_regs(current)->orig_ax = __NR_execve; + current_thread_info()->status &= ~TS_COMPAT; + + /* Ensure the corresponding mm is not marked. */ + if (current->mm) + current->mm->context.ia32_compat = 0; + + /* TBD: overwrites user setup. Should have two bits. + But 64bit processes have always behaved this way, + so it's not too bad. The main problem is just that + 32bit children are affected again. */ + current->personality &= ~READ_IMPLIES_EXEC; +} + +static void __set_personality_x32(void) +{ +#ifdef CONFIG_X86_X32 + clear_thread_flag(TIF_IA32); + set_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_X32; + current->personality &= ~READ_IMPLIES_EXEC; + /* + * in_32bit_syscall() uses the presence of the x32 syscall bit + * flag to determine compat status. The x86 mmap() code relies on + * the syscall bitness so set x32 syscall bit right here to make + * in_32bit_syscall() work during exec(). + * + * Pretend to come from a x32 execve. + */ + task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; + current_thread_info()->status &= ~TS_COMPAT; +#endif +} + +static void __set_personality_ia32(void) +{ +#ifdef CONFIG_IA32_EMULATION + set_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_IA32; + current->personality |= force_personality32; + /* Prepare the first "return" to user space */ + task_pt_regs(current)->orig_ax = __NR_ia32_execve; + current_thread_info()->status |= TS_COMPAT; +#endif +} + +void set_personality_ia32(bool x32) +{ + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_ADDR32); + + if (x32) + __set_personality_x32(); + else + __set_personality_ia32(); +} +EXPORT_SYMBOL_GPL(set_personality_ia32); + +#ifdef CONFIG_CHECKPOINT_RESTORE +static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) +{ + int ret; + + ret = map_vdso_once(image, addr); + if (ret) + return ret; + + return (long)image->size; +} +#endif + +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) +{ + int ret = 0; + + switch (option) { + case ARCH_SET_GS: { + if (unlikely(arg2 >= TASK_SIZE_MAX)) + return -EPERM; + + preempt_disable(); + /* + * ARCH_SET_GS has always overwritten the index + * and the base. Zero is the most sensible value + * to put in the index, and is the only value that + * makes any sense if FSGSBASE is unavailable. + */ + if (task == current) { + loadseg(GS, 0); + x86_gsbase_write_cpu_inactive(arg2); + + /* + * On non-FSGSBASE systems, save_base_legacy() expects + * that we also fill in thread.gsbase. + */ + task->thread.gsbase = arg2; + + } else { + task->thread.gsindex = 0; + x86_gsbase_write_task(task, arg2); + } + preempt_enable(); + break; + } + case ARCH_SET_FS: { + /* + * Not strictly needed for %fs, but do it for symmetry + * with %gs + */ + if (unlikely(arg2 >= TASK_SIZE_MAX)) + return -EPERM; + + preempt_disable(); + /* + * Set the selector to 0 for the same reason + * as %gs above. + */ + if (task == current) { + loadseg(FS, 0); + x86_fsbase_write_cpu(arg2); + + /* + * On non-FSGSBASE systems, save_base_legacy() expects + * that we also fill in thread.fsbase. + */ + task->thread.fsbase = arg2; + } else { + task->thread.fsindex = 0; + x86_fsbase_write_task(task, arg2); + } + preempt_enable(); + break; + } + case ARCH_GET_FS: { + unsigned long base = x86_fsbase_read_task(task); + + ret = put_user(base, (unsigned long __user *)arg2); + break; + } + case ARCH_GET_GS: { + unsigned long base = x86_gsbase_read_task(task); + + ret = put_user(base, (unsigned long __user *)arg2); + break; + } + +#ifdef CONFIG_CHECKPOINT_RESTORE +# ifdef CONFIG_X86_X32_ABI + case ARCH_MAP_VDSO_X32: + return prctl_map_vdso(&vdso_image_x32, arg2); +# endif +# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + case ARCH_MAP_VDSO_32: + return prctl_map_vdso(&vdso_image_32, arg2); +# endif + case ARCH_MAP_VDSO_64: + return prctl_map_vdso(&vdso_image_64, arg2); +#endif + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + long ret; + + ret = do_arch_prctl_64(current, option, arg2); + if (ret == -EINVAL) + ret = do_arch_prctl_common(current, option, arg2); + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION +COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(current, option, arg2); +} +#endif + +unsigned long KSTK_ESP(struct task_struct *task) +{ + return task_pt_regs(task)->sp; +} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c new file mode 100644 index 000000000..bedca0114 --- /dev/null +++ b/arch/x86/kernel/ptrace.c @@ -0,0 +1,1340 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* By Ross Biro 1/23/92 */ +/* + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tls.h" + +enum x86_regset { + REGSET_GENERAL, + REGSET_FP, + REGSET_XFP, + REGSET_IOPERM64 = REGSET_XFP, + REGSET_XSTATE, + REGSET_TLS, + REGSET_IOPERM32, +}; + +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { +#ifdef CONFIG_X86_64 + REG_OFFSET_NAME(r15), + REG_OFFSET_NAME(r14), + REG_OFFSET_NAME(r13), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r8), +#endif + REG_OFFSET_NAME(bx), + REG_OFFSET_NAME(cx), + REG_OFFSET_NAME(dx), + REG_OFFSET_NAME(si), + REG_OFFSET_NAME(di), + REG_OFFSET_NAME(bp), + REG_OFFSET_NAME(ax), +#ifdef CONFIG_X86_32 + REG_OFFSET_NAME(ds), + REG_OFFSET_NAME(es), + REG_OFFSET_NAME(fs), + REG_OFFSET_NAME(gs), +#endif + REG_OFFSET_NAME(orig_ax), + REG_OFFSET_NAME(ip), + REG_OFFSET_NAME(cs), + REG_OFFSET_NAME(flags), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(ss), + REG_OFFSET_END, +}; + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +/** + * regs_query_register_name() - query register name from its offset + * @offset: the offset of a register in struct pt_regs. + * + * regs_query_register_name() returns the name of a register from its + * offset in struct pt_regs. If the @offset is invalid, this returns NULL; + */ +const char *regs_query_register_name(unsigned int offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + +/* + * does not yet catch signals sent when the child dies. + * in exit.c or in signal.c. + */ + +/* + * Determines which flags the user has access to [1 = access, 0 = no access]. + */ +#define FLAG_MASK_32 ((unsigned long) \ + (X86_EFLAGS_CF | X86_EFLAGS_PF | \ + X86_EFLAGS_AF | X86_EFLAGS_ZF | \ + X86_EFLAGS_SF | X86_EFLAGS_TF | \ + X86_EFLAGS_DF | X86_EFLAGS_OF | \ + X86_EFLAGS_RF | X86_EFLAGS_AC)) + +/* + * Determines whether a value may be installed in a segment register. + */ +static inline bool invalid_selector(u16 value) +{ + return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL); +} + +#ifdef CONFIG_X86_32 + +#define FLAG_MASK FLAG_MASK_32 + +static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); + return ®s->bx + (regno >> 2); +} + +static u16 get_segment_reg(struct task_struct *task, unsigned long offset) +{ + /* + * Returning the value truncates it to 16 bits. + */ + unsigned int retval; + if (offset != offsetof(struct user_regs_struct, gs)) + retval = *pt_regs_access(task_pt_regs(task), offset); + else { + if (task == current) + retval = get_user_gs(task_pt_regs(task)); + else + retval = task_user_gs(task); + } + return retval; +} + +static int set_segment_reg(struct task_struct *task, + unsigned long offset, u16 value) +{ + if (WARN_ON_ONCE(task == current)) + return -EIO; + + /* + * The value argument was already truncated to 16 bits. + */ + if (invalid_selector(value)) + return -EIO; + + /* + * For %cs and %ss we cannot permit a null selector. + * We can permit a bogus selector as long as it has USER_RPL. + * Null selectors are fine for other segment registers, but + * we will never get back to user mode with invalid %cs or %ss + * and will take the trap in iret instead. Much code relies + * on user_mode() to distinguish a user trap frame (which can + * safely use invalid selectors) from a kernel trap frame. + */ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ss): + if (unlikely(value == 0)) + return -EIO; + fallthrough; + + default: + *pt_regs_access(task_pt_regs(task), offset) = value; + break; + + case offsetof(struct user_regs_struct, gs): + task_user_gs(task) = value; + } + + return 0; +} + +#else /* CONFIG_X86_64 */ + +#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) + +static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0); + return ®s->r15 + (offset / sizeof(regs->r15)); +} + +static u16 get_segment_reg(struct task_struct *task, unsigned long offset) +{ + /* + * Returning the value truncates it to 16 bits. + */ + unsigned int seg; + + switch (offset) { + case offsetof(struct user_regs_struct, fs): + if (task == current) { + /* Older gas can't assemble movq %?s,%r?? */ + asm("movl %%fs,%0" : "=r" (seg)); + return seg; + } + return task->thread.fsindex; + case offsetof(struct user_regs_struct, gs): + if (task == current) { + asm("movl %%gs,%0" : "=r" (seg)); + return seg; + } + return task->thread.gsindex; + case offsetof(struct user_regs_struct, ds): + if (task == current) { + asm("movl %%ds,%0" : "=r" (seg)); + return seg; + } + return task->thread.ds; + case offsetof(struct user_regs_struct, es): + if (task == current) { + asm("movl %%es,%0" : "=r" (seg)); + return seg; + } + return task->thread.es; + + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ss): + break; + } + return *pt_regs_access(task_pt_regs(task), offset); +} + +static int set_segment_reg(struct task_struct *task, + unsigned long offset, u16 value) +{ + if (WARN_ON_ONCE(task == current)) + return -EIO; + + /* + * The value argument was already truncated to 16 bits. + */ + if (invalid_selector(value)) + return -EIO; + + /* + * Writes to FS and GS will change the stored selector. Whether + * this changes the segment base as well depends on whether + * FSGSBASE is enabled. + */ + + switch (offset) { + case offsetof(struct user_regs_struct,fs): + task->thread.fsindex = value; + break; + case offsetof(struct user_regs_struct,gs): + task->thread.gsindex = value; + break; + case offsetof(struct user_regs_struct,ds): + task->thread.ds = value; + break; + case offsetof(struct user_regs_struct,es): + task->thread.es = value; + break; + + /* + * Can't actually change these in 64-bit mode. + */ + case offsetof(struct user_regs_struct,cs): + if (unlikely(value == 0)) + return -EIO; + task_pt_regs(task)->cs = value; + break; + case offsetof(struct user_regs_struct,ss): + if (unlikely(value == 0)) + return -EIO; + task_pt_regs(task)->ss = value; + break; + } + + return 0; +} + +#endif /* CONFIG_X86_32 */ + +static unsigned long get_flags(struct task_struct *task) +{ + unsigned long retval = task_pt_regs(task)->flags; + + /* + * If the debugger set TF, hide it from the readout. + */ + if (test_tsk_thread_flag(task, TIF_FORCED_TF)) + retval &= ~X86_EFLAGS_TF; + + return retval; +} + +static int set_flags(struct task_struct *task, unsigned long value) +{ + struct pt_regs *regs = task_pt_regs(task); + + /* + * If the user value contains TF, mark that + * it was not "us" (the debugger) that set it. + * If not, make sure it stays set if we had. + */ + if (value & X86_EFLAGS_TF) + clear_tsk_thread_flag(task, TIF_FORCED_TF); + else if (test_tsk_thread_flag(task, TIF_FORCED_TF)) + value |= X86_EFLAGS_TF; + + regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK); + + return 0; +} + +static int putreg(struct task_struct *child, + unsigned long offset, unsigned long value) +{ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ds): + case offsetof(struct user_regs_struct, es): + case offsetof(struct user_regs_struct, fs): + case offsetof(struct user_regs_struct, gs): + case offsetof(struct user_regs_struct, ss): + return set_segment_reg(child, offset, value); + + case offsetof(struct user_regs_struct, flags): + return set_flags(child, value); + +#ifdef CONFIG_X86_64 + case offsetof(struct user_regs_struct,fs_base): + if (value >= TASK_SIZE_MAX) + return -EIO; + x86_fsbase_write_task(child, value); + return 0; + case offsetof(struct user_regs_struct,gs_base): + if (value >= TASK_SIZE_MAX) + return -EIO; + x86_gsbase_write_task(child, value); + return 0; +#endif + } + + *pt_regs_access(task_pt_regs(child), offset) = value; + return 0; +} + +static unsigned long getreg(struct task_struct *task, unsigned long offset) +{ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ds): + case offsetof(struct user_regs_struct, es): + case offsetof(struct user_regs_struct, fs): + case offsetof(struct user_regs_struct, gs): + case offsetof(struct user_regs_struct, ss): + return get_segment_reg(task, offset); + + case offsetof(struct user_regs_struct, flags): + return get_flags(task); + +#ifdef CONFIG_X86_64 + case offsetof(struct user_regs_struct, fs_base): + return x86_fsbase_read_task(task); + case offsetof(struct user_regs_struct, gs_base): + return x86_gsbase_read_task(task); +#endif + } + + return *pt_regs_access(task_pt_regs(task), offset); +} + +static int genregs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + int reg; + + for (reg = 0; to.left; reg++) + membuf_store(&to, getreg(target, reg * sizeof(unsigned long))); + return 0; +} + +static int genregs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + if (kbuf) { + const unsigned long *k = kbuf; + while (count >= sizeof(*k) && !ret) { + ret = putreg(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const unsigned long __user *u = ubuf; + while (count >= sizeof(*u) && !ret) { + unsigned long word; + ret = __get_user(word, u++); + if (ret) + break; + ret = putreg(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return ret; +} + +static void ptrace_triggered(struct perf_event *bp, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + int i; + struct thread_struct *thread = &(current->thread); + + /* + * Store in the virtual DR6 register the fact that the breakpoint + * was hit so the thread's debugger will see it. + */ + for (i = 0; i < HBP_NUM; i++) { + if (thread->ptrace_bps[i] == bp) + break; + } + + thread->virtual_dr6 |= (DR_TRAP0 << i); +} + +/* + * Walk through every ptrace breakpoints for this thread and + * build the dr7 value on top of their attributes. + * + */ +static unsigned long ptrace_get_dr7(struct perf_event *bp[]) +{ + int i; + int dr7 = 0; + struct arch_hw_breakpoint *info; + + for (i = 0; i < HBP_NUM; i++) { + if (bp[i] && !bp[i]->attr.disabled) { + info = counter_arch_bp(bp[i]); + dr7 |= encode_dr7(i, info->len, info->type); + } + } + + return dr7; +} + +static int ptrace_fill_bp_fields(struct perf_event_attr *attr, + int len, int type, bool disabled) +{ + int err, bp_len, bp_type; + + err = arch_bp_generic_fields(len, type, &bp_len, &bp_type); + if (!err) { + attr->bp_len = bp_len; + attr->bp_type = bp_type; + attr->disabled = disabled; + } + + return err; +} + +static struct perf_event * +ptrace_register_breakpoint(struct task_struct *tsk, int len, int type, + unsigned long addr, bool disabled) +{ + struct perf_event_attr attr; + int err; + + ptrace_breakpoint_init(&attr); + attr.bp_addr = addr; + + err = ptrace_fill_bp_fields(&attr, len, type, disabled); + if (err) + return ERR_PTR(err); + + return register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); +} + +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + int disabled) +{ + struct perf_event_attr attr = bp->attr; + int err; + + err = ptrace_fill_bp_fields(&attr, len, type, disabled); + if (err) + return err; + + return modify_user_hw_breakpoint(bp, &attr); +} + +/* + * Handle ptrace writes to debug register 7. + */ +static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) +{ + struct thread_struct *thread = &tsk->thread; + unsigned long old_dr7; + bool second_pass = false; + int i, rc, ret = 0; + + data &= ~DR_CONTROL_RESERVED; + old_dr7 = ptrace_get_dr7(thread->ptrace_bps); + +restore: + rc = 0; + for (i = 0; i < HBP_NUM; i++) { + unsigned len, type; + bool disabled = !decode_dr7(data, i, &len, &type); + struct perf_event *bp = thread->ptrace_bps[i]; + + if (!bp) { + if (disabled) + continue; + + bp = ptrace_register_breakpoint(tsk, + len, type, 0, disabled); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + break; + } + + thread->ptrace_bps[i] = bp; + continue; + } + + rc = ptrace_modify_breakpoint(bp, len, type, disabled); + if (rc) + break; + } + + /* Restore if the first pass failed, second_pass shouldn't fail. */ + if (rc && !WARN_ON(second_pass)) { + ret = rc; + data = old_dr7; + second_pass = true; + goto restore; + } + + return ret; +} + +/* + * Handle PTRACE_PEEKUSR calls for the debug register area. + */ +static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +{ + struct thread_struct *thread = &tsk->thread; + unsigned long val = 0; + + if (n < HBP_NUM) { + int index = array_index_nospec(n, HBP_NUM); + struct perf_event *bp = thread->ptrace_bps[index]; + + if (bp) + val = bp->hw.info.address; + } else if (n == 6) { + val = thread->virtual_dr6 ^ DR6_RESERVED; /* Flip back to arch polarity */ + } else if (n == 7) { + val = thread->ptrace_dr7; + } + return val; +} + +static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, + unsigned long addr) +{ + struct thread_struct *t = &tsk->thread; + struct perf_event *bp = t->ptrace_bps[nr]; + int err = 0; + + if (!bp) { + /* + * Put stub len and type to create an inactive but correct bp. + * + * CHECKME: the previous code returned -EIO if the addr wasn't + * a valid task virtual addr. The new one will return -EINVAL in + * this case. + * -EINVAL may be what we want for in-kernel breakpoints users, + * but -EIO looks better for ptrace, since we refuse a register + * writing for the user. And anyway this is the previous + * behaviour. + */ + bp = ptrace_register_breakpoint(tsk, + X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE, + addr, true); + if (IS_ERR(bp)) + err = PTR_ERR(bp); + else + t->ptrace_bps[nr] = bp; + } else { + struct perf_event_attr attr = bp->attr; + + attr.bp_addr = addr; + err = modify_user_hw_breakpoint(bp, &attr); + } + + return err; +} + +/* + * Handle PTRACE_POKEUSR calls for the debug register area. + */ +static int ptrace_set_debugreg(struct task_struct *tsk, int n, + unsigned long val) +{ + struct thread_struct *thread = &tsk->thread; + /* There are no DR4 or DR5 registers */ + int rc = -EIO; + + if (n < HBP_NUM) { + rc = ptrace_set_breakpoint_addr(tsk, n, val); + } else if (n == 6) { + thread->virtual_dr6 = val ^ DR6_RESERVED; /* Flip to positive polarity */ + rc = 0; + } else if (n == 7) { + rc = ptrace_write_dr7(tsk, val); + if (!rc) + thread->ptrace_dr7 = val; + } + return rc; +} + +/* + * These access the current or another (stopped) task's io permission + * bitmap for debugging or core dump. + */ +static int ioperm_active(struct task_struct *target, + const struct user_regset *regset) +{ + struct io_bitmap *iobm = target->thread.io_bitmap; + + return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0; +} + +static int ioperm_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct io_bitmap *iobm = target->thread.io_bitmap; + + if (!iobm) + return -ENXIO; + + return membuf_write(&to, iobm->bitmap, IO_BITMAP_BYTES); +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure the single step bit is not set. + */ +void ptrace_disable(struct task_struct *child) +{ + user_disable_single_step(child); +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +static const struct user_regset_view user_x86_32_view; /* Initialized below. */ +#endif + +long arch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret; + unsigned long __user *datap = (unsigned long __user *)data; + + switch (request) { + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: { + unsigned long tmp; + + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) + break; + + tmp = 0; /* Default return condition */ + if (addr < sizeof(struct user_regs_struct)) + tmp = getreg(child, addr); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + tmp = ptrace_get_debugreg(child, addr / sizeof(data)); + } + ret = put_user(tmp, datap); + break; + } + + case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) + break; + + if (addr < sizeof(struct user_regs_struct)) + ret = putreg(child, addr, data); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + ret = ptrace_set_debugreg(child, + addr / sizeof(data), data); + } + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + +#ifdef CONFIG_X86_32 + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_XFP, + 0, sizeof(struct user_fxsr_struct), + datap) ? -EIO : 0; + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_XFP, + 0, sizeof(struct user_fxsr_struct), + datap) ? -EIO : 0; +#endif + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + case PTRACE_GET_THREAD_AREA: + if ((int) addr < 0) + return -EIO; + ret = do_get_thread_area(child, addr, + (struct user_desc __user *)data); + break; + + case PTRACE_SET_THREAD_AREA: + if ((int) addr < 0) + return -EIO; + ret = do_set_thread_area(child, addr, + (struct user_desc __user *)data, 0); + break; +#endif + +#ifdef CONFIG_X86_64 + /* normal 64bit interface to access TLS data. + Works just like arch_prctl, except that the arguments + are reversed. */ + case PTRACE_ARCH_PRCTL: + ret = do_arch_prctl_64(child, data, addr); + break; +#endif + + default: + ret = ptrace_request(child, request, addr, data); + break; + } + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION + +#include +#include +#include +#include + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): \ + regs->q = value; break + +#define SEG32(rs) \ + case offsetof(struct user32, regs.rs): \ + return set_segment_reg(child, \ + offsetof(struct user_regs_struct, rs), \ + value); \ + break + +static int putreg32(struct task_struct *child, unsigned regno, u32 value) +{ + struct pt_regs *regs = task_pt_regs(child); + int ret; + + switch (regno) { + + SEG32(cs); + SEG32(ds); + SEG32(es); + + /* + * A 32-bit ptracer on a 64-bit kernel expects that writing + * FS or GS will also update the base. This is needed for + * operations like PTRACE_SETREGS to fully restore a saved + * CPU state. + */ + + case offsetof(struct user32, regs.fs): + ret = set_segment_reg(child, + offsetof(struct user_regs_struct, fs), + value); + if (ret == 0) + child->thread.fsbase = + x86_fsgsbase_read_task(child, value); + return ret; + + case offsetof(struct user32, regs.gs): + ret = set_segment_reg(child, + offsetof(struct user_regs_struct, gs), + value); + if (ret == 0) + child->thread.gsbase = + x86_fsgsbase_read_task(child, value); + return ret; + + SEG32(ss); + + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(eip, ip); + R32(esp, sp); + + case offsetof(struct user32, regs.orig_eax): + /* + * Warning: bizarre corner case fixup here. A 32-bit + * debugger setting orig_eax to -1 wants to disable + * syscall restart. Make sure that the syscall + * restart code sign-extends orig_ax. Also make sure + * we interpret the -ERESTART* codes correctly if + * loaded into regs->ax in case the task is not + * actually still sitting at the exit from a 32-bit + * syscall with TS_COMPAT still set. + */ + regs->orig_ax = value; + if (syscall_get_nr(child, regs) >= 0) + child->thread_info.status |= TS_I386_REGS_POKED; + break; + + case offsetof(struct user32, regs.eflags): + return set_flags(child, value); + + case offsetof(struct user32, u_debugreg[0]) ... + offsetof(struct user32, u_debugreg[7]): + regno -= offsetof(struct user32, u_debugreg[0]); + return ptrace_set_debugreg(child, regno / 4, value); + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* + * Other dummy fields in the virtual user structure + * are ignored + */ + break; + } + return 0; +} + +#undef R32 +#undef SEG32 + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): \ + *val = regs->q; break + +#define SEG32(rs) \ + case offsetof(struct user32, regs.rs): \ + *val = get_segment_reg(child, \ + offsetof(struct user_regs_struct, rs)); \ + break + +static int getreg32(struct task_struct *child, unsigned regno, u32 *val) +{ + struct pt_regs *regs = task_pt_regs(child); + + switch (regno) { + + SEG32(ds); + SEG32(es); + SEG32(fs); + SEG32(gs); + + R32(cs, cs); + R32(ss, ss); + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(orig_eax, orig_ax); + R32(eip, ip); + R32(esp, sp); + + case offsetof(struct user32, regs.eflags): + *val = get_flags(child); + break; + + case offsetof(struct user32, u_debugreg[0]) ... + offsetof(struct user32, u_debugreg[7]): + regno -= offsetof(struct user32, u_debugreg[0]); + *val = ptrace_get_debugreg(child, regno / 4); + break; + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* + * Other dummy fields in the virtual user structure + * are ignored + */ + *val = 0; + break; + } + return 0; +} + +#undef R32 +#undef SEG32 + +static int genregs32_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + int reg; + + for (reg = 0; to.left; reg++) { + u32 val; + getreg32(target, reg * 4, &val); + membuf_store(&to, val); + } + return 0; +} + +static int genregs32_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + if (kbuf) { + const compat_ulong_t *k = kbuf; + while (count >= sizeof(*k) && !ret) { + ret = putreg32(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const compat_ulong_t __user *u = ubuf; + while (count >= sizeof(*u) && !ret) { + compat_ulong_t word; + ret = __get_user(word, u++); + if (ret) + break; + ret = putreg32(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return ret; +} + +static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t caddr, compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + void __user *datap = compat_ptr(data); + int ret; + __u32 val; + + switch (request) { + case PTRACE_PEEKUSR: + ret = getreg32(child, addr, &val); + if (ret == 0) + ret = put_user(val, (__u32 __user *)datap); + break; + + case PTRACE_POKEUSR: + ret = putreg32(child, addr, data); + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct32), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_GENERAL, 0, + sizeof(struct user_regs_struct32), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_FP, 0, + sizeof(struct user_i387_ia32_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user( + child, &user_x86_32_view, REGSET_FP, + 0, sizeof(struct user_i387_ia32_struct), datap); + + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_GET_THREAD_AREA: + case PTRACE_SET_THREAD_AREA: + return arch_ptrace(child, request, addr, data); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + return ret; +} +#endif /* CONFIG_IA32_EMULATION */ + +#ifdef CONFIG_X86_X32_ABI +static long x32_arch_ptrace(struct task_struct *child, + compat_long_t request, compat_ulong_t caddr, + compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + void __user *datap = compat_ptr(data); + int ret; + + switch (request) { + /* Read 32bits at location addr in the USER area. Only allow + to return the lower 32bits of segment and debug registers. */ + case PTRACE_PEEKUSR: { + u32 tmp; + + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || + addr < offsetof(struct user_regs_struct, cs)) + break; + + tmp = 0; /* Default return condition */ + if (addr < sizeof(struct user_regs_struct)) + tmp = getreg(child, addr); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + tmp = ptrace_get_debugreg(child, addr / sizeof(data)); + } + ret = put_user(tmp, (__u32 __user *)datap); + break; + } + + /* Write the word at location addr in the USER area. Only allow + to update segment and debug registers with the upper 32bits + zero-extended. */ + case PTRACE_POKEUSR: + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || + addr < offsetof(struct user_regs_struct, cs)) + break; + + if (addr < sizeof(struct user_regs_struct)) + ret = putreg(child, addr, data); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + ret = ptrace_set_debugreg(child, + addr / sizeof(data), data); + } + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + return ret; +} +#endif + +#ifdef CONFIG_COMPAT +long compat_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t caddr, compat_ulong_t cdata) +{ +#ifdef CONFIG_X86_X32_ABI + if (!in_ia32_syscall()) + return x32_arch_ptrace(child, request, caddr, cdata); +#endif +#ifdef CONFIG_IA32_EMULATION + return ia32_arch_ptrace(child, request, caddr, cdata); +#else + return 0; +#endif +} +#endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_X86_64 + +static struct user_regset x86_64_regsets[] __ro_after_init = { + [REGSET_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct) / sizeof(long), + .size = sizeof(long), .align = sizeof(long), + .regset_get = genregs_get, .set = genregs_set + }, + [REGSET_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_struct) / sizeof(long), + .size = sizeof(long), .align = sizeof(long), + .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set + }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .regset_get = xstateregs_get, + .set = xstateregs_set + }, + [REGSET_IOPERM64] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_LONGS, + .size = sizeof(long), .align = sizeof(long), + .active = ioperm_active, .regset_get = ioperm_get + }, +}; + +static const struct user_regset_view user_x86_64_view = { + .name = "x86_64", .e_machine = EM_X86_64, + .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets) +}; + +#else /* CONFIG_X86_32 */ + +#define user_regs_struct32 user_regs_struct +#define genregs32_get genregs_get +#define genregs32_set genregs_set + +#endif /* CONFIG_X86_64 */ + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +static struct user_regset x86_32_regsets[] __ro_after_init = { + [REGSET_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct32) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .regset_get = genregs32_get, .set = genregs32_set + }, + [REGSET_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set + }, + [REGSET_XFP] = { + .core_note_type = NT_PRXFPREG, + .n = sizeof(struct user32_fxsr_struct) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set + }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .regset_get = xstateregs_get, + .set = xstateregs_set + }, + [REGSET_TLS] = { + .core_note_type = NT_386_TLS, + .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, + .size = sizeof(struct user_desc), + .align = sizeof(struct user_desc), + .active = regset_tls_active, + .regset_get = regset_tls_get, .set = regset_tls_set + }, + [REGSET_IOPERM32] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_BYTES / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = ioperm_active, .regset_get = ioperm_get + }, +}; + +static const struct user_regset_view user_x86_32_view = { + .name = "i386", .e_machine = EM_386, + .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets) +}; +#endif + +/* + * This represents bytes 464..511 in the memory layout exported through + * the REGSET_XSTATE interface. + */ +u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; + +void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) +{ +#ifdef CONFIG_X86_64 + x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif + xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; +} + +const struct user_regset_view *task_user_regset_view(struct task_struct *task) +{ +#ifdef CONFIG_IA32_EMULATION + if (!user_64bit_mode(task_pt_regs(task))) +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + return &user_x86_32_view; +#endif +#ifdef CONFIG_X86_64 + return &user_x86_64_view; +#endif +} + +void send_sigtrap(struct pt_regs *regs, int error_code, int si_code) +{ + struct task_struct *tsk = current; + + tsk->thread.trap_nr = X86_TRAP_DB; + tsk->thread.error_code = error_code; + + /* Send us the fake SIGTRAP */ + force_sig_fault(SIGTRAP, si_code, + user_mode(regs) ? (void __user *)regs->ip : NULL); +} + +void user_single_step_report(struct pt_regs *regs) +{ + send_sigtrap(regs, 0, TRAP_BRKPT); +} diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c new file mode 100644 index 000000000..11065dc03 --- /dev/null +++ b/arch/x86/kernel/pvclock.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* paravirtual clock -- common code used by kvm/xen + +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static u8 valid_flags __read_mostly = 0; +static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly; + +void pvclock_set_flags(u8 flags) +{ + valid_flags = flags; +} + +unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) +{ + u64 pv_tsc_khz = 1000000ULL << 32; + + do_div(pv_tsc_khz, src->tsc_to_system_mul); + if (src->tsc_shift < 0) + pv_tsc_khz <<= -src->tsc_shift; + else + pv_tsc_khz >>= src->tsc_shift; + return pv_tsc_khz; +} + +void pvclock_touch_watchdogs(void) +{ + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + rcu_cpu_stall_reset(); + reset_hung_task_detector(); +} + +static atomic64_t last_value = ATOMIC64_INIT(0); + +void pvclock_resume(void) +{ + atomic64_set(&last_value, 0); +} + +u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) +{ + unsigned version; + u8 flags; + + do { + version = pvclock_read_begin(src); + flags = src->flags; + } while (pvclock_read_retry(src, version)); + + return flags & valid_flags; +} + +u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +{ + unsigned version; + u64 ret; + u64 last; + u8 flags; + + do { + version = pvclock_read_begin(src); + ret = __pvclock_read_cycles(src, rdtsc_ordered()); + flags = src->flags; + } while (pvclock_read_retry(src, version)); + + if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { + src->flags &= ~PVCLOCK_GUEST_STOPPED; + pvclock_touch_watchdogs(); + } + + if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && + (flags & PVCLOCK_TSC_STABLE_BIT)) + return ret; + + /* + * Assumption here is that last_value, a global accumulator, always goes + * forward. If we are less than that, we should not be much smaller. + * We assume there is an error marging we're inside, and then the correction + * does not sacrifice accuracy. + * + * For reads: global may have changed between test and return, + * but this means someone else updated poked the clock at a later time. + * We just need to make sure we are not seeing a backwards event. + * + * For updates: last_value = ret is not enough, since two vcpus could be + * updating at the same time, and one of them could be slightly behind, + * making the assumption that last_value always go forward fail to hold. + */ + last = atomic64_read(&last_value); + do { + if (ret < last) + return last; + last = atomic64_cmpxchg(&last_value, last, ret); + } while (unlikely(last != ret)); + + return ret; +} + +void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, + struct pvclock_vcpu_time_info *vcpu_time, + struct timespec64 *ts) +{ + u32 version; + u64 delta; + struct timespec64 now; + + /* get wallclock at system boot */ + do { + version = wall_clock->version; + rmb(); /* fetch version before time */ + /* + * Note: wall_clock->sec is a u32 value, so it can + * only store dates between 1970 and 2106. To allow + * times beyond that, we need to create a new hypercall + * interface with an extended pvclock_wall_clock structure + * like ARM has. + */ + now.tv_sec = wall_clock->sec; + now.tv_nsec = wall_clock->nsec; + rmb(); /* fetch time before checking version */ + } while ((wall_clock->version & 1) || (version != wall_clock->version)); + + delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ + delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec); +} + +void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti) +{ + WARN_ON(vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)); + pvti_cpu0_va = pvti; +} + +struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void) +{ + return pvti_cpu0_va; +} +EXPORT_SYMBOL_GPL(pvclock_get_pvti_cpu0_va); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c new file mode 100644 index 000000000..6d0df6a58 --- /dev/null +++ b/arch/x86/kernel/quirks.c @@ -0,0 +1,669 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains work-arounds for x86 and x86_64 platform bugs. + */ +#include +#include +#include + +#include +#include +#include + +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) + +static void quirk_intel_irqbalance(struct pci_dev *dev) +{ + u8 config; + u16 word; + + /* BIOS may enable hardware IRQ balancing for + * E7520/E7320/E7525(revision ID 0x9 and below) + * based platforms. + * Disable SW irqbalance/affinity on those platforms. + */ + if (dev->revision > 0x9) + return; + + /* enable access to config space*/ + pci_read_config_byte(dev, 0xf4, &config); + pci_write_config_byte(dev, 0xf4, config|0x2); + + /* + * read xTPR register. We may not have a pci_dev for device 8 + * because it might be hidden until the above write. + */ + pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word); + + if (!(word & (1 << 13))) { + dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " + "disabling irq balancing and affinity\n"); + noirqdebug_setup(""); +#ifdef CONFIG_PROC_FS + no_irq_affinity = 1; +#endif + } + + /* put back the original value for config space*/ + if (!(config & 0x2)) + pci_write_config_byte(dev, 0xf4, config); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, + quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, + quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, + quirk_intel_irqbalance); +#endif + +#if defined(CONFIG_HPET_TIMER) +unsigned long force_hpet_address; + +static enum { + NONE_FORCE_HPET_RESUME, + OLD_ICH_FORCE_HPET_RESUME, + ICH_FORCE_HPET_RESUME, + VT8237_FORCE_HPET_RESUME, + NVIDIA_FORCE_HPET_RESUME, + ATI_FORCE_HPET_RESUME, +} force_hpet_resume_type; + +static void __iomem *rcba_base; + +static void ich_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address) + return; + + BUG_ON(rcba_base == NULL); + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + /* HPET disabled in HPTC. Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + } + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) + BUG(); + else + printk(KERN_DEBUG "Force enabled HPET at resume\n"); +} + +static void ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + u32 rcba; + int err = 0; + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xF0, &rcba); + rcba &= 0xFFFFC000; + if (rcba == 0) { + dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; " + "cannot force enable HPET\n"); + return; + } + + /* use bits 31:14, 16 kB aligned */ + rcba_base = ioremap(rcba, 0x4000); + if (rcba_base == NULL) { + dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; " + "cannot force enable HPET\n"); + return; + } + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + + if (val & 0x80) { + /* HPET is enabled in HPTC. Just not reported by BIOS */ + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + iounmap(rcba_base); + return; + } + + /* HPET disabled in HPTC. Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + err = 1; + } else { + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + } + + if (err) { + force_hpet_address = 0; + iounmap(rcba_base); + dev_printk(KERN_DEBUG, &dev->dev, + "Failed to force enable HPET\n"); + } else { + force_hpet_resume_type = ICH_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + } +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16, /* ICH10 */ + ich_force_enable_hpet); + +static struct pci_dev *cached_dev; + +static void hpet_print_force_info(void) +{ + printk(KERN_INFO "HPET not enabled in BIOS. " + "You might try hpet=force boot option\n"); +} + +static void old_ich_force_hpet_resume(void) +{ + u32 val; + u32 gen_cntl; + + if (!force_hpet_address || !cached_dev) + return; + + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + + pci_write_config_dword(cached_dev, 0xD0, gen_cntl); + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + val = gen_cntl >> 15; + val &= 0x7; + if (val == 0x4) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void old_ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + u32 gen_cntl; + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + /* + * Bit 17 is HPET enable bit. + * Bit 16:15 control the HPET base address. + */ + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + pci_write_config_dword(dev, 0xD0, gen_cntl); + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + /* HPET is enabled in HPTC. Just not reported by BIOS */ + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; + return; + } + + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); +} + +/* + * Undocumented chipset features. Make sure that the user enforced + * this. + */ +static void old_ich_force_enable_hpet_user(struct pci_dev *dev) +{ + if (hpet_force_user) + old_ich_force_enable_hpet(dev); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12, + old_ich_force_enable_hpet); + + +static void vt8237_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address || !cached_dev) + return; + + val = 0xfed00000 | 0x80; + pci_write_config_dword(cached_dev, 0x68, val); + + pci_read_config_dword(cached_dev, 0x68, &val); + if (val & 0x80) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void vt8237_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); + return; + } + + pci_read_config_dword(dev, 0x68, &val); + /* + * Bit 7 is HPET enable bit. + * Bit 31:10 is HPET base address (contrary to what datasheet claims) + */ + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + val = 0xfed00000 | 0x80; + pci_write_config_dword(dev, 0x68, val); + + pci_read_config_dword(dev, 0x68, &val); + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; + return; + } + + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, + vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, + vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700, + vt8237_force_enable_hpet); + +static void ati_force_hpet_resume(void) +{ + pci_write_config_dword(cached_dev, 0x14, 0xfed00000); + printk(KERN_DEBUG "Force enabled HPET at resume\n"); +} + +static u32 ati_ixp4x0_rev(struct pci_dev *dev) +{ + int err = 0; + u32 d = 0; + u8 b = 0; + + err = pci_read_config_byte(dev, 0xac, &b); + b &= ~(1<<5); + err |= pci_write_config_byte(dev, 0xac, b); + err |= pci_read_config_dword(dev, 0x70, &d); + d |= 1<<8; + err |= pci_write_config_dword(dev, 0x70, d); + err |= pci_read_config_dword(dev, 0x8, &d); + d &= 0xff; + dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d); + + WARN_ON_ONCE(err); + + return d; +} + +static void ati_force_enable_hpet(struct pci_dev *dev) +{ + u32 d, val; + u8 b; + + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); + return; + } + + d = ati_ixp4x0_rev(dev); + if (d < 0x82) + return; + + /* base address */ + pci_write_config_dword(dev, 0x14, 0xfed00000); + pci_read_config_dword(dev, 0x14, &val); + + /* enable interrupt */ + outb(0x72, 0xcd6); b = inb(0xcd7); + b |= 0x1; + outb(0x72, 0xcd6); outb(b, 0xcd7); + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x1)) + return; + pci_read_config_dword(dev, 0x64, &d); + d |= (1<<10); + pci_write_config_dword(dev, 0x64, d); + pci_read_config_dword(dev, 0x64, &d); + if (!(d & (1<<10))) + return; + + force_hpet_address = val; + force_hpet_resume_type = ATI_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", + force_hpet_address); + cached_dev = dev; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, + ati_force_enable_hpet); + +/* + * Undocumented chipset feature taken from LinuxBIOS. + */ +static void nvidia_force_hpet_resume(void) +{ + pci_write_config_dword(cached_dev, 0x44, 0xfed00001); + printk(KERN_DEBUG "Force enabled HPET at resume\n"); +} + +static void nvidia_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); + return; + } + + pci_write_config_dword(dev, 0x44, 0xfed00001); + pci_read_config_dword(dev, 0x44, &val); + force_hpet_address = val & 0xfffffffe; + force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", + force_hpet_address); + cached_dev = dev; +} + +/* ISA Bridges */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0050, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0051, + nvidia_force_enable_hpet); + +/* LPC bridges */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0362, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0363, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0364, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0365, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0366, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367, + nvidia_force_enable_hpet); + +void force_hpet_resume(void) +{ + switch (force_hpet_resume_type) { + case ICH_FORCE_HPET_RESUME: + ich_force_hpet_resume(); + return; + case OLD_ICH_FORCE_HPET_RESUME: + old_ich_force_hpet_resume(); + return; + case VT8237_FORCE_HPET_RESUME: + vt8237_force_hpet_resume(); + return; + case NVIDIA_FORCE_HPET_RESUME: + nvidia_force_hpet_resume(); + return; + case ATI_FORCE_HPET_RESUME: + ati_force_hpet_resume(); + return; + default: + break; + } +} + +/* + * According to the datasheet e6xx systems have the HPET hardwired to + * 0xfed00000 + */ +static void e6xx_force_enable_hpet(struct pci_dev *dev) +{ + if (hpet_address || force_hpet_address) + return; + + force_hpet_address = 0xFED00000; + force_hpet_resume_type = NONE_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU, + e6xx_force_enable_hpet); + +/* + * HPET MSI on some boards (ATI SB700/SB800) has side effect on + * floppy DMA. Disable HPET MSI on such platforms. + * See erratum #27 (Misinterpreted MSI Requests May Result in + * Corrupted LPC DMA Data) in AMD Publication #46837, + * "SB700 Family Product Errata", Rev. 1.0, March 2010. + */ +static void force_disable_hpet_msi(struct pci_dev *unused) +{ + hpet_msi_disable = true; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + force_disable_hpet_msi); + +#endif + +#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) +/* Set correct numa_node information for AMD NB functions */ +static void quirk_amd_nb_node(struct pci_dev *dev) +{ + struct pci_dev *nb_ht; + unsigned int devfn; + u32 node; + u32 val; + + devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); + nb_ht = pci_get_slot(dev->bus, devfn); + if (!nb_ht) + return; + + pci_read_config_dword(nb_ht, 0x60, &val); + node = pcibus_to_node(dev->bus) | (val & 7); + /* + * Some hardware may return an invalid node ID, + * so check it first: + */ + if (node_online(node)) + set_dev_node(&dev->dev, node); + pci_dev_put(nb_ht); +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5, + quirk_amd_nb_node); + +#endif + +#ifdef CONFIG_PCI +/* + * Processor does not ensure DRAM scrub read/write sequence + * is atomic wrt accesses to CC6 save state area. Therefore + * if a concurrent scrub read/write access is to same address + * the entry may appear as if it is not written. This quirk + * applies to Fam16h models 00h-0Fh + * + * See "Revision Guide" for AMD F16h models 00h-0fh, + * document 51810 rev. 3.04, Nov 2013 + */ +static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev) +{ + u32 val; + + /* + * Suggested workaround: + * set D18F3x58[4:0] = 00h and set D18F3x5C[0] = 0b + */ + pci_read_config_dword(dev, 0x58, &val); + if (val & 0x1F) { + val &= ~(0x1F); + pci_write_config_dword(dev, 0x58, val); + } + + pci_read_config_dword(dev, 0x5C, &val); + if (val & BIT(0)) { + val &= ~BIT(0); + pci_write_config_dword(dev, 0x5c, val); + } +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, + amd_disable_seq_and_redirect_scrub); + +/* Ivy Bridge, Haswell, Broadwell */ +static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0; + + pci_read_config_dword(pdev, 0x84, &capid0); + + if (capid0 & 0x10) + enable_copy_mc_fragile(); +} + +/* Skylake */ +static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0, capid5; + + pci_read_config_dword(pdev, 0x84, &capid0); + pci_read_config_dword(pdev, 0x98, &capid5); + + /* + * CAPID0{7:6} indicate whether this is an advanced RAS SKU + * CAPID5{8:5} indicate that various NVDIMM usage modes are + * enabled, so memory machine check recovery is also enabled. + */ + if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0)) + enable_copy_mc_fragile(); + +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); +#endif + +bool x86_apple_machine; +EXPORT_SYMBOL(x86_apple_machine); + +void __init early_platform_quirks(void) +{ + x86_apple_machine = dmi_match(DMI_SYS_VENDOR, "Apple Inc.") || + dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc."); +} diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c new file mode 100644 index 000000000..4d8c0e258 --- /dev/null +++ b/arch/x86/kernel/reboot.c @@ -0,0 +1,943 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Power off function, if any + */ +void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); + +/* + * This is set if we need to go through the 'emergency' path. + * When machine_emergency_restart() is called, we may be on + * an inconsistent state and won't be able to do a clean cleanup + */ +static int reboot_emergency; + +/* This is set by the PCI code if either type 1 or type 2 PCI is detected */ +bool port_cf9_safe = false; + +/* + * Reboot options and system auto-detection code provided by + * Dell Inc. so their systems "just work". :-) + */ + +/* + * Some machines require the "reboot=a" commandline options + */ +static int __init set_acpi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_ACPI) { + reboot_type = BOOT_ACPI; + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + d->ident, "ACPI"); + } + return 0; +} + +/* + * Some machines require the "reboot=b" or "reboot=k" commandline options, + * this quirk makes that automatic. + */ +static int __init set_bios_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_BIOS) { + reboot_type = BOOT_BIOS; + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + d->ident, "BIOS"); + } + return 0; +} + +/* + * Some machines don't handle the default ACPI reboot method and + * require the EFI reboot method: + */ +static int __init set_efi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_EFI && !efi_runtime_disabled()) { + reboot_type = BOOT_EFI; + pr_info("%s series board detected. Selecting EFI-method for reboot.\n", d->ident); + } + return 0; +} + +void __noreturn machine_real_restart(unsigned int type) +{ + local_irq_disable(); + + /* + * Write zero to CMOS register number 0x0f, which the BIOS POST + * routine will recognize as telling it to do a proper reboot. (Well + * that's what this book in front of me says -- it may only apply to + * the Phoenix BIOS though, it's not clear). At the same time, + * disable NMIs by setting the top bit in the CMOS address register, + * as we're about to do peculiar things to the CPU. I'm not sure if + * `outb_p' is needed instead of just `outb'. Use it to be on the + * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) + */ + spin_lock(&rtc_lock); + CMOS_WRITE(0x00, 0x8f); + spin_unlock(&rtc_lock); + + /* + * Switch to the trampoline page table. + */ + load_trampoline_pgtable(); + + /* Jump to the identity-mapped low memory code */ +#ifdef CONFIG_X86_32 + asm volatile("jmpl *%0" : : + "rm" (real_mode_header->machine_real_restart_asm), + "a" (type)); +#else + asm volatile("ljmpl *%0" : : + "m" (real_mode_header->machine_real_restart_asm), + "D" (type)); +#endif + unreachable(); +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(machine_real_restart); +#endif +STACK_FRAME_NON_STANDARD(machine_real_restart); + +/* + * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot + */ +static int __init set_pci_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_CF9_FORCE) { + reboot_type = BOOT_CF9_FORCE; + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + d->ident, "PCI"); + } + return 0; +} + +static int __init set_kbd_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_KBD) { + reboot_type = BOOT_KBD; + pr_info("%s series board detected. Selecting %s-method for reboot.\n", + d->ident, "KBD"); + } + return 0; +} + +/* + * This is a single dmi_table handling all reboot quirks. + */ +static const struct dmi_system_id reboot_dmi_table[] __initconst = { + + /* Acer */ + { /* Handle reboot issue on Acer Aspire one */ + .callback = set_kbd_reboot, + .ident = "Acer Aspire One A110", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), + }, + }, + { /* Handle reboot issue on Acer TravelMate X514-51T */ + .callback = set_efi_reboot, + .ident = "Acer TravelMate X514-51T", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate X514-51T"), + }, + }, + + /* Apple */ + { /* Handle problems with rebooting on Apple MacBook5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook5", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), + }, + }, + { /* Handle problems with rebooting on Apple MacBook6,1 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook6,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"), + }, + }, + { /* Handle problems with rebooting on Apple MacBookPro5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBookPro5", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), + }, + }, + { /* Handle problems with rebooting on Apple Macmini3,1 */ + .callback = set_pci_reboot, + .ident = "Apple Macmini3,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), + }, + }, + { /* Handle problems with rebooting on the iMac9,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac9,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), + }, + }, + { /* Handle problems with rebooting on the iMac10,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac10,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"), + }, + }, + + /* ASRock */ + { /* Handle problems with rebooting on ASRock Q1900DC-ITX */ + .callback = set_pci_reboot, + .ident = "ASRock Q1900DC-ITX", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"), + DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"), + }, + }, + + /* ASUS */ + { /* Handle problems with rebooting on ASUS P4S800 */ + .callback = set_bios_reboot, + .ident = "ASUS P4S800", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4S800"), + }, + }, + { /* Handle problems with rebooting on ASUS EeeBook X205TA */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TA", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"), + }, + }, + { /* Handle problems with rebooting on ASUS EeeBook X205TAW */ + .callback = set_acpi_reboot, + .ident = "ASUS EeeBook X205TAW", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"), + }, + }, + + /* Certec */ + { /* Handle problems with rebooting on Certec BPC600 */ + .callback = set_pci_reboot, + .ident = "Certec BPC600", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Certec"), + DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"), + }, + }, + + /* Dell */ + { /* Handle problems with rebooting on Dell DXP061 */ + .callback = set_bios_reboot, + .ident = "Dell DXP061", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"), + }, + }, + { /* Handle problems with rebooting on Dell E520's */ + .callback = set_bios_reboot, + .ident = "Dell E520", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"), + }, + }, + { /* Handle problems with rebooting on the Latitude E5410. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E5410", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5410"), + }, + }, + { /* Handle problems with rebooting on the Latitude E5420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E5420", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"), + }, + }, + { /* Handle problems with rebooting on the Latitude E6320. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6320", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"), + }, + }, + { /* Handle problems with rebooting on the Latitude E6420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6420", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 330", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"), + DMI_MATCH(DMI_BOARD_NAME, "0KP561"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 360", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), + DMI_MATCH(DMI_BOARD_NAME, "0T656F"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 745's SFF */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 745's DFF */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_BOARD_NAME, "0MM599"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_BOARD_NAME, "0KW626"), + }, + }, + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 760", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), + DMI_MATCH(DMI_BOARD_NAME, "0G919G"), + }, + }, + { /* Handle problems with rebooting on the OptiPlex 990. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 990 BIOS A0x", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + DMI_MATCH(DMI_BIOS_VERSION, "A0"), + }, + }, + { /* Handle problems with rebooting on Dell 300's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 300", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), + }, + }, + { /* Handle problems with rebooting on Dell 1300's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 1300", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"), + }, + }, + { /* Handle problems with rebooting on Dell 2400's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 2400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), + }, + }, + { /* Handle problems with rebooting on the Dell PowerEdge C6100. */ + .callback = set_pci_reboot, + .ident = "Dell PowerEdge C6100", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "C6100"), + }, + }, + { /* Handle problems with rebooting on the Precision M6600. */ + .callback = set_pci_reboot, + .ident = "Dell Precision M6600", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), + }, + }, + { /* Handle problems with rebooting on Dell T5400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T5400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), + }, + }, + { /* Handle problems with rebooting on Dell T7400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T7400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"), + }, + }, + { /* Handle problems with rebooting on Dell XPS710 */ + .callback = set_bios_reboot, + .ident = "Dell XPS710", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 7450 AIO */ + .callback = set_acpi_reboot, + .ident = "Dell OptiPlex 7450 AIO", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7450 AIO"), + }, + }, + + /* Hewlett-Packard */ + { /* Handle problems with rebooting on HP laptops */ + .callback = set_bios_reboot, + .ident = "HP Compaq Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), + }, + }, + + { /* PCIe Wifi card isn't detected after reboot otherwise */ + .callback = set_pci_reboot, + .ident = "Zotac ZBOX CI327 nano", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "NA"), + DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"), + }, + }, + + /* Sony */ + { /* Handle problems with rebooting on Sony VGN-Z540N */ + .callback = set_bios_reboot, + .ident = "Sony VGN-Z540N", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), + }, + }, + + { } +}; + +static int __init reboot_init(void) +{ + int rv; + + /* + * Only do the DMI check if reboot_type hasn't been overridden + * on the command line + */ + if (!reboot_default) + return 0; + + /* + * The DMI quirks table takes precedence. If no quirks entry + * matches and the ACPI Hardware Reduced bit is set and EFI + * runtime services are enabled, force EFI reboot. + */ + rv = dmi_check_system(reboot_dmi_table); + + if (!rv && efi_reboot_required() && !efi_runtime_disabled()) + reboot_type = BOOT_EFI; + + return 0; +} +core_initcall(reboot_init); + +static inline void kb_wait(void) +{ + int i; + + for (i = 0; i < 0x10000; i++) { + if ((inb(0x64) & 0x02) == 0) + break; + udelay(2); + } +} + +static inline void nmi_shootdown_cpus_on_restart(void); + +static void emergency_reboot_disable_virtualization(void) +{ + /* Just make sure we won't change CPUs while doing this */ + local_irq_disable(); + + /* + * Disable virtualization on all CPUs before rebooting to avoid hanging + * the system, as VMX and SVM block INIT when running in the host. + * + * We can't take any locks and we may be on an inconsistent state, so + * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt. + * + * Do the NMI shootdown even if virtualization is off on _this_ CPU, as + * other CPUs may have virtualization enabled. + */ + if (cpu_has_vmx() || cpu_has_svm(NULL)) { + /* Safely force _this_ CPU out of VMX/SVM operation. */ + cpu_emergency_disable_virtualization(); + + /* Disable VMX/SVM and halt on other CPUs. */ + nmi_shootdown_cpus_on_restart(); + } +} + + +void __attribute__((weak)) mach_reboot_fixups(void) +{ +} + +/* + * To the best of our knowledge Windows compatible x86 hardware expects + * the following on reboot: + * + * 1) If the FADT has the ACPI reboot register flag set, try it + * 2) If still alive, write to the keyboard controller + * 3) If still alive, write to the ACPI reboot register again + * 4) If still alive, write to the keyboard controller again + * 5) If still alive, call the EFI runtime service to reboot + * 6) If no EFI runtime service, call the BIOS to do a reboot + * + * We default to following the same pattern. We also have + * two other reboot methods: 'triple fault' and 'PCI', which + * can be triggered via the reboot= kernel boot option or + * via quirks. + * + * This means that this function can never return, it can misbehave + * by not rebooting properly and hanging. + */ +static void native_machine_emergency_restart(void) +{ + int i; + int attempt = 0; + int orig_reboot_type = reboot_type; + unsigned short mode; + + if (reboot_emergency) + emergency_reboot_disable_virtualization(); + + tboot_shutdown(TB_SHUTDOWN_REBOOT); + + /* Tell the BIOS if we want cold or warm reboot */ + mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0; + *((unsigned short *)__va(0x472)) = mode; + + /* + * If an EFI capsule has been registered with the firmware then + * override the reboot= parameter. + */ + if (efi_capsule_pending(NULL)) { + pr_info("EFI capsule is pending, forcing EFI reboot.\n"); + reboot_type = BOOT_EFI; + } + + for (;;) { + /* Could also try the reset bit in the Hammer NB */ + switch (reboot_type) { + case BOOT_ACPI: + acpi_reboot(); + reboot_type = BOOT_KBD; + break; + + case BOOT_KBD: + mach_reboot_fixups(); /* For board specific fixups */ + + for (i = 0; i < 10; i++) { + kb_wait(); + udelay(50); + outb(0xfe, 0x64); /* Pulse reset low */ + udelay(50); + } + if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { + attempt = 1; + reboot_type = BOOT_ACPI; + } else { + reboot_type = BOOT_EFI; + } + break; + + case BOOT_EFI: + efi_reboot(reboot_mode, NULL); + reboot_type = BOOT_BIOS; + break; + + case BOOT_BIOS: + machine_real_restart(MRR_BIOS); + + /* We're probably dead after this, but... */ + reboot_type = BOOT_CF9_SAFE; + break; + + case BOOT_CF9_FORCE: + port_cf9_safe = true; + fallthrough; + + case BOOT_CF9_SAFE: + if (port_cf9_safe) { + u8 reboot_code = reboot_mode == REBOOT_WARM ? 0x06 : 0x0E; + u8 cf9 = inb(0xcf9) & ~reboot_code; + outb(cf9|2, 0xcf9); /* Request hard reset */ + udelay(50); + /* Actually do the reset */ + outb(cf9|reboot_code, 0xcf9); + udelay(50); + } + reboot_type = BOOT_TRIPLE; + break; + + case BOOT_TRIPLE: + idt_invalidate(NULL); + __asm__ __volatile__("int3"); + + /* We're probably dead after this, but... */ + reboot_type = BOOT_KBD; + break; + } + } +} + +void native_machine_shutdown(void) +{ + /* Stop the cpus and apics */ +#ifdef CONFIG_X86_IO_APIC + /* + * Disabling IO APIC before local APIC is a workaround for + * erratum AVR31 in "Intel Atom Processor C2000 Product Family + * Specification Update". In this situation, interrupts that target + * a Logical Processor whose Local APIC is either in the process of + * being hardware disabled or software disabled are neither delivered + * nor discarded. When this erratum occurs, the processor may hang. + * + * Even without the erratum, it still makes sense to quiet IO APIC + * before disabling Local APIC. + */ + clear_IO_APIC(); +#endif + +#ifdef CONFIG_SMP + /* + * Stop all of the others. Also disable the local irq to + * not receive the per-cpu timer interrupt which may trigger + * scheduler's load balance. + */ + local_irq_disable(); + stop_other_cpus(); +#endif + + lapic_shutdown(); + restore_boot_irq_mode(); + +#ifdef CONFIG_HPET_TIMER + hpet_disable(); +#endif + +#ifdef CONFIG_X86_64 + x86_platform.iommu_shutdown(); +#endif +} + +static void __machine_emergency_restart(int emergency) +{ + reboot_emergency = emergency; + machine_ops.emergency_restart(); +} + +static void native_machine_restart(char *__unused) +{ + pr_notice("machine restart\n"); + + if (!reboot_force) + machine_shutdown(); + __machine_emergency_restart(0); +} + +static void native_machine_halt(void) +{ + /* Stop other cpus and apics */ + machine_shutdown(); + + tboot_shutdown(TB_SHUTDOWN_HALT); + + stop_this_cpu(NULL); +} + +static void native_machine_power_off(void) +{ + if (pm_power_off) { + if (!reboot_force) + machine_shutdown(); + pm_power_off(); + } + /* A fallback in case there is no PM info available */ + tboot_shutdown(TB_SHUTDOWN_HALT); +} + +struct machine_ops machine_ops __ro_after_init = { + .power_off = native_machine_power_off, + .shutdown = native_machine_shutdown, + .emergency_restart = native_machine_emergency_restart, + .restart = native_machine_restart, + .halt = native_machine_halt, +#ifdef CONFIG_KEXEC_CORE + .crash_shutdown = native_machine_crash_shutdown, +#endif +}; + +void machine_power_off(void) +{ + machine_ops.power_off(); +} + +void machine_shutdown(void) +{ + machine_ops.shutdown(); +} + +void machine_emergency_restart(void) +{ + __machine_emergency_restart(1); +} + +void machine_restart(char *cmd) +{ + machine_ops.restart(cmd); +} + +void machine_halt(void) +{ + machine_ops.halt(); +} + +#ifdef CONFIG_KEXEC_CORE +void machine_crash_shutdown(struct pt_regs *regs) +{ + machine_ops.crash_shutdown(regs); +} +#endif + + +/* This is the CPU performing the emergency shutdown work. */ +int crashing_cpu = -1; + +/* + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during + * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if + * GIF=0, i.e. if the crash occurred between CLGI and STGI. + */ +void cpu_emergency_disable_virtualization(void) +{ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); +} + +#if defined(CONFIG_SMP) + +static nmi_shootdown_cb shootdown_callback; + +static atomic_t waiting_for_crash_ipi; +static int crash_ipi_issued; + +static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) +{ + int cpu; + + cpu = raw_smp_processor_id(); + + /* + * Don't do anything if this handler is invoked on crashing cpu. + * Otherwise, system will completely hang. Crashing cpu can get + * an NMI if system was initially booted with nmi_watchdog parameter. + */ + if (cpu == crashing_cpu) + return NMI_HANDLED; + local_irq_disable(); + + if (shootdown_callback) + shootdown_callback(cpu, regs); + + /* + * Prepare the CPU for reboot _after_ invoking the callback so that the + * callback can safely use virtualization instructions, e.g. VMCLEAR. + */ + cpu_emergency_disable_virtualization(); + + atomic_dec(&waiting_for_crash_ipi); + /* Assume hlt works */ + halt(); + for (;;) + cpu_relax(); + + return NMI_HANDLED; +} + +/** + * nmi_shootdown_cpus - Stop other CPUs via NMI + * @callback: Optional callback to be invoked from the NMI handler + * + * The NMI handler on the remote CPUs invokes @callback, if not + * NULL, first and then disables virtualization to ensure that + * INIT is recognized during reboot. + * + * nmi_shootdown_cpus() can only be invoked once. After the first + * invocation all other CPUs are stuck in crash_nmi_callback() and + * cannot respond to a second NMI. + */ +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + unsigned long msecs; + + local_irq_disable(); + + /* + * Avoid certain doom if a shootdown already occurred; re-registering + * the NMI handler will cause list corruption, modifying the callback + * will do who knows what, etc... + */ + if (WARN_ON_ONCE(crash_ipi_issued)) + return; + + /* Make a note of crashing cpu. Will be used in NMI callback. */ + crashing_cpu = safe_smp_processor_id(); + + shootdown_callback = callback; + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + /* Would it be better to replace the trap vector here? */ + if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, + NMI_FLAG_FIRST, "crash")) + return; /* Return what? */ + /* + * Ensure the new callback function is set before sending + * out the NMI + */ + wmb(); + + apic_send_IPI_allbutself(NMI_VECTOR); + + /* Kick CPUs looping in NMI context. */ + WRITE_ONCE(crash_ipi_issued, 1); + + msecs = 1000; /* Wait at most a second for the other cpus to stop */ + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { + mdelay(1); + msecs--; + } + + /* + * Leave the nmi callback set, shootdown is a one-time thing. Clearing + * the callback could result in a NULL pointer dereference if a CPU + * (finally) responds after the timeout expires. + */ +} + +static inline void nmi_shootdown_cpus_on_restart(void) +{ + if (!crash_ipi_issued) + nmi_shootdown_cpus(NULL); +} + +/* + * Check if the crash dumping IPI got issued and if so, call its callback + * directly. This function is used when we have already been in NMI handler. + * It doesn't return. + */ +void run_crash_ipi_callback(struct pt_regs *regs) +{ + if (crash_ipi_issued) + crash_nmi_callback(0, regs); +} + +/* Override the weak function in kernel/panic.c */ +void nmi_panic_self_stop(struct pt_regs *regs) +{ + while (1) { + /* If no CPU is preparing crash dump, we simply loop here. */ + run_crash_ipi_callback(regs); + cpu_relax(); + } +} + +#else /* !CONFIG_SMP */ +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + /* No other CPUs to shoot down */ +} + +static inline void nmi_shootdown_cpus_on_restart(void) { } + +void run_crash_ipi_callback(struct pt_regs *regs) +{ +} +#endif diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c new file mode 100644 index 000000000..b7c0f142d --- /dev/null +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This is a good place to put board specific reboot fixups. + * + * List of supported fixups: + * geode-gx1/cs5530a - Jaya Kumar + * geode-gx/lx/cs5536 - Andres Salomon + * + */ + +#include +#include +#include +#include +#include +#include + +static void cs5530a_warm_reset(struct pci_dev *dev) +{ + /* writing 1 to the reset control register, 0x44 causes the + cs5530a to perform a system warm reset */ + pci_write_config_byte(dev, 0x44, 0x1); + udelay(50); /* shouldn't get here but be safe and spin-a-while */ + return; +} + +static void cs5536_warm_reset(struct pci_dev *dev) +{ + /* writing 1 to the LSB of this MSR causes a hard reset */ + wrmsrl(MSR_DIVIL_SOFT_RESET, 1ULL); + udelay(50); /* shouldn't get here but be safe and spin a while */ +} + +static void rdc321x_reset(struct pci_dev *dev) +{ + unsigned i; + /* Voluntary reset the watchdog timer */ + outl(0x80003840, 0xCF8); + /* Generate a CPU reset on next tick */ + i = inl(0xCFC); + /* Use the minimum timer resolution */ + i |= 0x1600; + outl(i, 0xCFC); + outb(1, 0x92); +} + +static void ce4100_reset(struct pci_dev *dev) +{ + int i; + + for (i = 0; i < 10; i++) { + outb(0x2, 0xcf9); + udelay(50); + } +} + +struct device_fixup { + unsigned int vendor; + unsigned int device; + void (*reboot_fixup)(struct pci_dev *); +}; + +/* + * PCI ids solely used for fixups_table go here + */ +#define PCI_DEVICE_ID_INTEL_CE4100 0x0708 + +static const struct device_fixup fixups_table[] = { +{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, +{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, +{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, +{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, +{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset }, +}; + +/* + * we see if any fixup is available for our current hardware. if there + * is a fixup, we call it and we expect to never return from it. if we + * do return, we keep looking and then eventually fall back to the + * standard mach_reboot on return. + */ +void mach_reboot_fixups(void) +{ + const struct device_fixup *cur; + struct pci_dev *dev; + int i; + + /* we can be called from sysrq-B code. In such a case it is + * prohibited to dig PCI */ + if (in_interrupt()) + return; + + for (i=0; i < ARRAY_SIZE(fixups_table); i++) { + cur = &(fixups_table[i]); + dev = pci_get_device(cur->vendor, cur->device, NULL); + if (!dev) + continue; + + cur->reboot_fixup(dev); + pci_dev_put(dev); + } +} + diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S new file mode 100644 index 000000000..82a839885 --- /dev/null +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2004 Eric Biederman + */ + +#include +#include +#include +#include +#include + +/* + * Must be relocatable PIC code callable as a C function, in particular + * there must be a plain RET and not jump to return thunk. + */ + +#define PTR(x) (x << 2) + +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + +/* Minimal CPU state */ +#define ESP DATA(0x0) +#define CR0 DATA(0x4) +#define CR3 DATA(0x8) +#define CR4 DATA(0xc) + +/* other data */ +#define CP_VA_CONTROL_PAGE DATA(0x10) +#define CP_PA_PGD DATA(0x14) +#define CP_PA_SWAP_PAGE DATA(0x18) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) + + .text +SYM_CODE_START_NOALIGN(relocate_kernel) + /* Save the CPU context, used for jumping back */ + + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + pushf + + movl 20+8(%esp), %ebp /* list of pages */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %esp, ESP(%edi) + movl %cr0, %eax + movl %eax, CR0(%edi) + movl %cr3, %eax + movl %eax, CR3(%edi) + movl %cr4, %eax + movl %eax, CR4(%edi) + + /* read the arguments and say goodbye to the stack */ + movl 20+4(%esp), %ebx /* page_list */ + movl 20+8(%esp), %ebp /* list of pages */ + movl 20+12(%esp), %edx /* start address */ + movl 20+16(%esp), %ecx /* cpu_has_pae */ + movl 20+20(%esp), %esi /* preserve_context */ + + /* zero out flags, and disable interrupts */ + pushl $0 + popfl + + /* save some information for jumping back */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %edi, CP_VA_CONTROL_PAGE(%edi) + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, CP_PA_PGD(%edi) + movl PTR(PA_SWAP_PAGE)(%ebp), %eax + movl %eax, CP_PA_SWAP_PAGE(%edi) + movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) + + /* + * get physical address of control page now + * this is impossible after page table switch + */ + movl PTR(PA_CONTROL_PAGE)(%ebp), %edi + + /* switch to new set of page tables */ + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea PAGE_SIZE(%edi), %esp + + /* jump to identity mapped page */ + movl %edi, %eax + addl $(identity_mapped - relocate_kernel), %eax + pushl %eax + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(relocate_kernel) + +SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) + /* set return address to 0 if not preserving context */ + pushl $0 + /* store the start address on the stack */ + pushl %edx + + /* + * Set cr0 to a known state: + * - Paging disabled + * - Alignment check disabled + * - Write protect disabled + * - No task switch + * - Don't do FP software emulation. + * - Proctected mode enabled + */ + movl %cr0, %eax + andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax + orl $(X86_CR0_PE), %eax + movl %eax, %cr0 + + /* clear cr4 if applicable */ + testl %ecx, %ecx + jz 1f + /* + * Set cr4 to a known state: + * Setting everything to zero seems safe. + */ + xorl %eax, %eax + movl %eax, %cr4 + + jmp 1f +1: + + /* Flush the TLB (needed?) */ + xorl %eax, %eax + movl %eax, %cr3 + + movl CP_PA_SWAP_PAGE(%edi), %eax + pushl %eax + pushl %ebx + call swap_pages + addl $8, %esp + + /* + * To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* + * set all of the registers to known values + * leave %esp alone + */ + + testl %esi, %esi + jnz 1f + xorl %edi, %edi + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %ebp, %ebp + ANNOTATE_UNRET_SAFE + ret + int3 +1: + popl %edx + movl CP_PA_SWAP_PAGE(%edi), %esp + addl $PAGE_SIZE, %esp +2: + ANNOTATE_RETPOLINE_SAFE + call *%edx + + /* get the re-entry point of the peer system */ + movl 0(%esp), %ebp + call 1f +1: + popl %ebx + subl $(1b - relocate_kernel), %ebx + movl CP_VA_CONTROL_PAGE(%ebx), %edi + lea PAGE_SIZE(%ebx), %esp + movl CP_PA_SWAP_PAGE(%ebx), %eax + movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx + pushl %eax + pushl %edx + call swap_pages + addl $8, %esp + movl CP_PA_PGD(%ebx), %eax + movl %eax, %cr3 + movl %cr0, %eax + orl $X86_CR0_PG, %eax + movl %eax, %cr0 + lea PAGE_SIZE(%edi), %esp + movl %edi, %eax + addl $(virtual_mapped - relocate_kernel), %eax + pushl %eax + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(identity_mapped) + +SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) + movl CR4(%edi), %eax + movl %eax, %cr4 + movl CR3(%edi), %eax + movl %eax, %cr3 + movl CR0(%edi), %eax + movl %eax, %cr0 + movl ESP(%edi), %esp + movl %ebp, %eax + + popf + popl %ebp + popl %edi + popl %esi + popl %ebx + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(virtual_mapped) + + /* Do the copies */ +SYM_CODE_START_LOCAL_NOALIGN(swap_pages) + movl 8(%esp), %edx + movl 4(%esp), %ecx + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + movl %ecx, %ebx + jmp 1f + +0: /* top, read another word from the indirection page */ + movl (%ebx), %ecx + addl $4, %ebx +1: + testb $0x1, %cl /* is it a destination page */ + jz 2f + movl %ecx, %edi + andl $0xfffff000, %edi + jmp 0b +2: + testb $0x2, %cl /* is it an indirection page */ + jz 2f + movl %ecx, %ebx + andl $0xfffff000, %ebx + jmp 0b +2: + testb $0x4, %cl /* is it the done indicator */ + jz 2f + jmp 3f +2: + testb $0x8, %cl /* is it the source indicator */ + jz 0b /* Ignore it otherwise */ + movl %ecx, %esi /* For every source page do a copy */ + andl $0xfffff000, %esi + + movl %edi, %eax + movl %esi, %ebp + + movl %edx, %edi + movl $1024, %ecx + rep ; movsl + + movl %ebp, %edi + movl %eax, %esi + movl $1024, %ecx + rep ; movsl + + movl %eax, %edi + movl %edx, %esi + movl $1024, %ecx + rep ; movsl + + lea PAGE_SIZE(%ebp), %esi + jmp 0b +3: + popl %esi + popl %edi + popl %ebx + popl %ebp + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(swap_pages) + + .globl kexec_control_code_size +.set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S new file mode 100644 index 000000000..d2b7d212c --- /dev/null +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2005 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Must be relocatable PIC code callable as a C function, in particular + * there must be a plain RET and not jump to return thunk. + */ + +#define PTR(x) (x << 3) +#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) + +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + +/* Minimal CPU state */ +#define RSP DATA(0x0) +#define CR0 DATA(0x8) +#define CR3 DATA(0x10) +#define CR4 DATA(0x18) + +/* other data */ +#define CP_PA_TABLE_PAGE DATA(0x20) +#define CP_PA_SWAP_PAGE DATA(0x28) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) + + .text + .align PAGE_SIZE + .code64 +SYM_CODE_START_NOALIGN(relocate_kernel) + UNWIND_HINT_EMPTY + /* + * %rdi indirection_page + * %rsi page_list + * %rdx start address + * %rcx preserve_context + * %r8 sme_active + */ + + /* Save the CPU context, used for jumping back */ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushf + + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 + movq %rsp, RSP(%r11) + movq %cr0, %rax + movq %rax, CR0(%r11) + movq %cr3, %rax + movq %rax, CR3(%r11) + movq %cr4, %rax + movq %rax, CR4(%r11) + + /* Save CR4. Required to enable the right paging mode later. */ + movq %rax, %r13 + + /* zero out flags, and disable interrupts */ + pushq $0 + popfq + + /* Save SME active flag */ + movq %r8, %r12 + + /* + * get physical address of control page now + * this is impossible after page table switch + */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + + /* get physical address of page table now too */ + movq PTR(PA_TABLE_PAGE)(%rsi), %r9 + + /* get physical address of swap page now */ + movq PTR(PA_SWAP_PAGE)(%rsi), %r10 + + /* save some information for jumping back */ + movq %r9, CP_PA_TABLE_PAGE(%r11) + movq %r10, CP_PA_SWAP_PAGE(%r11) + movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) + + /* Switch to the identity mapped page tables */ + movq %r9, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea PAGE_SIZE(%r8), %rsp + + /* jump to identity mapped page */ + addq $(identity_mapped - relocate_kernel), %r8 + pushq %r8 + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(relocate_kernel) + +SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) + UNWIND_HINT_EMPTY + /* set return address to 0 if not preserving context */ + pushq $0 + /* store the start address on the stack */ + pushq %rdx + + /* + * Set cr0 to a known state: + * - Paging enabled + * - Alignment check disabled + * - Write protect disabled + * - No task switch + * - Don't do FP software emulation. + * - Proctected mode enabled + */ + movq %cr0, %rax + andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax + orl $(X86_CR0_PG | X86_CR0_PE), %eax + movq %rax, %cr0 + + /* + * Set cr4 to a known state: + * - physical address extension enabled + * - 5-level paging, if it was enabled before + */ + movl $X86_CR4_PAE, %eax + testq $X86_CR4_LA57, %r13 + jz 1f + orl $X86_CR4_LA57, %eax +1: + movq %rax, %cr4 + + jmp 1f +1: + + /* Flush the TLB (needed?) */ + movq %r9, %cr3 + + /* + * If SME is active, there could be old encrypted cache line + * entries that will conflict with the now unencrypted memory + * used by kexec. Flush the caches before copying the kernel. + */ + testq %r12, %r12 + jz 1f + wbinvd +1: + + movq %rcx, %r11 + call swap_pages + + /* + * To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. + */ + movq %cr3, %rax + movq %rax, %cr3 + + /* + * set all of the registers to known values + * leave %rsp alone + */ + + testq %r11, %r11 + jnz 1f + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + ANNOTATE_UNRET_SAFE + ret + int3 + +1: + popq %rdx + leaq PAGE_SIZE(%r10), %rsp + ANNOTATE_RETPOLINE_SAFE + call *%rdx + + /* get the re-entry point of the peer system */ + movq 0(%rsp), %rbp + leaq relocate_kernel(%rip), %r8 + movq CP_PA_SWAP_PAGE(%r8), %r10 + movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi + movq CP_PA_TABLE_PAGE(%r8), %rax + movq %rax, %cr3 + lea PAGE_SIZE(%r8), %rsp + call swap_pages + movq $virtual_mapped, %rax + pushq %rax + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(identity_mapped) + +SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) + UNWIND_HINT_EMPTY + movq RSP(%r8), %rsp + movq CR4(%r8), %rax + movq %rax, %cr4 + movq CR3(%r8), %rax + movq CR0(%r8), %r8 + movq %rax, %cr3 + movq %r8, %cr0 + movq %rbp, %rax + + popf + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(virtual_mapped) + + /* Do the copies */ +SYM_CODE_START_LOCAL_NOALIGN(swap_pages) + UNWIND_HINT_EMPTY + movq %rdi, %rcx /* Put the page_list in %rcx */ + xorl %edi, %edi + xorl %esi, %esi + jmp 1f + +0: /* top, read another word for the indirection page */ + + movq (%rbx), %rcx + addq $8, %rbx +1: + testb $0x1, %cl /* is it a destination page? */ + jz 2f + movq %rcx, %rdi + andq $0xfffffffffffff000, %rdi + jmp 0b +2: + testb $0x2, %cl /* is it an indirection page? */ + jz 2f + movq %rcx, %rbx + andq $0xfffffffffffff000, %rbx + jmp 0b +2: + testb $0x4, %cl /* is it the done indicator? */ + jz 2f + jmp 3f +2: + testb $0x8, %cl /* is it the source indicator? */ + jz 0b /* Ignore it otherwise */ + movq %rcx, %rsi /* For ever source page do a copy */ + andq $0xfffffffffffff000, %rsi + + movq %rdi, %rdx + movq %rsi, %rax + + movq %r10, %rdi + movl $512, %ecx + rep ; movsq + + movq %rax, %rdi + movq %rdx, %rsi + movl $512, %ecx + rep ; movsq + + movq %rdx, %rdi + movq %r10, %rsi + movl $512, %ecx + rep ; movsq + + lea PAGE_SIZE(%rax), %rsi + jmp 0b +3: + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(swap_pages) + + .globl kexec_control_code_size +.set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c new file mode 100644 index 000000000..9b9fb7882 --- /dev/null +++ b/arch/x86/kernel/resource.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +static void resource_clip(struct resource *res, resource_size_t start, + resource_size_t end) +{ + resource_size_t low = 0, high = 0; + + if (res->end < start || res->start > end) + return; /* no conflict */ + + if (res->start < start) + low = start - res->start; + + if (res->end > end) + high = res->end - end; + + /* Keep the area above or below the conflict, whichever is larger */ + if (low > high) + res->end = start - 1; + else + res->start = end + 1; +} + +static void remove_e820_regions(struct resource *avail) +{ + int i; + struct e820_entry *entry; + + for (i = 0; i < e820_table->nr_entries; i++) { + entry = &e820_table->entries[i]; + + resource_clip(avail, entry->addr, + entry->addr + entry->size - 1); + } +} + +void arch_remove_reservations(struct resource *avail) +{ + /* + * Trim out BIOS area (high 2MB) and E820 regions. We do not remove + * the low 1MB unconditionally, as this area is needed for some ISA + * cards requiring a memory range, e.g. the i82365 PCMCIA controller. + */ + if (avail->flags & IORESOURCE_MEM) { + resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); + + remove_e820_regions(avail); + } +} diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c new file mode 100644 index 000000000..586f718b8 --- /dev/null +++ b/arch/x86/kernel/rtc.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RTC related functions + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_32 +/* + * This is a special lock that is owned by the CPU and holds the index + * register we are working with. It is required for NMI access to the + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. + */ +volatile unsigned long cmos_lock; +EXPORT_SYMBOL(cmos_lock); +#endif /* CONFIG_X86_32 */ + +/* For two digit years assume time is always after that */ +#define CMOS_YEARS_OFFS 2000 + +DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); + +/* + * In order to set the CMOS clock precisely, set_rtc_mmss has to be + * called 500 ms after the second nowtime has started, because when + * nowtime is written into the registers of the CMOS clock, it will + * jump to the next second precisely 500 ms later. Check the Motorola + * MC146818A or Dallas DS12887 data sheet for details. + */ +int mach_set_rtc_mmss(const struct timespec64 *now) +{ + unsigned long long nowtime = now->tv_sec; + struct rtc_time tm; + int retval = 0; + + rtc_time64_to_tm(nowtime, &tm); + if (!rtc_valid_tm(&tm)) { + retval = mc146818_set_time(&tm); + if (retval) + printk(KERN_ERR "%s: RTC write failed with error %d\n", + __func__, retval); + } else { + printk(KERN_ERR + "%s: Invalid RTC value: write of %llx to RTC failed\n", + __func__, nowtime); + retval = -EINVAL; + } + return retval; +} + +void mach_get_cmos_time(struct timespec64 *now) +{ + unsigned int status, year, mon, day, hour, min, sec, century = 0; + unsigned long flags; + + /* + * If pm_trace abused the RTC as storage, set the timespec to 0, + * which tells the caller that this RTC value is unusable. + */ + if (!pm_trace_rtc_valid()) { + now->tv_sec = now->tv_nsec = 0; + return; + } + + spin_lock_irqsave(&rtc_lock, flags); + + /* + * If UIP is clear, then we have >= 244 microseconds before + * RTC registers will be updated. Spec sheet says that this + * is the reliable way to read RTC - registers. If UIP is set + * then the register access might be invalid. + */ + while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) + cpu_relax(); + + sec = CMOS_READ(RTC_SECONDS); + min = CMOS_READ(RTC_MINUTES); + hour = CMOS_READ(RTC_HOURS); + day = CMOS_READ(RTC_DAY_OF_MONTH); + mon = CMOS_READ(RTC_MONTH); + year = CMOS_READ(RTC_YEAR); + +#ifdef CONFIG_ACPI + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && + acpi_gbl_FADT.century) + century = CMOS_READ(acpi_gbl_FADT.century); +#endif + + status = CMOS_READ(RTC_CONTROL); + WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); + + spin_unlock_irqrestore(&rtc_lock, flags); + + if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { + sec = bcd2bin(sec); + min = bcd2bin(min); + hour = bcd2bin(hour); + day = bcd2bin(day); + mon = bcd2bin(mon); + year = bcd2bin(year); + } + + if (century) { + century = bcd2bin(century); + year += century * 100; + } else + year += CMOS_YEARS_OFFS; + + now->tv_sec = mktime64(year, mon, day, hour, min, sec); + now->tv_nsec = 0; +} + +/* Routines for accessing the CMOS RAM/RTC. */ +unsigned char rtc_cmos_read(unsigned char addr) +{ + unsigned char val; + + lock_cmos_prefix(addr); + outb(addr, RTC_PORT(0)); + val = inb(RTC_PORT(1)); + lock_cmos_suffix(addr); + + return val; +} +EXPORT_SYMBOL(rtc_cmos_read); + +void rtc_cmos_write(unsigned char val, unsigned char addr) +{ + lock_cmos_prefix(addr); + outb(addr, RTC_PORT(0)); + outb(val, RTC_PORT(1)); + lock_cmos_suffix(addr); +} +EXPORT_SYMBOL(rtc_cmos_write); + +int update_persistent_clock64(struct timespec64 now) +{ + return x86_platform.set_wallclock(&now); +} + +/* not static: needed by APM */ +void read_persistent_clock64(struct timespec64 *ts) +{ + x86_platform.get_wallclock(ts); +} + + +static struct resource rtc_resources[] = { + [0] = { + .start = RTC_PORT(0), + .end = RTC_PORT(1), + .flags = IORESOURCE_IO, + }, + [1] = { + .start = RTC_IRQ, + .end = RTC_IRQ, + .flags = IORESOURCE_IRQ, + } +}; + +static struct platform_device rtc_device = { + .name = "rtc_cmos", + .id = -1, + .resource = rtc_resources, + .num_resources = ARRAY_SIZE(rtc_resources), +}; + +static __init int add_rtc_cmos(void) +{ +#ifdef CONFIG_PNP + static const char * const ids[] __initconst = + { "PNP0b00", "PNP0b01", "PNP0b02", }; + struct pnp_dev *dev; + struct pnp_id *id; + int i; + + pnp_for_each_dev(dev) { + for (id = dev->id; id; id = id->next) { + for (i = 0; i < ARRAY_SIZE(ids); i++) { + if (compare_pnp_id(id, ids[i]) != 0) + return 0; + } + } + } +#endif + if (!x86_platform.legacy.rtc) + return -ENODEV; + + platform_device_register(&rtc_device); + dev_info(&rtc_device.dev, + "registered platform RTC device (no PNP device found)\n"); + + return 0; +} +device_initcall(add_rtc_cmos); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c new file mode 100644 index 000000000..e9b483c6f --- /dev/null +++ b/arch/x86/kernel/setup.c @@ -0,0 +1,1278 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1995 Linus Torvalds + * + * This file contains the setup_arch() code, which handles the architecture-dependent + * parts of early kernel initialization. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * max_low_pfn_mapped: highest directly mapped pfn < 4 GB + * max_pfn_mapped: highest directly mapped pfn > 4 GB + * + * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are + * represented by pfn_mapped[]. + */ +unsigned long max_low_pfn_mapped; +unsigned long max_pfn_mapped; + +#ifdef CONFIG_DMI +RESERVE_BRK(dmi_alloc, 65536); +#endif + + +unsigned long _brk_start = (unsigned long)__brk_base; +unsigned long _brk_end = (unsigned long)__brk_base; + +struct boot_params boot_params; + +/* + * These are the four main kernel memory regions, we put them into + * the resource tree so that kdump tools and other debugging tools + * recover it: + */ + +static struct resource rodata_resource = { + .name = "Kernel rodata", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + +static struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + +static struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + +static struct resource bss_resource = { + .name = "Kernel bss", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + + +#ifdef CONFIG_X86_32 +/* CPU data as detected by the assembly code in head_32.S */ +struct cpuinfo_x86 new_cpu_data; + +/* Common CPU data for all CPUs */ +struct cpuinfo_x86 boot_cpu_data __read_mostly; +EXPORT_SYMBOL(boot_cpu_data); + +unsigned int def_to_bigsmp; + +/* For MCA, but anyone else can use it if they want */ +unsigned int machine_id; +unsigned int machine_submodel_id; +unsigned int BIOS_revision; + +struct apm_info apm_info; +EXPORT_SYMBOL(apm_info); + +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ + defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) +struct ist_info ist_info; +EXPORT_SYMBOL(ist_info); +#else +struct ist_info ist_info; +#endif + +#else +struct cpuinfo_x86 boot_cpu_data __read_mostly; +EXPORT_SYMBOL(boot_cpu_data); +#endif + + +#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) +__visible unsigned long mmu_cr4_features __ro_after_init; +#else +__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE; +#endif + +/* Boot loader ID and version as integers, for the benefit of proc_dointvec */ +int bootloader_type, bootloader_version; + +/* + * Setup options + */ +struct screen_info screen_info; +EXPORT_SYMBOL(screen_info); +struct edid_info edid_info; +EXPORT_SYMBOL_GPL(edid_info); + +extern int root_mountflags; + +unsigned long saved_video_mode; + +#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_PROMPT_FLAG 0x8000 +#define RAMDISK_LOAD_FLAG 0x4000 + +static char __initdata command_line[COMMAND_LINE_SIZE]; +#ifdef CONFIG_CMDLINE_BOOL +static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +#endif + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) +struct edd edd; +#ifdef CONFIG_EDD_MODULE +EXPORT_SYMBOL(edd); +#endif +/** + * copy_edd() - Copy the BIOS EDD information + * from boot_params into a safe place. + * + */ +static inline void __init copy_edd(void) +{ + memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, + sizeof(edd.mbr_signature)); + memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info)); + edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries; + edd.edd_info_nr = boot_params.eddbuf_entries; +} +#else +static inline void __init copy_edd(void) +{ +} +#endif + +void * __init extend_brk(size_t size, size_t align) +{ + size_t mask = align - 1; + void *ret; + + BUG_ON(_brk_start == 0); + BUG_ON(align & mask); + + _brk_end = (_brk_end + mask) & ~mask; + BUG_ON((char *)(_brk_end + size) > __brk_limit); + + ret = (void *)_brk_end; + _brk_end += size; + + memset(ret, 0, size); + + return ret; +} + +#ifdef CONFIG_X86_32 +static void __init cleanup_highmap(void) +{ +} +#endif + +static void __init reserve_brk(void) +{ + if (_brk_end > _brk_start) + memblock_reserve(__pa_symbol(_brk_start), + _brk_end - _brk_start); + + /* Mark brk area as locked down and no longer taking any + new allocations */ + _brk_start = 0; +} + +u64 relocated_ramdisk; + +#ifdef CONFIG_BLK_DEV_INITRD + +static u64 __init get_ramdisk_image(void) +{ + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + + ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32; + + if (ramdisk_image == 0) + ramdisk_image = phys_initrd_start; + + return ramdisk_image; +} +static u64 __init get_ramdisk_size(void) +{ + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + + ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32; + + if (ramdisk_size == 0) + ramdisk_size = phys_initrd_size; + + return ramdisk_size; +} + +static void __init relocate_initrd(void) +{ + /* Assume only end is not page aligned */ + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); + u64 area_size = PAGE_ALIGN(ramdisk_size); + + /* We need to move the initrd down into directly mapped mem */ + relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); + if (!relocated_ramdisk) + panic("Cannot find place for new RAMDISK of size %lld\n", + ramdisk_size); + + initrd_start = relocated_ramdisk + PAGE_OFFSET; + initrd_end = initrd_start + ramdisk_size; + printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", + relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); + + copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); + + printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" + " [mem %#010llx-%#010llx]\n", + ramdisk_image, ramdisk_image + ramdisk_size - 1, + relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); +} + +static void __init early_reserve_initrd(void) +{ + /* Assume only end is not page aligned */ + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); +} + +static void __init reserve_initrd(void) +{ + /* Assume only end is not page aligned */ + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + initrd_start = 0; + + printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, + ramdisk_end - 1); + + if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image), + PFN_DOWN(ramdisk_end))) { + /* All are mapped, easy case */ + initrd_start = ramdisk_image + PAGE_OFFSET; + initrd_end = initrd_start + ramdisk_size; + return; + } + + relocate_initrd(); + + memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); +} + +#else +static void __init early_reserve_initrd(void) +{ +} +static void __init reserve_initrd(void) +{ +} +#endif /* CONFIG_BLK_DEV_INITRD */ + +static void __init parse_setup_data(void) +{ + struct setup_data *data; + u64 pa_data, pa_next; + + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + u32 data_len, data_type; + + data = early_memremap(pa_data, sizeof(*data)); + data_len = data->len + sizeof(struct setup_data); + data_type = data->type; + pa_next = data->next; + early_memunmap(data, sizeof(*data)); + + switch (data_type) { + case SETUP_E820_EXT: + e820__memory_setup_extended(pa_data, data_len); + break; + case SETUP_DTB: + add_dtb(pa_data); + break; + case SETUP_EFI: + parse_efi_setup(pa_data, data_len); + break; + default: + break; + } + pa_data = pa_next; + } +} + +static void __init memblock_x86_reserve_range_setup_data(void) +{ + struct setup_indirect *indirect; + struct setup_data *data; + u64 pa_data, pa_next; + u32 len; + + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_memremap(pa_data, sizeof(*data)); + if (!data) { + pr_warn("setup: failed to memremap setup_data entry\n"); + return; + } + + len = sizeof(*data); + pa_next = data->next; + + memblock_reserve(pa_data, sizeof(*data) + data->len); + + if (data->type == SETUP_INDIRECT) { + len += data->len; + early_memunmap(data, sizeof(*data)); + data = early_memremap(pa_data, len); + if (!data) { + pr_warn("setup: failed to memremap indirect setup_data\n"); + return; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) + memblock_reserve(indirect->addr, indirect->len); + } + + pa_data = pa_next; + early_memunmap(data, len); + } +} + +/* + * --------- Crashkernel reservation ------------------------------ + */ + +#ifdef CONFIG_KEXEC_CORE + +/* 16M alignment for crash kernel regions */ +#define CRASH_ALIGN SZ_16M + +/* + * Keep the crash kernel below this limit. + * + * Earlier 32-bits kernels would limit the kernel to the low 512 MB range + * due to mapping restrictions. + * + * 64-bit kdump kernels need to be restricted to be under 64 TB, which is + * the upper limit of system RAM in 4-level paging mode. Since the kdump + * jump could be from 5-level paging to 4-level paging, the jump will fail if + * the kernel is put above 64 TB, and during the 1st kernel bootup there's + * no good way to detect the paging mode of the target kernel which will be + * loaded for dumping. + */ +#ifdef CONFIG_X86_32 +# define CRASH_ADDR_LOW_MAX SZ_512M +# define CRASH_ADDR_HIGH_MAX SZ_512M +#else +# define CRASH_ADDR_LOW_MAX SZ_4G +# define CRASH_ADDR_HIGH_MAX SZ_64T +#endif + +static int __init reserve_crashkernel_low(void) +{ +#ifdef CONFIG_X86_64 + unsigned long long base, low_base = 0, low_size = 0; + unsigned long low_mem_limit; + int ret; + + low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX); + + /* crashkernel=Y,low */ + ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base); + if (ret) { + /* + * two parts from kernel/dma/swiotlb.c: + * -swiotlb size: user-specified with swiotlb= or default. + * + * -swiotlb overflow buffer: now hardcoded to 32k. We round it + * to 8M for other buffers that may need to stay low too. Also + * make sure we allocate enough extra low memory so that we + * don't run out of DMA buffers for 32-bit devices. + */ + low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); + } else { + /* passed with crashkernel=0,low ? */ + if (!low_size) + return 0; + } + + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); + if (!low_base) { + pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", + (unsigned long)(low_size >> 20)); + return -ENOMEM; + } + + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n", + (unsigned long)(low_size >> 20), + (unsigned long)(low_base >> 20), + (unsigned long)(low_mem_limit >> 20)); + + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); +#endif + return 0; +} + +static void __init reserve_crashkernel(void) +{ + unsigned long long crash_size, crash_base, total_mem; + bool high = false; + int ret; + + total_mem = memblock_phys_mem_size(); + + /* crashkernel=XM */ + ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); + if (ret != 0 || crash_size <= 0) { + /* crashkernel=X,high */ + ret = parse_crashkernel_high(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret != 0 || crash_size <= 0) + return; + high = true; + } + + if (xen_pv_domain()) { + pr_info("Ignoring crashkernel for a Xen PV domain\n"); + return; + } + + /* 0 means: find the address automatically */ + if (!crash_base) { + /* + * Set CRASH_ADDR_LOW_MAX upper bound for crash memory, + * crashkernel=x,high reserves memory over 4G, also allocates + * 256M extra low memory for DMA buffers and swiotlb. + * But the extra memory is not required for all machines. + * So try low memory first and fall back to high memory + * unless "crashkernel=size[KMG],high" is specified. + */ + if (!high) + crash_base = memblock_phys_alloc_range(crash_size, + CRASH_ALIGN, CRASH_ALIGN, + CRASH_ADDR_LOW_MAX); + if (!crash_base) + crash_base = memblock_phys_alloc_range(crash_size, + CRASH_ALIGN, CRASH_ALIGN, + CRASH_ADDR_HIGH_MAX); + if (!crash_base) { + pr_info("crashkernel reservation failed - No suitable area found.\n"); + return; + } + } else { + unsigned long long start; + + start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base, + crash_base + crash_size); + if (start != crash_base) { + pr_info("crashkernel reservation failed - memory is in use.\n"); + return; + } + } + + if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { + memblock_free(crash_base, crash_size); + return; + } + + pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + insert_resource(&iomem_resource, &crashk_res); +} +#else +static void __init reserve_crashkernel(void) +{ +} +#endif + +static struct resource standard_io_resources[] = { + { .name = "dma1", .start = 0x00, .end = 0x1f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic1", .start = 0x20, .end = 0x21, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer0", .start = 0x40, .end = 0x43, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer1", .start = 0x50, .end = 0x53, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x60, .end = 0x60, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x64, .end = 0x64, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma page reg", .start = 0x80, .end = 0x8f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic2", .start = 0xa0, .end = 0xa1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma2", .start = 0xc0, .end = 0xdf, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "fpu", .start = 0xf0, .end = 0xff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO } +}; + +void __init reserve_standard_io_resources(void) +{ + int i; + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + +} + +static __init void reserve_ibft_region(void) +{ + unsigned long addr, size = 0; + + addr = find_ibft_region(&size); + + if (size) + memblock_reserve(addr, size); +} + +static bool __init snb_gfx_workaround_needed(void) +{ +#ifdef CONFIG_PCI + int i; + u16 vendor, devid; + static const __initconst u16 snb_ids[] = { + 0x0102, + 0x0112, + 0x0122, + 0x0106, + 0x0116, + 0x0126, + 0x010a, + }; + + /* Assume no if something weird is going on with PCI */ + if (!early_pci_allowed()) + return false; + + vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID); + if (vendor != 0x8086) + return false; + + devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID); + for (i = 0; i < ARRAY_SIZE(snb_ids); i++) + if (devid == snb_ids[i]) + return true; +#endif + + return false; +} + +/* + * Sandy Bridge graphics has trouble with certain ranges, exclude + * them from allocation. + */ +static void __init trim_snb_memory(void) +{ + static const __initconst unsigned long bad_pages[] = { + 0x20050000, + 0x20110000, + 0x20130000, + 0x20138000, + 0x40004000, + }; + int i; + + if (!snb_gfx_workaround_needed()) + return; + + printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n"); + + /* + * Reserve all memory below the 1 MB mark that has not + * already been reserved. + */ + memblock_reserve(0, 1<<20); + + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { + if (memblock_reserve(bad_pages[i], PAGE_SIZE)) + printk(KERN_WARNING "failed to reserve 0x%08lx\n", + bad_pages[i]); + } +} + +/* + * Here we put platform-specific memory range workarounds, i.e. + * memory known to be corrupt or otherwise in need to be reserved on + * specific platforms. + * + * If this gets used more widely it could use a real dispatch mechanism. + */ +static void __init trim_platform_memory_ranges(void) +{ + trim_snb_memory(); +} + +static void __init trim_bios_range(void) +{ + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + * + * This typically reserves additional memory (64KiB by default) + * since some BIOSes are known to corrupt low memory. See the + * Kconfig help text for X86_RESERVE_LOW. + */ + e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + + /* + * special case: Some BIOSes report the PC BIOS + * area (640Kb -> 1Mb) as RAM even though it is not. + * take them out. + */ + e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1); + + e820__update_table(e820_table); +} + +/* called before trim_bios_range() to spare extra sanitize */ +static void __init e820_add_kernel_range(void) +{ + u64 start = __pa_symbol(_text); + u64 size = __pa_symbol(_end) - start; + + /* + * Complain if .text .data and .bss are not marked as E820_TYPE_RAM and + * attempt to fix it by adding the range. We may have a confused BIOS, + * or the user may have used memmap=exactmap or memmap=xxM$yyM to + * exclude kernel range. If we really are running on top non-RAM, + * we will crash later anyways. + */ + if (e820__mapped_all(start, start + size, E820_TYPE_RAM)) + return; + + pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n"); + e820__range_remove(start, size, E820_TYPE_RAM, 0); + e820__range_add(start, size, E820_TYPE_RAM); +} + +static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; + +static int __init parse_reservelow(char *p) +{ + unsigned long long size; + + if (!p) + return -EINVAL; + + size = memparse(p, &p); + + if (size < 4096) + size = 4096; + + if (size > 640*1024) + size = 640*1024; + + reserve_low = size; + + return 0; +} + +early_param("reservelow", parse_reservelow); + +static void __init trim_low_memory_range(void) +{ + memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); +} + +/* + * Dump out kernel offset information on panic. + */ +static int +dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) +{ + if (kaslr_enabled()) { + pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", + kaslr_offset(), + __START_KERNEL, + __START_KERNEL_map, + MODULES_VADDR-1); + } else { + pr_emerg("Kernel Offset: disabled\n"); + } + + return 0; +} + +/* + * Determine if we were loaded by an EFI loader. If so, then we have also been + * passed the efi memmap, systab, etc., so we should use these data structures + * for initialization. Note, the efi init code path is determined by the + * global efi_enabled. This allows the same kernel image to be used on existing + * systems (with a traditional BIOS) as well as on EFI systems. + */ +/* + * setup_arch - architecture-specific boot-time initializations + * + * Note: On x86_64, fixmaps are ready for use even before this is called. + */ + +void __init setup_arch(char **cmdline_p) +{ + /* + * Reserve the memory occupied by the kernel between _text and + * __end_of_kernel_reserve symbols. Any kernel sections after the + * __end_of_kernel_reserve symbol must be explicitly reserved with a + * separate memblock_reserve() or they will be discarded. + */ + memblock_reserve(__pa_symbol(_text), + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); + + /* + * Make sure page 0 is always reserved because on systems with + * L1TF its contents can be leaked to user processes. + */ + memblock_reserve(0, PAGE_SIZE); + + early_reserve_initrd(); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + +#ifdef CONFIG_X86_32 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); + + /* + * copy kernel address range established so far and switch + * to the proper swapper page table + */ + clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY, + initial_page_table + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + load_cr3(swapper_pg_dir); + /* + * Note: Quark X1000 CPUs advertise PGE incorrectly and require + * a cr3 based tlb flush, so the following __flush_tlb_all() + * will not flush anything because the CPU quirk which clears + * X86_FEATURE_PGE has not been invoked yet. Though due to the + * load_cr3() above the TLB has been flushed already. The + * quirk is invoked before subsequent calls to __flush_tlb_all() + * so proper operation is guaranteed. + */ + __flush_tlb_all(); +#else + printk(KERN_INFO "Command line: %s\n", boot_command_line); + boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS; +#endif + + /* + * If we have OLPC OFW, we might end up relocating the fixmap due to + * reserve_top(), so do this before touching the ioremap area. + */ + olpc_ofw_detect(); + + idt_setup_early_traps(); + early_cpu_init(); + arch_init_ideal_nops(); + jump_label_init(); + static_call_init(); + early_ioremap_init(); + + setup_olpc_ofw_pgd(); + + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); + screen_info = boot_params.screen_info; + edid_info = boot_params.edid_info; +#ifdef CONFIG_X86_32 + apm_info.bios = boot_params.apm_bios_info; + ist_info = boot_params.ist_info; +#endif + saved_video_mode = boot_params.hdr.vid_mode; + bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; +#endif +#ifdef CONFIG_EFI + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + EFI32_LOADER_SIGNATURE, 4)) { + set_bit(EFI_BOOT, &efi.flags); + } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + EFI64_LOADER_SIGNATURE, 4)) { + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); + } +#endif + + x86_init.oem.arch_setup(); + + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; + e820__memory_setup(); + parse_setup_data(); + + copy_edd(); + + if (!boot_params.hdr.root_flags) + root_mountflags &= ~MS_RDONLY; + init_mm.start_code = (unsigned long) _text; + init_mm.end_code = (unsigned long) _etext; + init_mm.end_data = (unsigned long) _edata; + init_mm.brk = _brk_end; + + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext)-1; + rodata_resource.start = __pa_symbol(__start_rodata); + rodata_resource.end = __pa_symbol(__end_rodata)-1; + data_resource.start = __pa_symbol(_sdata); + data_resource.end = __pa_symbol(_edata)-1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop)-1; + +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } +#endif +#endif + + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + /* + * x86_configure_nx() is called before parse_early_param() to detect + * whether hardware doesn't support NX (so that the early EHCI debug + * console setup can safely call set_fixmap()). It may then be called + * again from within noexec_setup() during parsing early parameters + * to honor the respective command line option. + */ + x86_configure_nx(); + + parse_early_param(); + + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. + */ + if (movable_node_is_enabled()) + memblock_set_bottom_up(true); +#endif + + x86_report_nx(); + + /* after early param, so could get panic from serial */ + memblock_x86_reserve_range_setup_data(); + + if (acpi_mps_check()) { +#ifdef CONFIG_X86_LOCAL_APIC + disable_apic = 1; +#endif + setup_clear_cpu_cap(X86_FEATURE_APIC); + } + + e820__reserve_setup_data(); + e820__finish_early_params(); + + if (efi_enabled(EFI_BOOT)) + efi_init(); + + dmi_setup(); + + /* + * VMware detection requires dmi to be available, so this + * needs to be done after dmi_setup(), for the boot CPU. + */ + init_hypervisor_platform(); + + tsc_early_init(); + x86_init.resources.probe_roms(); + + /* after parse_early_param, so could debug it */ + insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &rodata_resource); + insert_resource(&iomem_resource, &data_resource); + insert_resource(&iomem_resource, &bss_resource); + + e820_add_kernel_range(); + trim_bios_range(); +#ifdef CONFIG_X86_32 + if (ppro_with_ram_bug()) { + e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM, + E820_TYPE_RESERVED); + e820__update_table(e820_table); + printk(KERN_INFO "fixed physical RAM map:\n"); + e820__print_table("bad_ppro"); + } +#else + early_gart_iommu_check(); +#endif + + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + max_pfn = e820__end_of_ram_pfn(); + + /* update e820 for memory not covered by WB MTRRs */ + mtrr_bp_init(); + if (mtrr_trim_uncached_memory(max_pfn)) + max_pfn = e820__end_of_ram_pfn(); + + max_possible_pfn = max_pfn; + + /* + * This call is required when the CPU does not support PAT. If + * mtrr_bp_init() invoked it already via pat_init() the call has no + * effect. + */ + init_cache_modes(); + + /* + * Define random base addresses for memory sections after max_pfn is + * defined and before each memory section base is used. + */ + kernel_randomize_memory(); + +#ifdef CONFIG_X86_32 + /* max_low_pfn get updated here */ + find_low_pfn_range(); +#else + check_x2apic(); + + /* How many end-of-memory variables you have, grandma! */ + /* need this before calling reserve_initrd */ + if (max_pfn > (1UL<<(32 - PAGE_SHIFT))) + max_low_pfn = e820__end_of_low_ram_pfn(); + else + max_low_pfn = max_pfn; + + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; +#endif + + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); + + reserve_ibft_region(); + + early_alloc_pgt_buf(); + + /* + * Need to conclude brk, before e820__memblock_setup() + * it could use memblock_find_in_range, could overlap with + * brk area. + */ + reserve_brk(); + + cleanup_highmap(); + + memblock_set_current_limit(ISA_END_ADDRESS); + e820__memblock_setup(); + + reserve_bios_regions(); + + efi_fake_memmap(); + efi_find_mirror(); + efi_esrt_init(); + efi_mokvar_table_init(); + + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. + */ + efi_reserve_boot_services(); + + /* preallocate 4k for mptable mpc */ + e820__memblock_alloc_reserved_mpc_new(); + +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION + setup_bios_corruption_check(); +#endif + +#ifdef CONFIG_X86_32 + printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", + (max_pfn_mapped< +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); + +#ifdef CONFIG_X86_64 +#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) +#else +#define BOOT_PERCPU_OFFSET 0 +#endif + +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; +EXPORT_PER_CPU_SYMBOL(this_cpu_off); + +unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = { + [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET, +}; +EXPORT_SYMBOL(__per_cpu_offset); + +/* + * On x86_64 symbols referenced from code should be reachable using + * 32bit relocations. Reserve space for static percpu variables in + * modules so that they are always served from the first chunk which + * is located at the percpu segment base. On x86_32, anything can + * address anywhere. No need to reserve space in the first chunk. + */ +#ifdef CONFIG_X86_64 +#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE +#else +#define PERCPU_FIRST_CHUNK_RESERVE 0 +#endif + +#ifdef CONFIG_X86_32 +/** + * pcpu_need_numa - determine percpu allocation needs to consider NUMA + * + * If NUMA is not configured or there is only one NUMA node available, + * there is no reason to consider NUMA. This function determines + * whether percpu allocation should consider NUMA or not. + * + * RETURNS: + * true if NUMA should be considered; otherwise, false. + */ +static bool __init pcpu_need_numa(void) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + pg_data_t *last = NULL; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + return true; + + last = NODE_DATA(node); + } +#endif + return false; +} +#endif + +/** + * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu + * @cpu: cpu to allocate for + * @size: size allocation in bytes + * @align: alignment + * + * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper + * does the right thing for NUMA regardless of the current + * configuration. + * + * RETURNS: + * Pointer to the allocated area on success, NULL on failure. + */ +static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, + unsigned long align) +{ + const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NEED_MULTIPLE_NODES + int node = early_cpu_to_node(cpu); + void *ptr; + + if (!node_online(node) || !NODE_DATA(node)) { + ptr = memblock_alloc_from(size, align, goal); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", + cpu, size, __pa(ptr)); + } else { + ptr = memblock_alloc_try_nid(size, align, goal, + MEMBLOCK_ALLOC_ACCESSIBLE, + node); + + pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n", + cpu, size, node, __pa(ptr)); + } + return ptr; +#else + return memblock_alloc_from(size, align, goal); +#endif +} + +/* + * Helpers for first chunk memory allocation + */ +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) +{ + return pcpu_alloc_bootmem(cpu, size, align); +} + +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + memblock_free(__pa(ptr), size); +} + +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; +#else + return LOCAL_DISTANCE; +#endif +} + +static void __init pcpup_populate_pte(unsigned long addr) +{ + populate_extra_pte(addr); +} + +static inline void setup_percpu_segment(int cpu) +{ +#ifdef CONFIG_X86_32 + struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu), + 0xFFFFF); + + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S); +#endif +} + +void __init setup_per_cpu_areas(void) +{ + unsigned int cpu; + unsigned long delta; + int rc; + + pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%u\n", + NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); + + /* + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. + */ +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + rc = -EINVAL; + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + size_t atom_size; + + /* + * On 64bit, use PMD_SIZE for atom_size so that embedded + * percpu areas are aligned to PMD. This, in the future, + * can also allow using PMD mappings in vmalloc area. Use + * PAGE_SIZE on 32bit as vmalloc space is highly contended + * and large vmalloc area allocs can easily fail. + */ +#ifdef CONFIG_X86_64 + atom_size = PMD_SIZE; +#else + atom_size = PAGE_SIZE; +#endif + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); + if (rc < 0) + pr_warn("%s allocator failed (%d), falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); + } + if (rc < 0) + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); + + /* alrighty, percpu areas up and running */ + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) { + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; + per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); + per_cpu(cpu_number, cpu) = cpu; + setup_percpu_segment(cpu); + setup_stack_canary_segment(cpu); + /* + * Copy data used in early init routines from the + * initial arrays to the per cpu data areas. These + * arrays then become expendable and the *_early_ptr's + * are zeroed indicating that the static arrays are + * gone. + */ +#ifdef CONFIG_X86_LOCAL_APIC + per_cpu(x86_cpu_to_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_apicid, cpu); + per_cpu(x86_bios_cpu_apicid, cpu) = + early_per_cpu_map(x86_bios_cpu_apicid, cpu); + per_cpu(x86_cpu_to_acpiid, cpu) = + early_per_cpu_map(x86_cpu_to_acpiid, cpu); +#endif +#ifdef CONFIG_X86_32 + per_cpu(x86_cpu_to_logical_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); +#endif +#ifdef CONFIG_NUMA + per_cpu(x86_cpu_to_node_map, cpu) = + early_per_cpu_map(x86_cpu_to_node_map, cpu); + /* + * Ensure that the boot cpu numa_node is correct when the boot + * cpu is on a node that doesn't have memory installed. + * Also cpu_up() will call cpu_to_node() for APs when + * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set + * up later with c_init aka intel_init/amd_init. + * So set them all (boot cpu and all APs). + */ + set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); +#endif + /* + * Up to this point, the boot CPU has been using .init.data + * area. Reload any changed state for the boot CPU. + */ + if (!cpu) + switch_to_new_gdt(cpu); + } + + /* indicate the early static arrays will soon be gone */ +#ifdef CONFIG_X86_LOCAL_APIC + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; + early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL; +#endif +#ifdef CONFIG_X86_32 + early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; +#endif +#ifdef CONFIG_NUMA + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; +#endif + + /* Setup node to cpumask map */ + setup_node_to_cpumask_map(); + + /* Setup cpu initialized, callin, callout masks */ + setup_cpu_local_masks(); + + /* + * Sync back kernel address range again. We already did this in + * setup_arch(), but percpu data also needs to be available in + * the smpboot asm and arch_sync_kernel_mappings() doesn't sync to + * swapper_pg_dir on 32-bit. The per-cpu mappings need to be available + * there too. + * + * FIXME: Can the later sync in setup_cpu_entry_areas() replace + * this call? + */ + sync_initial_page_table(); +} diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c new file mode 100644 index 000000000..e9f8a2bc5 --- /dev/null +++ b/arch/x86/kernel/sev-es-shared.c @@ -0,0 +1,567 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel + * + * This file is not compiled stand-alone. It contains code shared + * between the pre-decompression boot code and the running Linux kernel + * and is included directly into both code-bases. + */ + +#ifndef __BOOT_COMPRESSED +#define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) +#endif + +static bool __init sev_es_check_cpu_features(void) +{ + if (!has_cpuflag(X86_FEATURE_RDRAND)) { + error("RDRAND instruction not supported - no trusted source of randomness available\n"); + return false; + } + + return true; +} + +static void sev_es_terminate(unsigned int reason) +{ + u64 val = GHCB_SEV_TERMINATE; + + /* + * Tell the hypervisor what went wrong - only reason-set 0 is + * currently supported. + */ + val |= GHCB_SEV_TERMINATE_REASON(0, reason); + + /* Request Guest Termination from Hypvervisor */ + sev_es_wr_ghcb_msr(val); + VMGEXIT(); + + while (true) + asm volatile("hlt\n" : : : "memory"); +} + +static bool sev_es_negotiate_protocol(void) +{ + u64 val; + + /* Do the GHCB protocol version negotiation */ + sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + + if (GHCB_INFO(val) != GHCB_SEV_INFO) + return false; + + if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR || + GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR) + return false; + + return true; +} + +static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) +{ + ghcb->save.sw_exit_code = 0; + memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); +} + +static bool vc_decoding_needed(unsigned long exit_code) +{ + /* Exceptions don't require to decode the instruction */ + return !(exit_code >= SVM_EXIT_EXCP_BASE && + exit_code <= SVM_EXIT_LAST_EXCP); +} + +static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, + struct pt_regs *regs, + unsigned long exit_code) +{ + enum es_result ret = ES_OK; + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->regs = regs; + + if (vc_decoding_needed(exit_code)) + ret = vc_decode_insn(ctxt); + + return ret; +} + +static void vc_finish_insn(struct es_em_ctxt *ctxt) +{ + ctxt->regs->ip += ctxt->insn.length; +} + +static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) +{ + enum es_result ret; + + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = GHCB_PROTOCOL_MAX; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) { + u64 info = ghcb->save.sw_exit_info_2; + unsigned long v; + + info = ghcb->save.sw_exit_info_2; + v = info & SVM_EVTINJ_VEC_MASK; + + /* Check if exception information from hypervisor is sane. */ + if ((info & SVM_EVTINJ_VALID) && + ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && + ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { + ctxt->fi.vector = v; + if (info & SVM_EVTINJ_VALID_ERR) + ctxt->fi.error_code = info >> 32; + ret = ES_EXCEPTION; + } else { + ret = ES_VMM_ERROR; + } + } else if (ghcb->save.sw_exit_info_1 & 0xffffffff) { + ret = ES_VMM_ERROR; + } else { + ret = ES_OK; + } + + return ret; +} + +/* + * Boot VC Handler - This is the first VC handler during boot, there is no GHCB + * page yet, so it only supports the MSR based communication with the + * hypervisor and only the CPUID exit-code. + */ +void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +{ + unsigned int fn = lower_bits(regs->ax, 32); + unsigned long val; + + /* Only CPUID is supported via MSR protocol */ + if (exit_code != SVM_EXIT_CPUID) + goto fail; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->ax = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->bx = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->cx = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->dx = val >> 32; + + /* + * This is a VC handler and the #VC is only raised when SEV-ES is + * active, which means SEV must be active too. Do sanity checks on the + * CPUID results to make sure the hypervisor does not trick the kernel + * into the no-sev path. This could map sensitive data unencrypted and + * make it accessible to the hypervisor. + * + * In particular, check for: + * - Availability of CPUID leaf 0x8000001f + * - SEV CPUID bit. + * + * The hypervisor might still report the wrong C-bit position, but this + * can't be checked here. + */ + + if (fn == 0x80000000 && (regs->ax < 0x8000001f)) + /* SEV leaf check */ + goto fail; + else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) + /* SEV bit */ + goto fail; + + /* Skip over the CPUID two-byte opcode */ + regs->ip += 2; + + return; + +fail: + sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE); + VMGEXIT(); + + /* Shouldn't get here - if we do halt the machine */ + while (true) + asm volatile("hlt\n"); +} + +static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt, + unsigned long address, + bool write) +{ + if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_USER; + ctxt->fi.cr2 = address; + if (write) + ctxt->fi.error_code |= X86_PF_WRITE; + + return ES_EXCEPTION; + } + + return ES_OK; +} + +static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, + void *src, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, b = backwards ? -1 : 1; + unsigned long address = (unsigned long)src; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, false); + if (ret != ES_OK) + return ret; + + for (i = 0; i < count; i++) { + void *s = src + (i * data_size * b); + char *d = buf + (i * data_size); + + ret = vc_read_mem(ctxt, s, d, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, + void *dst, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, s = backwards ? -1 : 1; + unsigned long address = (unsigned long)dst; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, true); + if (ret != ES_OK) + return ret; + + for (i = 0; i < count; i++) { + void *d = dst + (i * data_size * s); + char *b = buf + (i * data_size); + + ret = vc_write_mem(ctxt, d, b, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +#define IOIO_TYPE_STR BIT(2) +#define IOIO_TYPE_IN 1 +#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) +#define IOIO_TYPE_OUT 0 +#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) + +#define IOIO_REP BIT(3) + +#define IOIO_ADDR_64 BIT(9) +#define IOIO_ADDR_32 BIT(8) +#define IOIO_ADDR_16 BIT(7) + +#define IOIO_DATA_32 BIT(6) +#define IOIO_DATA_16 BIT(5) +#define IOIO_DATA_8 BIT(4) + +#define IOIO_SEG_ES (0 << 10) +#define IOIO_SEG_DS (3 << 10) + +static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) +{ + struct insn *insn = &ctxt->insn; + size_t size; + u64 port; + + *exitinfo = 0; + + switch (insn->opcode.bytes[0]) { + /* INS opcodes */ + case 0x6c: + case 0x6d: + *exitinfo |= IOIO_TYPE_INS; + *exitinfo |= IOIO_SEG_ES; + port = ctxt->regs->dx & 0xffff; + break; + + /* OUTS opcodes */ + case 0x6e: + case 0x6f: + *exitinfo |= IOIO_TYPE_OUTS; + *exitinfo |= IOIO_SEG_DS; + port = ctxt->regs->dx & 0xffff; + break; + + /* IN immediate opcodes */ + case 0xe4: + case 0xe5: + *exitinfo |= IOIO_TYPE_IN; + port = (u8)insn->immediate.value & 0xffff; + break; + + /* OUT immediate opcodes */ + case 0xe6: + case 0xe7: + *exitinfo |= IOIO_TYPE_OUT; + port = (u8)insn->immediate.value & 0xffff; + break; + + /* IN register opcodes */ + case 0xec: + case 0xed: + *exitinfo |= IOIO_TYPE_IN; + port = ctxt->regs->dx & 0xffff; + break; + + /* OUT register opcodes */ + case 0xee: + case 0xef: + *exitinfo |= IOIO_TYPE_OUT; + port = ctxt->regs->dx & 0xffff; + break; + + default: + return ES_DECODE_FAILED; + } + + *exitinfo |= port << 16; + + switch (insn->opcode.bytes[0]) { + case 0x6c: + case 0x6e: + case 0xe4: + case 0xe6: + case 0xec: + case 0xee: + /* Single byte opcodes */ + *exitinfo |= IOIO_DATA_8; + size = 1; + break; + default: + /* Length determined by instruction parsing */ + *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 + : IOIO_DATA_32; + size = (insn->opnd_bytes == 2) ? 2 : 4; + } + + switch (insn->addr_bytes) { + case 2: + *exitinfo |= IOIO_ADDR_16; + break; + case 4: + *exitinfo |= IOIO_ADDR_32; + break; + case 8: + *exitinfo |= IOIO_ADDR_64; + break; + } + + if (insn_has_rep_prefix(insn)) + *exitinfo |= IOIO_REP; + + return vc_ioio_check(ctxt, (u16)port, size); +} + +static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u64 exit_info_1, exit_info_2; + enum es_result ret; + + ret = vc_ioio_exitinfo(ctxt, &exit_info_1); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_STR) { + + /* (REP) INS/OUTS */ + + bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); + unsigned int io_bytes, exit_bytes; + unsigned int ghcb_count, op_count; + unsigned long es_base; + u64 sw_scratch; + + /* + * For the string variants with rep prefix the amount of in/out + * operations per #VC exception is limited so that the kernel + * has a chance to take interrupts and re-schedule while the + * instruction is emulated. + */ + io_bytes = (exit_info_1 >> 4) & 0x7; + ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; + + op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1; + exit_info_2 = min(op_count, ghcb_count); + exit_bytes = exit_info_2 * io_bytes; + + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + /* Read bytes of OUTS into the shared buffer */ + if (!(exit_info_1 & IOIO_TYPE_IN)) { + ret = vc_insn_string_read(ctxt, + (void *)(es_base + regs->si), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + } + + /* + * Issue an VMGEXIT to the HV to consume the bytes from the + * shared buffer or to have it write them into the shared buffer + * depending on the instruction: OUTS or INS. + */ + sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); + ghcb_set_sw_scratch(ghcb, sw_scratch); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, + exit_info_1, exit_info_2); + if (ret != ES_OK) + return ret; + + /* Read bytes from shared buffer into the guest's destination. */ + if (exit_info_1 & IOIO_TYPE_IN) { + ret = vc_insn_string_write(ctxt, + (void *)(es_base + regs->di), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + + if (df) + regs->di -= exit_bytes; + else + regs->di += exit_bytes; + } else { + if (df) + regs->si -= exit_bytes; + else + regs->si += exit_bytes; + } + + if (exit_info_1 & IOIO_REP) + regs->cx -= exit_info_2; + + ret = regs->cx ? ES_RETRY : ES_OK; + + } else { + + /* IN/OUT into/from rAX */ + + int bits = (exit_info_1 & 0x70) >> 1; + u64 rax = 0; + + if (!(exit_info_1 & IOIO_TYPE_IN)) + rax = lower_bits(regs->ax, bits); + + ghcb_set_rax(ghcb, rax); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_IN) { + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + regs->ax = lower_bits(ghcb->save.rax, bits); + } + } + + return ret; +} + +static enum es_result vc_handle_cpuid(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u32 cr4 = native_read_cr4(); + enum es_result ret; + + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rcx(ghcb, regs->cx); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #GP - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + regs->ax = ghcb->save.rax; + regs->bx = ghcb->save.rbx; + regs->cx = ghcb->save.rcx; + regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); + enum es_result ret; + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && + (!rdtscp || ghcb_rcx_is_valid(ghcb)))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + if (rdtscp) + ctxt->regs->cx = ghcb->save.rcx; + + return ES_OK; +} diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c new file mode 100644 index 000000000..b6f9fe0d6 --- /dev/null +++ b/arch/x86/kernel/sev-es.c @@ -0,0 +1,1507 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel + */ + +#define pr_fmt(fmt) "SEV-ES: " fmt + +#include /* For show_regs() */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DR7_RESET_VALUE 0x400 + +/* For early boot hypervisor communication in SEV-ES enabled guests */ +static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +static struct ghcb __initdata *boot_ghcb; + +/* #VC handler runtime per-CPU data */ +struct sev_es_runtime_data { + struct ghcb ghcb_page; + + /* + * Reserve one page per CPU as backup storage for the unencrypted GHCB. + * It is needed when an NMI happens while the #VC handler uses the real + * GHCB, and the NMI handler itself is causing another #VC exception. In + * that case the GHCB content of the first handler needs to be backed up + * and restored. + */ + struct ghcb backup_ghcb; + + /* + * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. + * There is no need for it to be atomic, because nothing is written to + * the GHCB between the read and the write of ghcb_active. So it is safe + * to use it when a nested #VC exception happens before the write. + * + * This is necessary for example in the #VC->NMI->#VC case when the NMI + * happens while the first #VC handler uses the GHCB. When the NMI code + * raises a second #VC handler it might overwrite the contents of the + * GHCB written by the first handler. To avoid this the content of the + * GHCB is saved and restored when the GHCB is detected to be in use + * already. + */ + bool ghcb_active; + bool backup_ghcb_active; + + /* + * Cached DR7 value - write it on DR7 writes and return it on reads. + * That value will never make it to the real hardware DR7 as debugging + * is currently unsupported in SEV-ES guests. + */ + unsigned long dr7; +}; + +struct ghcb_state { + struct ghcb *ghcb; +}; + +static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); +DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); + +/* Needed in vc_early_forward_exception */ +void do_early_exception(struct pt_regs *regs, int trapnr); + +static __always_inline bool on_vc_stack(struct pt_regs *regs) +{ + unsigned long sp = regs->sp; + + /* User-mode RSP is not trusted */ + if (user_mode(regs)) + return false; + + /* SYSCALL gap still has user-mode RSP */ + if (ip_within_syscall_gap(regs)) + return false; + + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); +} + +/* + * This function handles the case when an NMI is raised in the #VC exception + * handler entry code. In this case, the IST entry for #VC must be adjusted, so + * that any subsequent #VC exception will not overwrite the stack contents of the + * interrupted #VC handler. + * + * The IST entry is adjusted unconditionally so that it can be also be + * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested + * sev_es_ist_exit() call may adjust back the IST entry too early. + */ +void noinstr __sev_es_ist_enter(struct pt_regs *regs) +{ + unsigned long old_ist, new_ist; + + /* Read old IST entry */ + old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + /* Make room on the IST stack */ + if (on_vc_stack(regs)) + new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); + else + new_ist = old_ist - sizeof(old_ist); + + /* Store old IST entry */ + *(unsigned long *)new_ist = old_ist; + + /* Set new IST entry */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); +} + +void noinstr __sev_es_ist_exit(void) +{ + unsigned long ist; + + /* Read IST entry */ + ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) + return; + + /* Read back old IST entry and write it to the TSS */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); +} + +/* + * Nothing shall interrupt this code path while holding the per-CPU + * GHCB. The backup GHCB is only for NMIs interrupting this path. + * + * Callers must disable local interrupts around it. + */ +static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) { + /* + * Backup-GHCB is also already in use. There is no way + * to continue here so just kill the machine. To make + * panic() work, mark GHCBs inactive so that messages + * can be printed out. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + instrumentation_begin(); + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + instrumentation_end(); + } + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +/* Needed in vc_early_forward_exception */ +void do_early_exception(struct pt_regs *regs, int trapnr); + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + return __rdmsr(MSR_AMD64_SEV_ES_GHCB); +} + +static __always_inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = (u32)(val); + high = (u32)(val >> 32); + + native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); +} + +static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, + unsigned char *buffer) +{ + return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + enum es_result ret; + int res; + + if (user_mode(ctxt->regs)) { + res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); + if (!res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, res)) + return ES_DECODE_FAILED; + } else { + res = vc_fetch_insn_kernel(ctxt, buffer); + if (res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1); + insn_get_length(&ctxt->insn); + } + + ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED; + + return ret; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + char *dst, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; + + /* + * This function uses __put_user() independent of whether kernel or user + * memory is accessed. This works fine because __put_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __put_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_to_user() here because + * vc_write_mem() must not use string instructions to access unsafe + * memory. The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. Using string + * instructions here would cause infinite nesting. + */ + switch (size) { + case 1: { + u8 d1; + u8 __user *target = (u8 __user *)dst; + + memcpy(&d1, buf, 1); + if (__put_user(d1, target)) + goto fault; + break; + } + case 2: { + u16 d2; + u16 __user *target = (u16 __user *)dst; + + memcpy(&d2, buf, 2); + if (__put_user(d2, target)) + goto fault; + break; + } + case 4: { + u32 d4; + u32 __user *target = (u32 __user *)dst; + + memcpy(&d4, buf, 4); + if (__put_user(d4, target)) + goto fault; + break; + } + case 8: { + u64 d8; + u64 __user *target = (u64 __user *)dst; + + memcpy(&d8, buf, 8); + if (__put_user(d8, target)) + goto fault; + break; + } + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)dst; + + return ES_EXCEPTION; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + char *src, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT; + + /* + * This function uses __get_user() independent of whether kernel or user + * memory is accessed. This works fine because __get_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __get_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_from_user() here because + * vc_read_mem() must not use string instructions to access unsafe + * memory. The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. Using string + * instructions here would cause infinite nesting. + */ + switch (size) { + case 1: { + u8 d1; + u8 __user *s = (u8 __user *)src; + + if (__get_user(d1, s)) + goto fault; + memcpy(buf, &d1, 1); + break; + } + case 2: { + u16 d2; + u16 __user *s = (u16 __user *)src; + + if (__get_user(d2, s)) + goto fault; + memcpy(buf, &d2, 2); + break; + } + case 4: { + u32 d4; + u32 __user *s = (u32 __user *)src; + + if (__get_user(d4, s)) + goto fault; + memcpy(buf, &d4, 4); + break; + } + case 8: { + u64 d8; + u64 __user *s = (u64 __user *)src; + if (__get_user(d8, s)) + goto fault; + memcpy(buf, &d8, 8); + break; + } + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)src; + + return ES_EXCEPTION; +} + +static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned long vaddr, phys_addr_t *paddr) +{ + unsigned long va = (unsigned long)vaddr; + unsigned int level; + phys_addr_t pa; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd = &pgd[pgd_index(va)]; + pte = lookup_address_in_pgd(pgd, va, &level); + if (!pte) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.cr2 = vaddr; + ctxt->fi.error_code = 0; + + if (user_mode(ctxt->regs)) + ctxt->fi.error_code |= X86_PF_USER; + + return ES_EXCEPTION; + } + + if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) + /* Emulated MMIO to/from encrypted memory not supported */ + return ES_UNSUPPORTED; + + pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; + pa |= va & ~page_level_mask(level); + + *paddr = pa; + + return ES_OK; +} + +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + BUG_ON(size > 4); + + if (user_mode(ctxt->regs)) { + struct thread_struct *t = ¤t->thread; + struct io_bitmap *iobm = t->io_bitmap; + size_t idx; + + if (!iobm) + goto fault; + + for (idx = port; idx < port + size; ++idx) { + if (test_bit(idx, iobm->bitmap)) + goto fault; + } + } + + return ES_OK; + +fault: + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + + return ES_EXCEPTION; +} + +/* Include code shared with pre-decompression boot stage */ +#include "sev-es-shared.c" + +static noinstr void __sev_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + /* + * Invalidate the GHCB so a VMGEXIT instruction issued + * from userspace won't appear to be valid. + */ + vc_ghcb_invalidate(ghcb); + data->ghcb_active = false; + } +} + +void noinstr __sev_es_nmi_complete(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); + VMGEXIT(); + + __sev_put_ghcb(&state); +} + +static u64 get_jump_table_addr(void) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u64 ret = 0; + + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (ghcb_sw_exit_info_1_is_valid(ghcb) && + ghcb_sw_exit_info_2_is_valid(ghcb)) + ret = ghcb->save.sw_exit_info_2; + + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + return ret; +} + +int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) +{ + u16 startup_cs, startup_ip; + phys_addr_t jump_table_pa; + u64 jump_table_addr; + u16 __iomem *jump_table; + + jump_table_addr = get_jump_table_addr(); + + /* On UP guests there is no jump table so this is not a failure */ + if (!jump_table_addr) + return 0; + + /* Check if AP Jump Table is page-aligned */ + if (jump_table_addr & ~PAGE_MASK) + return -EINVAL; + + jump_table_pa = jump_table_addr & PAGE_MASK; + + startup_cs = (u16)(rmh->trampoline_start >> 4); + startup_ip = (u16)(rmh->sev_es_trampoline_start - + rmh->trampoline_start); + + jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); + if (!jump_table) + return -EIO; + + writew(startup_ip, &jump_table[0]); + writew(startup_cs, &jump_table[1]); + + iounmap(jump_table); + + return 0; +} + +/* + * This is needed by the OVMF UEFI firmware which will use whatever it finds in + * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu + * runtime GHCBs used by the kernel are also mapped in the EFI page-table. + */ +int __init sev_es_efi_map_ghcbs(pgd_t *pgd) +{ + struct sev_es_runtime_data *data; + unsigned long address, pflags; + int cpu; + u64 pfn; + + if (!sev_es_active()) + return 0; + + pflags = _PAGE_NX | _PAGE_RW; + + for_each_possible_cpu(cpu) { + data = per_cpu(runtime_data, cpu); + + address = __pa(&data->ghcb_page); + pfn = address >> PAGE_SHIFT; + + if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) + return 1; + } + + return 0; +} + +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + enum es_result ret; + u64 exit_info_1; + + /* Is it a WRMSR? */ + exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0; + + ghcb_set_rcx(ghcb, regs->cx); + if (exit_info_1) { + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rdx(ghcb, regs->dx); + } + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); + + if ((ret == ES_OK) && (!exit_info_1)) { + regs->ax = ghcb->save.rax; + regs->dx = ghcb->save.rdx; + } + + return ret; +} + +/* + * This function runs on the first #VC exception after the kernel + * switched to virtual addresses. + */ +static bool __init sev_es_setup_ghcb(void) +{ + /* First make sure the hypervisor talks a supported protocol. */ + if (!sev_es_negotiate_protocol()) + return false; + + /* + * Clear the boot_ghcb. The first exception comes in before the bss + * section is cleared. + */ + memset(&boot_ghcb_page, 0, PAGE_SIZE); + + /* Alright - Make the boot-ghcb public */ + boot_ghcb = &boot_ghcb_page; + + return true; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void sev_es_ap_hlt_loop(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = __sev_get_ghcb(&state); + + while (true) { + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + /* Wakeup signal? */ + if (ghcb_sw_exit_info_2_is_valid(ghcb) && + ghcb->save.sw_exit_info_2) + break; + } + + __sev_put_ghcb(&state); +} + +/* + * Play_dead handler when running under SEV-ES. This is needed because + * the hypervisor can't deliver an SIPI request to restart the AP. + * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the + * hypervisor wakes it up again. + */ +static void sev_es_play_dead(void) +{ + play_dead_common(); + + /* IRQs now disabled */ + + sev_es_ap_hlt_loop(); + + /* + * If we get here, the VCPU was woken up again. Jump to CPU + * startup code to get it back online. + */ + start_cpu0(); +} +#else /* CONFIG_HOTPLUG_CPU */ +#define sev_es_play_dead native_play_dead +#endif /* CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_SMP +static void __init sev_es_setup_play_dead(void) +{ + smp_ops.play_dead = sev_es_play_dead; +} +#else +static inline void sev_es_setup_play_dead(void) { } +#endif + +static void __init alloc_runtime_data(int cpu) +{ + struct sev_es_runtime_data *data; + + data = memblock_alloc(sizeof(*data), PAGE_SIZE); + if (!data) + panic("Can't allocate SEV-ES runtime data"); + + per_cpu(runtime_data, cpu) = data; +} + +static void __init init_ghcb(int cpu) +{ + struct sev_es_runtime_data *data; + int err; + + data = per_cpu(runtime_data, cpu); + + err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, + sizeof(data->ghcb_page)); + if (err) + panic("Can't map GHCBs unencrypted"); + + memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); + + data->ghcb_active = false; + data->backup_ghcb_active = false; +} + +void __init sev_es_init_vc_handling(void) +{ + int cpu; + + BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); + + if (!sev_es_active()) + return; + + if (!sev_es_check_cpu_features()) + panic("SEV-ES CPU Features missing"); + + /* Enable SEV-ES special handling */ + static_branch_enable(&sev_es_enable_key); + + /* Initialize per-cpu GHCB pages */ + for_each_possible_cpu(cpu) { + alloc_runtime_data(cpu); + init_ghcb(cpu); + } + + sev_es_setup_play_dead(); + + /* Secondary CPUs use the runtime #VC handler */ + initial_vc_handler = (unsigned long)kernel_exc_vmm_communication; +} + +static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) +{ + int trapnr = ctxt->fi.vector; + + if (trapnr == X86_TRAP_PF) + native_write_cr2(ctxt->fi.cr2); + + ctxt->regs->orig_ax = ctxt->fi.error_code; + do_early_exception(ctxt->regs, trapnr); +} + +static long *vc_insn_get_reg(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long *)ctxt->regs; + offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} + +static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long *)ctxt->regs; + offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} +static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned int bytes, bool read) +{ + u64 exit_code, exit_info_1, exit_info_2; + unsigned long ghcb_pa = __pa(ghcb); + enum es_result res; + phys_addr_t paddr; + void __user *ref; + + ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); + if (ref == (void __user *)-1L) + return ES_UNSUPPORTED; + + exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; + + res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); + if (res != ES_OK) { + if (res == ES_EXCEPTION && !read) + ctxt->fi.error_code |= X86_PF_WRITE; + + return res; + } + + exit_info_1 = paddr; + /* Can never be greater than 8 */ + exit_info_2 = bytes; + + ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); + + return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); +} + +static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + unsigned int bytes = 0; + enum es_result ret; + int sign_byte; + long *reg_data; + + switch (insn->opcode.bytes[1]) { + /* MMIO Read w/ zero-extension */ + case 0xb6: + bytes = 1; + fallthrough; + case 0xb7: + if (!bytes) + bytes = 2; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Zero extend based on operand size */ + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + memset(reg_data, 0, insn->opnd_bytes); + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + /* MMIO Read w/ sign-extension */ + case 0xbe: + bytes = 1; + fallthrough; + case 0xbf: + if (!bytes) + bytes = 2; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Sign extend based on operand size */ + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + if (bytes == 1) { + u8 *val = (u8 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x80) ? 0xff : 0x00; + } else { + u16 *val = (u16 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x8000) ? 0xff : 0x00; + } + memset(reg_data, sign_byte, insn->opnd_bytes); + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + default: + ret = ES_UNSUPPORTED; + } + + return ret; +} + +/* + * The MOVS instruction has two memory operands, which raises the + * problem that it is not known whether the access to the source or the + * destination caused the #VC exception (and hence whether an MMIO read + * or write operation needs to be emulated). + * + * Instead of playing games with walking page-tables and trying to guess + * whether the source or destination is an MMIO range, split the move + * into two operations, a read and a write with only one memory operand. + * This will cause a nested #VC exception on the MMIO address which can + * then be handled. + * + * This implementation has the benefit that it also supports MOVS where + * source _and_ destination are MMIO regions. + * + * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a + * rare operation. If it turns out to be a performance problem the split + * operations can be moved to memcpy_fromio() and memcpy_toio(). + */ +static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, + unsigned int bytes) +{ + unsigned long ds_base, es_base; + unsigned char *src, *dst; + unsigned char buffer[8]; + enum es_result ret; + bool rep; + int off; + + ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + if (ds_base == -1L || es_base == -1L) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + src = ds_base + (unsigned char *)ctxt->regs->si; + dst = es_base + (unsigned char *)ctxt->regs->di; + + ret = vc_read_mem(ctxt, src, buffer, bytes); + if (ret != ES_OK) + return ret; + + ret = vc_write_mem(ctxt, dst, buffer, bytes); + if (ret != ES_OK) + return ret; + + if (ctxt->regs->flags & X86_EFLAGS_DF) + off = -bytes; + else + off = bytes; + + ctxt->regs->si += off; + ctxt->regs->di += off; + + rep = insn_has_rep_prefix(&ctxt->insn); + if (rep) + ctxt->regs->cx -= 1; + + if (!rep || ctxt->regs->cx == 0) + return ES_OK; + else + return ES_RETRY; +} + +static enum es_result vc_handle_mmio(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + unsigned int bytes = 0; + enum es_result ret; + long *reg_data; + + if (user_mode(ctxt->regs)) + return ES_UNSUPPORTED; + + switch (insn->opcode.bytes[0]) { + /* MMIO Write */ + case 0x88: + bytes = 1; + fallthrough; + case 0x89: + if (!bytes) + bytes = insn->opnd_bytes; + + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + memcpy(ghcb->shared_buffer, reg_data, bytes); + + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + + case 0xc6: + bytes = 1; + fallthrough; + case 0xc7: + if (!bytes) + bytes = insn->opnd_bytes; + + memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); + + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + + /* MMIO Read */ + case 0x8a: + bytes = 1; + fallthrough; + case 0x8b: + if (!bytes) + bytes = insn->opnd_bytes; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + /* Zero-extend for 32-bit operation */ + if (bytes == 4) + *reg_data = 0; + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + /* MOVS instruction */ + case 0xa4: + bytes = 1; + fallthrough; + case 0xa5: + if (!bytes) + bytes = insn->opnd_bytes; + + ret = vc_handle_mmio_movs(ctxt, bytes); + break; + /* Two-Byte Opcodes */ + case 0x0f: + ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt); + break; + default: + ret = ES_UNSUPPORTED; + } + + return ret; +} + +static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long val, *reg = vc_insn_get_rm(ctxt); + enum es_result ret; + + if (!reg) + return ES_DECODE_FAILED; + + val = *reg; + + /* Upper 32 bits must be written as zeroes */ + if (val >> 32) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + /* Clear out other reserved bits and set bit 10 */ + val = (val & 0xffff23ffL) | BIT(10); + + /* Early non-zero writes to DR7 are not supported */ + if (!data && (val & ~DR7_RESET_VALUE)) + return ES_UNSUPPORTED; + + /* Using a value of 0 for ExitInfo1 means RAX holds the value */ + ghcb_set_rax(ghcb, val); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + if (ret != ES_OK) + return ret; + + if (data) + data->dr7 = val; + + return ES_OK; +} + +static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long *reg = vc_insn_get_rm(ctxt); + + if (!reg) + return ES_DECODE_FAILED; + + if (data) + *reg = data->dr7; + else + *reg = DR7_RESET_VALUE; + + return ES_OK; +} + +static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); +} + +static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rcx(ghcb, ctxt->regs->cx); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_monitor(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Treat it as a NOP and do not leak a physical address to the + * hypervisor. + */ + return ES_OK; +} + +static enum es_result vc_handle_mwait(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* Treat the same as MONITOR/MONITORX */ + return ES_OK; +} + +static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rax(ghcb, ctxt->regs->ax); + ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); + + if (x86_platform.hyper.sev_es_hcall_prepare) + x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); + if (ret != ES_OK) + return ret; + + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + + /* + * Call sev_es_hcall_finish() after regs->ax is already set. + * This allows the hypervisor handler to overwrite it again if + * necessary. + */ + if (x86_platform.hyper.sev_es_hcall_finish && + !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) + return ES_VMM_ERROR; + + return ES_OK; +} + +static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Calling ecx_alignment_check() directly does not work, because it + * enables IRQs and the GHCB is active. Forward the exception and call + * it later from vc_forward_exception(). + */ + ctxt->fi.vector = X86_TRAP_AC; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; +} + +static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, + struct ghcb *ghcb, + unsigned long exit_code) +{ + enum es_result result; + + switch (exit_code) { + case SVM_EXIT_READ_DR7: + result = vc_handle_dr7_read(ghcb, ctxt); + break; + case SVM_EXIT_WRITE_DR7: + result = vc_handle_dr7_write(ghcb, ctxt); + break; + case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: + result = vc_handle_trap_ac(ghcb, ctxt); + break; + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(ghcb, ctxt, exit_code); + break; + case SVM_EXIT_RDPMC: + result = vc_handle_rdpmc(ghcb, ctxt); + break; + case SVM_EXIT_INVD: + pr_err_ratelimited("#VC exception for INVD??? Seriously???\n"); + result = ES_UNSUPPORTED; + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(ghcb, ctxt); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(ghcb, ctxt); + break; + case SVM_EXIT_MSR: + result = vc_handle_msr(ghcb, ctxt); + break; + case SVM_EXIT_VMMCALL: + result = vc_handle_vmmcall(ghcb, ctxt); + break; + case SVM_EXIT_WBINVD: + result = vc_handle_wbinvd(ghcb, ctxt); + break; + case SVM_EXIT_MONITOR: + result = vc_handle_monitor(ghcb, ctxt); + break; + case SVM_EXIT_MWAIT: + result = vc_handle_mwait(ghcb, ctxt); + break; + case SVM_EXIT_NPF: + result = vc_handle_mmio(ghcb, ctxt); + break; + default: + /* + * Unexpected #VC exception + */ + result = ES_UNSUPPORTED; + } + + return result; +} + +static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) +{ + long error_code = ctxt->fi.error_code; + int trapnr = ctxt->fi.vector; + + ctxt->regs->orig_ax = ctxt->fi.error_code; + + switch (trapnr) { + case X86_TRAP_GP: + exc_general_protection(ctxt->regs, error_code); + break; + case X86_TRAP_UD: + exc_invalid_op(ctxt->regs); + break; + case X86_TRAP_PF: + write_cr2(ctxt->fi.cr2); + exc_page_fault(ctxt->regs, error_code); + break; + case X86_TRAP_AC: + exc_alignment_check(ctxt->regs, error_code); + break; + default: + pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); + BUG(); + } +} + +static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) +{ + unsigned long sp = (unsigned long)regs; + + return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); +} + +static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result result; + struct ghcb *ghcb; + bool ret = true; + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + result = vc_init_em_ctxt(&ctxt, regs, error_code); + + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, ghcb, error_code); + + __sev_put_ghcb(&state); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_VMM_ERROR: + pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_DECODE_FAILED: + pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + pr_emerg("Unknown result in %s():%d\n", __func__, result); + /* + * Emulating the instruction which caused the #VC exception + * failed - can't continue so print debug information + */ + BUG(); + } + + return ret; +} + +static __always_inline bool vc_is_db(unsigned long error_code) +{ + return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; +} + +/* + * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode + * and will panic when an error happens. + */ +DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) +{ + irqentry_state_t irq_state; + + /* + * With the current implementation it is always possible to switch to a + * safe stack because #VC exceptions only happen at known places, like + * intercepted instructions or accesses to MMIO areas/IO ports. They can + * also happen with code instrumentation when the hypervisor intercepts + * #DB, but the critical paths are forbidden to be instrumented, so #DB + * exceptions currently also only happen in safe places. + * + * But keep this here in case the noinstr annotations are violated due + * to bug elsewhere. + */ + if (unlikely(on_vc_fallback_stack(regs))) { + instrumentation_begin(); + panic("Can't handle #VC exception from unsupported context\n"); + instrumentation_end(); + } + + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (vc_is_db(error_code)) { + exc_debug(regs); + return; + } + + irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + + if (!vc_raw_handle_exception(regs, error_code)) { + /* Show some debug info */ + show_regs(regs); + + /* Ask hypervisor to sev_es_terminate */ + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + /* If that fails and we get here - just panic */ + panic("Returned from Terminate-Request to Hypervisor\n"); + } + + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); +} + +/* + * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode + * and will kill the current task with SIGBUS when an error happens. + */ +DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) +{ + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (vc_is_db(error_code)) { + noist_exc_debug(regs); + return; + } + + irqentry_enter_from_user_mode(regs); + instrumentation_begin(); + + if (!vc_raw_handle_exception(regs, error_code)) { + /* + * Do not kill the machine if user-space triggered the + * exception. Send SIGBUS instead and let user-space deal with + * it. + */ + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); + } + + instrumentation_end(); + irqentry_exit_to_user_mode(regs); +} + +bool __init handle_vc_boot_ghcb(struct pt_regs *regs) +{ + unsigned long exit_code = regs->orig_ax; + struct es_em_ctxt ctxt; + enum es_result result; + + /* Do initial setup or terminate the guest */ + if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + vc_ghcb_invalidate(boot_ghcb); + + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_VMM_ERROR: + early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_DECODE_FAILED: + early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_EXCEPTION: + vc_early_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + BUG(); + } + + return true; + +fail: + show_regs(regs); + + while (true) + halt(); +} diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S new file mode 100644 index 000000000..3355e27c6 --- /dev/null +++ b/arch/x86/kernel/sev_verify_cbit.S @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sev_verify_cbit.S - Code for verification of the C-bit position reported + * by the Hypervisor when running with SEV enabled. + * + * Copyright (c) 2020 Joerg Roedel (jroedel@suse.de) + * + * sev_verify_cbit() is called before switching to a new long-mode page-table + * at boot. + * + * Verify that the C-bit position is correct by writing a random value to + * an encrypted memory location while on the current page-table. Then it + * switches to the new page-table to verify the memory content is still the + * same. After that it switches back to the current page-table and when the + * check succeeded it returns. If the check failed the code invalidates the + * stack pointer and goes into a hlt loop. The stack-pointer is invalidated to + * make sure no interrupt or exception can get the CPU out of the hlt loop. + * + * New page-table pointer is expected in %rdi (first parameter) + * + */ +SYM_FUNC_START(sev_verify_cbit) +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* First check if a C-bit was detected */ + movq sme_me_mask(%rip), %rsi + testq %rsi, %rsi + jz 3f + + /* sme_me_mask != 0 could mean SME or SEV - Check also for SEV */ + movq sev_status(%rip), %rsi + testq %rsi, %rsi + jz 3f + + /* Save CR4 in %rsi */ + movq %cr4, %rsi + + /* Disable Global Pages */ + movq %rsi, %rdx + andq $(~X86_CR4_PGE), %rdx + movq %rdx, %cr4 + + /* + * Verified that running under SEV - now get a random value using + * RDRAND. This instruction is mandatory when running as an SEV guest. + * + * Don't bail out of the loop if RDRAND returns errors. It is better to + * prevent forward progress than to work with a non-random value here. + */ +1: rdrand %rdx + jnc 1b + + /* Store value to memory and keep it in %rdx */ + movq %rdx, sev_check_data(%rip) + + /* Backup current %cr3 value to restore it later */ + movq %cr3, %rcx + + /* Switch to new %cr3 - This might unmap the stack */ + movq %rdi, %cr3 + + /* + * Compare value in %rdx with memory location. If C-bit is incorrect + * this would read the encrypted data and make the check fail. + */ + cmpq %rdx, sev_check_data(%rip) + + /* Restore old %cr3 */ + movq %rcx, %cr3 + + /* Restore previous CR4 */ + movq %rsi, %cr4 + + /* Check CMPQ result */ + je 3f + + /* + * The check failed, prevent any forward progress to prevent ROP + * attacks, invalidate the stack and go into a hlt loop. + */ + xorq %rsp, %rsp + subq $0x1000, %rsp +2: hlt + jmp 2b +3: +#endif + /* Return page-table pointer */ + movq %rdi, %rax + RET +SYM_FUNC_END(sev_verify_cbit) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c new file mode 100644 index 000000000..9eff48171 --- /dev/null +++ b/arch/x86/kernel/signal.c @@ -0,0 +1,884 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + * 2000-2002 x86-64 support by Andi Kleen + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +#include +#include +#include +#endif /* CONFIG_X86_64 */ + +#include +#include +#include + +#ifdef CONFIG_X86_64 +/* + * If regs->ss will cause an IRET fault, change it. Otherwise leave it + * alone. Using this generally makes no sense unless + * user_64bit_mode(regs) would return true. + */ +static void force_valid_ss(struct pt_regs *regs) +{ + u32 ar; + asm volatile ("lar %[old_ss], %[ar]\n\t" + "jz 1f\n\t" /* If invalid: */ + "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ + "1:" + : [ar] "=r" (ar) + : [old_ss] "rm" ((u16)regs->ss)); + + /* + * For a valid 64-bit user context, we need DPL 3, type + * read-write data or read-write exp-down data, and S and P + * set. We can't use VERW because VERW doesn't check the + * P bit. + */ + ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; + if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && + ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) + regs->ss = __USER_DS; +} +# define CONTEXT_COPY_SIZE offsetof(struct sigcontext, reserved1) +#else +# define CONTEXT_COPY_SIZE sizeof(struct sigcontext) +#endif + +static int restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *usc, + unsigned long uc_flags) +{ + struct sigcontext sc; + + /* Always make any pending restarted system calls return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) + return -EFAULT; + +#ifdef CONFIG_X86_32 + set_user_gs(regs, sc.gs); + regs->fs = sc.fs; + regs->es = sc.es; + regs->ds = sc.ds; +#endif /* CONFIG_X86_32 */ + + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; + +#ifdef CONFIG_X86_64 + regs->r8 = sc.r8; + regs->r9 = sc.r9; + regs->r10 = sc.r10; + regs->r11 = sc.r11; + regs->r12 = sc.r12; + regs->r13 = sc.r13; + regs->r14 = sc.r14; + regs->r15 = sc.r15; +#endif /* CONFIG_X86_64 */ + + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; + + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; + +#ifdef CONFIG_X86_64 + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. + */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) + force_valid_ss(regs); +#endif + + return fpu__restore_sig((void __user *)sc.fpstate, + IS_ENABLED(CONFIG_X86_32)); +} + +static __always_inline int +__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, unsigned long mask) +{ +#ifdef CONFIG_X86_32 + unsafe_put_user(get_user_gs(regs), + (unsigned int __user *)&sc->gs, Efault); + unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); + unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); +#endif /* CONFIG_X86_32 */ + + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); +#ifdef CONFIG_X86_64 + unsafe_put_user(regs->r8, &sc->r8, Efault); + unsafe_put_user(regs->r9, &sc->r9, Efault); + unsafe_put_user(regs->r10, &sc->r10, Efault); + unsafe_put_user(regs->r11, &sc->r11, Efault); + unsafe_put_user(regs->r12, &sc->r12, Efault); + unsafe_put_user(regs->r13, &sc->r13, Efault); + unsafe_put_user(regs->r14, &sc->r14, Efault); + unsafe_put_user(regs->r15, &sc->r15, Efault); +#endif /* CONFIG_X86_64 */ + + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); +#ifdef CONFIG_X86_32 + unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); + unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); +#else /* !CONFIG_X86_32 */ + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->cs, &sc->cs, Efault); + unsafe_put_user(0, &sc->gs, Efault); + unsafe_put_user(0, &sc->fs, Efault); + unsafe_put_user(regs->ss, &sc->ss, Efault); +#endif /* CONFIG_X86_32 */ + + unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); + + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + return 0; +Efault: + return -EFAULT; +} + +#define unsafe_put_sigcontext(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0); + +#define unsafe_put_sigmask(set, frame, label) \ + unsafe_put_user(*(__u64 *)(set), \ + (__u64 __user *)&(frame)->uc.uc_sigmask, \ + label) + +/* + * Set up a signal frame. + */ + +/* + * Determine which stack to use.. + */ +static unsigned long align_sigframe(unsigned long sp) +{ +#ifdef CONFIG_X86_32 + /* + * Align the stack pointer according to the i386 ABI, + * i.e. so that on function entry ((sp + 4) & 15) == 0. + */ + sp = ((sp + 4) & -16ul) - 4; +#else /* !CONFIG_X86_32 */ + sp = round_down(sp, 16) - 8; +#endif + return sp; +} + +static void __user * +get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, + void __user **fpstate) +{ + /* Default to using normal stack */ + bool nested_altstack = on_sig_stack(regs->sp); + bool entering_altstack = false; + unsigned long math_size = 0; + unsigned long sp = regs->sp; + unsigned long buf_fx = 0; + int ret; + + /* redzone */ + if (IS_ENABLED(CONFIG_X86_64)) + sp -= 128; + + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + /* + * This checks nested_altstack via sas_ss_flags(). Sensible + * programs use SS_AUTODISARM, which disables that check, and + * programs that don't use SS_AUTODISARM get compatible. + */ + if (sas_ss_flags(sp) == 0) { + sp = current->sas_ss_sp + current->sas_ss_size; + entering_altstack = true; + } + } else if (IS_ENABLED(CONFIG_X86_32) && + !nested_altstack && + regs->ss != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) { + /* This is the legacy signal stack switching. */ + sp = (unsigned long) ka->sa.sa_restorer; + entering_altstack = true; + } + + sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), + &buf_fx, &math_size); + *fpstate = (void __user *)sp; + + sp = align_sigframe(sp - frame_size); + + /* + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (unlikely((nested_altstack || entering_altstack) && + !__on_sig_stack(sp))) { + + if (show_unhandled_signals && printk_ratelimit()) + pr_info("%s[%d] overflowed sigaltstack\n", + current->comm, task_pid_nr(current)); + + return (void __user *)-1L; + } + + /* save i387 and extended state */ + ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); + if (ret < 0) + return (void __user *)-1L; + + return (void __user *)sp; +} + +#ifdef CONFIG_X86_32 +static const struct { + u16 poplmovl; + u32 val; + u16 int80; +} __attribute__((packed)) retcode = { + 0xb858, /* popl %eax; movl $..., %eax */ + __NR_sigreturn, + 0x80cd, /* int $0x80 */ +}; + +static const struct { + u8 movl; + u32 val; + u16 int80; + u8 pad; +} __attribute__((packed)) rt_retcode = { + 0xb8, /* movl $..., %eax */ + __NR_rt_sigreturn, + 0x80cd, /* int $0x80 */ + 0 +}; + +static int +__setup_frame(int sig, struct ksignal *ksig, sigset_t *set, + struct pt_regs *regs) +{ + struct sigframe __user *frame; + void __user *restorer; + void __user *fp = NULL; + + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_sigcontext(&frame->sc, fp, regs, set, Efault); + unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); + if (current->mm->context.vdso) + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_sigreturn; + else + restorer = &frame->retcode; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + + /* Set up to return from userspace. */ + unsafe_put_user(restorer, &frame->pretcode, Efault); + + /* + * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + unsafe_put_user(*((u64 *)&retcode), (u64 *)frame->retcode, Efault); + user_access_end(); + + /* Set up registers for signal handler */ + regs->sp = (unsigned long)frame; + regs->ip = (unsigned long)ksig->ka.sa.sa_handler; + regs->ax = (unsigned long)sig; + regs->dx = 0; + regs->cx = 0; + + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; + + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} + +static int __setup_rt_frame(int sig, struct ksignal *ksig, + sigset_t *set, struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + void __user *restorer; + void __user *fp = NULL; + + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_user(&frame->info, &frame->pinfo, Efault); + unsafe_put_user(&frame->uc, &frame->puc, Efault); + + /* Create the ucontext. */ + if (static_cpu_has(X86_FEATURE_XSAVE)) + unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); + else + unsafe_put_user(0, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + + /* Set up to return from userspace. */ + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, &frame->pretcode, Efault); + + /* + * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + + if (copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long)frame; + regs->ip = (unsigned long)ksig->ka.sa.sa_handler; + regs->ax = (unsigned long)sig; + regs->dx = (unsigned long)&frame->info; + regs->cx = (unsigned long)&frame->uc; + + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; + + return 0; +Efault: + user_access_end(); + return -EFAULT; +} +#else /* !CONFIG_X86_32 */ +static unsigned long frame_uc_flags(struct pt_regs *regs) +{ + unsigned long flags; + + if (boot_cpu_has(X86_FEATURE_XSAVE)) + flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; + else + flags = UC_SIGCONTEXT_SS; + + if (likely(user_64bit_mode(regs))) + flags |= UC_STRICT_RESTORE_SS; + + return flags; +} + +static int __setup_rt_frame(int sig, struct ksignal *ksig, + sigset_t *set, struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + void __user *fp = NULL; + unsigned long uc_flags; + + /* x86-64 should always use SA_RESTORER. */ + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; + + frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); + uc_flags = frame_uc_flags(regs); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + } + + /* Set up registers for signal handler */ + regs->di = sig; + /* In case the signal handler was declared without prototypes */ + regs->ax = 0; + + /* This also works for non SA_SIGINFO handlers because they expect the + next argument after the signal number on the stack. */ + regs->si = (unsigned long)&frame->info; + regs->dx = (unsigned long)&frame->uc; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + regs->sp = (unsigned long)frame; + + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. We also have a compatbility issue here: DOSEMU + * relies on the contents of the SS register indicating the + * SS value at the time of the signal, even though that code in + * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU + * avoids relying on sigreturn to restore SS; instead it uses + * a trampoline.) So we do our best: if the old SS was valid, + * we keep it. Otherwise we replace it. + */ + regs->cs = __USER_CS; + + if (unlikely(regs->ss != __USER_DS)) + force_valid_ss(regs); + + return 0; + +Efault: + user_access_end(); + return -EFAULT; +} +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_X32_ABI +static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); + if (from->si_signo == SIGCHLD) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + return 0; +} + +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + if (in_x32_syscall()) + return x32_copy_siginfo_to_user(to, from); + return __copy_siginfo_to_user32(to, from); +} +#endif /* CONFIG_X86_X32_ABI */ + +static int x32_setup_rt_frame(struct ksignal *ksig, + compat_sigset_t *set, + struct pt_regs *regs) +{ +#ifdef CONFIG_X86_X32_ABI + struct rt_sigframe_x32 __user *frame; + unsigned long uc_flags; + void __user *restorer; + void __user *fp = NULL; + + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; + + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); + + uc_flags = frame_uc_flags(regs); + + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; + + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + unsafe_put_user(0, &frame->uc.uc__pad0, Efault); + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + } + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + /* We use the x32 calling convention here... */ + regs->di = ksig->sig; + regs->si = (unsigned long) &frame->info; + regs->dx = (unsigned long) &frame->uc; + + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + regs->cs = __USER_CS; + regs->ss = __USER_DS; +#endif /* CONFIG_X86_X32_ABI */ + + return 0; +#ifdef CONFIG_X86_X32_ABI +Efault: + user_access_end(); + return -EFAULT; +#endif +} + +/* + * Do a signal return; undo the signal stack. + */ +#ifdef CONFIG_X86_32 +SYSCALL_DEFINE0(sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct sigframe __user *frame; + sigset_t set; + + frame = (struct sigframe __user *)(regs->sp - 8); + + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], &frame->sc.oldmask) || + __get_user(set.sig[1], &frame->extramask[0])) + goto badframe; + + set_current_blocked(&set); + + /* + * x86_32 has no uc_flags bits relevant to restore_sigcontext. + * Save a few cycles by skipping the __get_user. + */ + if (restore_sigcontext(regs, &frame->sc, 0)) + goto badframe; + return regs->ax; + +badframe: + signal_fault(regs, frame, "sigreturn"); + + return 0; +} +#endif /* CONFIG_X86_32 */ + +SYSCALL_DEFINE0(rt_sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe __user *frame; + sigset_t set; + unsigned long uc_flags; + + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) + goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; + + set_current_blocked(&set); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + goto badframe; + + if (restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return regs->ax; + +badframe: + signal_fault(regs, frame, "rt_sigreturn"); + return 0; +} + +static inline int is_ia32_compat_frame(struct ksignal *ksig) +{ + return IS_ENABLED(CONFIG_IA32_EMULATION) && + ksig->ka.sa.sa_flags & SA_IA32_ABI; +} + +static inline int is_ia32_frame(struct ksignal *ksig) +{ + return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); +} + +static inline int is_x32_frame(struct ksignal *ksig) +{ + return IS_ENABLED(CONFIG_X86_X32_ABI) && + ksig->ka.sa.sa_flags & SA_X32_ABI; +} + +static int +setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + int usig = ksig->sig; + sigset_t *set = sigmask_to_save(); + compat_sigset_t *cset = (compat_sigset_t *) set; + + /* Perform fixup for the pre-signal frame. */ + rseq_signal_deliver(ksig, regs); + + /* Set up the stack frame */ + if (is_ia32_frame(ksig)) { + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + return ia32_setup_rt_frame(usig, ksig, cset, regs); + else + return ia32_setup_frame(usig, ksig, cset, regs); + } else if (is_x32_frame(ksig)) { + return x32_setup_rt_frame(ksig, cset, regs); + } else { + return __setup_rt_frame(ksig->sig, ksig, set, regs); + } +} + +static void +handle_signal(struct ksignal *ksig, struct pt_regs *regs) +{ + bool stepping, failed; + struct fpu *fpu = ¤t->thread.fpu; + + if (v8086_mode(regs)) + save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); + + /* Are we from a system call? */ + if (syscall_get_nr(current, regs) >= 0) { + /* If so, check system call restarting.. */ + switch (syscall_get_error(current, regs)) { + case -ERESTART_RESTARTBLOCK: + case -ERESTARTNOHAND: + regs->ax = -EINTR; + break; + + case -ERESTARTSYS: + if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { + regs->ax = -EINTR; + break; + } + fallthrough; + case -ERESTARTNOINTR: + regs->ax = regs->orig_ax; + regs->ip -= 2; + break; + } + } + + /* + * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now + * so that register information in the sigcontext is correct and + * then notify the tracer before entering the signal handler. + */ + stepping = test_thread_flag(TIF_SINGLESTEP); + if (stepping) + user_disable_single_step(current); + + failed = (setup_rt_frame(ksig, regs) < 0); + if (!failed) { + /* + * Clear the direction flag as per the ABI for function entry. + * + * Clear RF when entering the signal handler, because + * it might disable possible debug exception from the + * signal handler. + * + * Clear TF for the case when it wasn't set by debugger to + * avoid the recursive send_sigtrap() in SIGTRAP handler. + */ + regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); + /* + * Ensure the signal handler starts with the new fpu state. + */ + fpu__clear_user_states(fpu); + } + signal_setup_done(failed, ksig, stepping); +} + +static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) +{ +#ifdef CONFIG_IA32_EMULATION + if (current_thread_info()->status & TS_COMPAT_RESTART) + return __NR_ia32_restart_syscall; +#endif +#ifdef CONFIG_X86_X32_ABI + return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#else + return __NR_restart_syscall; +#endif +} + +/* + * Note that 'init' is a special process: it doesn't get signals it doesn't + * want to handle. Thus you cannot kill init even with a SIGKILL even by + * mistake. + */ +void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) +{ + struct ksignal ksig; + + if (has_signal && get_signal(&ksig)) { + /* Whee! Actually deliver the signal. */ + handle_signal(&ksig, regs); + return; + } + + /* Did we come from a system call? */ + if (syscall_get_nr(current, regs) >= 0) { + /* Restart the system call - no handlers present */ + switch (syscall_get_error(current, regs)) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->ax = regs->orig_ax; + regs->ip -= 2; + break; + + case -ERESTART_RESTARTBLOCK: + regs->ax = get_nr_restart_syscall(regs); + regs->ip -= 2; + break; + } + } + + /* + * If there's no signal to deliver, we just put the saved sigmask + * back. + */ + restore_saved_sigmask(); +} + +void signal_fault(struct pt_regs *regs, void __user *frame, char *where) +{ + struct task_struct *me = current; + + if (show_unhandled_signals && printk_ratelimit()) { + printk("%s" + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); + print_vma_addr(KERN_CONT " in ", regs->ip); + pr_cont("\n"); + } + + force_sig(SIGSEGV); +} + +#ifdef CONFIG_X86_X32_ABI +COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe_x32 __user *frame; + sigset_t set; + unsigned long uc_flags; + + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); + + if (!access_ok(frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) + goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; + + set_current_blocked(&set); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + goto badframe; + + if (compat_restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return regs->ax; + +badframe: + signal_fault(regs, frame, "x32 rt_sigreturn"); + return 0; +} +#endif diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c new file mode 100644 index 000000000..a7f3e12cf --- /dev/null +++ b/arch/x86/kernel/signal_compat.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +/* + * The compat_siginfo_t structure and handing code is very easy + * to break in several ways. It must always be updated when new + * updates are made to the main siginfo_t, and + * copy_siginfo_to_user32() must be updated when the + * (arch-independent) copy_siginfo_to_user() is updated. + * + * It is also easy to put a new member in the compat_siginfo_t + * which has implicit alignment which can move internal structure + * alignment around breaking the ABI. This can happen if you, + * for instance, put a plain 64-bit value in there. + */ +static inline void signal_compat_build_tests(void) +{ + int _sifields_offset = offsetof(compat_siginfo_t, _sifields); + + /* + * If adding a new si_code, there is probably new data in + * the siginfo. Make sure folks bumping the si_code + * limits also have to look at this code. Make sure any + * new fields are handled in copy_siginfo_to_user32()! + */ + BUILD_BUG_ON(NSIGILL != 11); + BUILD_BUG_ON(NSIGFPE != 15); + BUILD_BUG_ON(NSIGSEGV != 9); + BUILD_BUG_ON(NSIGBUS != 5); + BUILD_BUG_ON(NSIGTRAP != 5); + BUILD_BUG_ON(NSIGCHLD != 6); + BUILD_BUG_ON(NSIGSYS != 1); + + /* This is part of the ABI and can never change in size: */ + BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128); + /* + * The offsets of all the (unioned) si_fields are fixed + * in the ABI, of course. Make sure none of them ever + * move and are always at the beginning: + */ + BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int)); +#define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name)) + + BUILD_BUG_ON(offsetof(siginfo_t, si_signo) != 0); + BUILD_BUG_ON(offsetof(siginfo_t, si_errno) != 4); + BUILD_BUG_ON(offsetof(siginfo_t, si_code) != 8); + + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_signo) != 0); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_errno) != 4); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_code) != 8); + /* + * Ensure that the size of each si_field never changes. + * If it does, it is a sign that the + * copy_siginfo_to_user32() code below needs to updated + * along with the size in the CHECK_SI_SIZE(). + * + * We repeat this check for both the generic and compat + * siginfos. + * + * Note: it is OK for these to grow as long as the whole + * structure stays within the padding size (checked + * above). + */ +#define CHECK_CSI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((compat_siginfo_t *)0)->_sifields.name)) +#define CHECK_SI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((siginfo_t *)0)->_sifields.name)) + + CHECK_CSI_OFFSET(_kill); + CHECK_CSI_SIZE (_kill, 2*sizeof(int)); + CHECK_SI_SIZE (_kill, 2*sizeof(int)); + + BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0xC); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10); + + CHECK_CSI_OFFSET(_timer); + CHECK_CSI_SIZE (_timer, 3*sizeof(int)); + CHECK_SI_SIZE (_timer, 6*sizeof(int)); + + BUILD_BUG_ON(offsetof(siginfo_t, si_tid) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_overrun) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_tid) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_overrun) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14); + + CHECK_CSI_OFFSET(_rt); + CHECK_CSI_SIZE (_rt, 3*sizeof(int)); + CHECK_SI_SIZE (_rt, 4*sizeof(int)); + + BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14); + + CHECK_CSI_OFFSET(_sigchld); + CHECK_CSI_SIZE (_sigchld, 5*sizeof(int)); + CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); + + BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_status) != 0x18); + BUILD_BUG_ON(offsetof(siginfo_t, si_utime) != 0x20); + BUILD_BUG_ON(offsetof(siginfo_t, si_stime) != 0x28); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_status) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_utime) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_stime) != 0x1C); + +#ifdef CONFIG_X86_X32_ABI + CHECK_CSI_OFFSET(_sigchld_x32); + CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int)); + /* no _sigchld_x32 in the generic siginfo_t */ + BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) != 0x20); +#endif + + CHECK_CSI_OFFSET(_sigfault); + CHECK_CSI_SIZE (_sigfault, 4*sizeof(int)); + CHECK_SI_SIZE (_sigfault, 8*sizeof(int)); + + BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C); + + BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10); + + BUILD_BUG_ON(offsetof(siginfo_t, si_lower) != 0x20); + BUILD_BUG_ON(offsetof(siginfo_t, si_upper) != 0x28); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_lower) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_upper) != 0x18); + + BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14); + + CHECK_CSI_OFFSET(_sigpoll); + CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); + CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); + + BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_band) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_fd) != 0x10); + + CHECK_CSI_OFFSET(_sigsys); + CHECK_CSI_SIZE (_sigsys, 3*sizeof(int)); + CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); + + BUILD_BUG_ON(offsetof(siginfo_t, si_call_addr) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_syscall) != 0x18); + BUILD_BUG_ON(offsetof(siginfo_t, si_arch) != 0x1C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_call_addr) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_syscall) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_arch) != 0x14); + + /* any new si_fields should be added here */ +} + +void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) +{ + signal_compat_build_tests(); + + /* Don't leak in-kernel non-uapi flags to user-space */ + if (oact) + oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); + + if (!act) + return; + + /* Don't let flags to be set from userspace */ + act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); + + if (in_ia32_syscall()) + act->sa.sa_flags |= SA_IA32_ABI; + if (in_x32_syscall()) + act->sa.sa_flags |= SA_X32_ABI; +} diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c new file mode 100644 index 000000000..95758ae12 --- /dev/null +++ b/arch/x86/kernel/smp.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Intel SMP support routines. + * + * (c) 1995 Alan Cox, Building #3 + * (c) 1998-99, 2000, 2009 Ingo Molnar + * (c) 2002,2003 Andi Kleen, SuSE Labs. + * + * i386 and x86_64 integration by Glauber Costa + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Some notes on x86 processor bugs affecting SMP operation: + * + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. + * The Linux implications for SMP are handled as follows: + * + * Pentium III / [Xeon] + * None of the E1AP-E3AP errata are visible to the user. + * + * E1AP. see PII A1AP + * E2AP. see PII A2AP + * E3AP. see PII A3AP + * + * Pentium II / [Xeon] + * None of the A1AP-A3AP errata are visible to the user. + * + * A1AP. see PPro 1AP + * A2AP. see PPro 2AP + * A3AP. see PPro 7AP + * + * Pentium Pro + * None of 1AP-9AP errata are visible to the normal user, + * except occasional delivery of 'spurious interrupt' as trap #15. + * This is very rare and a non-problem. + * + * 1AP. Linux maps APIC as non-cacheable + * 2AP. worked around in hardware + * 3AP. fixed in C0 and above steppings microcode update. + * Linux does not use excessive STARTUP_IPIs. + * 4AP. worked around in hardware + * 5AP. symmetric IO mode (normal Linux operation) not affected. + * 'noapic' mode has vector 0xf filled out properly. + * 6AP. 'noapic' mode might be affected - fixed in later steppings + * 7AP. We do not assume writes to the LVT deassering IRQs + * 8AP. We do not enable low power mode (deep sleep) during MP bootup + * 9AP. We do not use mixed mode + * + * Pentium + * There is a marginal case where REP MOVS on 100MHz SMP + * machines with B stepping processors can fail. XXX should provide + * an L1cache=Writethrough or L1cache=off option. + * + * B stepping CPUs may hang. There are hardware work arounds + * for this. We warn about it in case your board doesn't have the work + * arounds. Basically that's so I can tell anyone with a B stepping + * CPU and SMP problems "tough". + * + * Specific items [From Pentium Processor Specification Update] + * + * 1AP. Linux doesn't use remote read + * 2AP. Linux doesn't trust APIC errors + * 3AP. We work around this + * 4AP. Linux never generated 3 interrupts of the same priority + * to cause a lost local interrupt. + * 5AP. Remote read is never used + * 6AP. not affected - worked around in hardware + * 7AP. not affected - worked around in hardware + * 8AP. worked around in hardware - we get explicit CS errors if not + * 9AP. only 'noapic' mode affected. Might generate spurious + * interrupts, we log only the first one and count the + * rest silently. + * 10AP. not affected - worked around in hardware + * 11AP. Linux reads the APIC between writes to avoid this, as per + * the documentation. Make sure you preserve this as it affects + * the C stepping chips too. + * 12AP. not affected - worked around in hardware + * 13AP. not affected - worked around in hardware + * 14AP. we always deassert INIT during bootup + * 15AP. not affected - worked around in hardware + * 16AP. not affected - worked around in hardware + * 17AP. not affected - worked around in hardware + * 18AP. not affected - worked around in hardware + * 19AP. not affected - worked around in BIOS + * + * If this sounds worrying believe me these bugs are either ___RARE___, + * or are signal timing bugs worked around in hardware and there's + * about nothing of note with C stepping upwards. + */ + +static atomic_t stopping_cpu = ATOMIC_INIT(-1); +static bool smp_no_nmi_ipi = false; + +static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) +{ + /* We are registered on stopping cpu too, avoid spurious NMI */ + if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) + return NMI_HANDLED; + + cpu_emergency_disable_virtualization(); + stop_this_cpu(NULL); + + return NMI_HANDLED; +} + +/* + * this function calls the 'stop' function on all other CPUs in the system. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) +{ + ack_APIC_irq(); + cpu_emergency_disable_virtualization(); + stop_this_cpu(NULL); +} + +static int register_stop_handler(void) +{ + return register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, + NMI_FLAG_FIRST, "smp_stop"); +} + +static void native_stop_other_cpus(int wait) +{ + unsigned long flags; + unsigned long timeout; + + if (reboot_force) + return; + + /* + * Use an own vector here because smp_call_function + * does lots of things not suitable in a panic situation. + */ + + /* + * We start by using the REBOOT_VECTOR irq. + * The irq is treated as a sync point to allow critical + * regions of code on other cpus to release their spin locks + * and re-enable irqs. Jumping straight to an NMI might + * accidentally cause deadlocks with further shutdown/panic + * code. By syncing, we give the cpus up to one second to + * finish their work before we force them off with the NMI. + */ + if (num_online_cpus() > 1) { + /* did someone beat us here? */ + if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) + return; + + /* sync above data before sending IRQ */ + wmb(); + + apic_send_IPI_allbutself(REBOOT_VECTOR); + + /* + * Don't wait longer than a second for IPI completion. The + * wait request is not checked here because that would + * prevent an NMI shutdown attempt in case that not all + * CPUs reach shutdown state. + */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && timeout--) + udelay(1); + } + + /* if the REBOOT_VECTOR didn't work, try with the NMI */ + if (num_online_cpus() > 1) { + /* + * If NMI IPI is enabled, try to register the stop handler + * and send the IPI. In any case try to wait for the other + * CPUs to stop. + */ + if (!smp_no_nmi_ipi && !register_stop_handler()) { + /* Sync above data before sending IRQ */ + wmb(); + + pr_emerg("Shutting down cpus with NMI\n"); + + apic_send_IPI_allbutself(NMI_VECTOR); + } + /* + * Don't wait longer than 10 ms if the caller didn't + * reqeust it. If wait is true, the machine hangs here if + * one or more CPUs do not reach shutdown state. + */ + timeout = USEC_PER_MSEC * 10; + while (num_online_cpus() > 1 && (wait || timeout--)) + udelay(1); + } + + local_irq_save(flags); + disable_local_APIC(); + mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); + local_irq_restore(flags); +} + +/* + * Reschedule call back. KVM uses this interrupt to force a cpu out of + * guest mode. + */ +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) +{ + ack_APIC_irq(); + trace_reschedule_entry(RESCHEDULE_VECTOR); + inc_irq_stat(irq_resched_count); + scheduler_ipi(); + trace_reschedule_exit(RESCHEDULE_VECTOR); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) +{ + ack_APIC_irq(); + trace_call_function_entry(CALL_FUNCTION_VECTOR); + inc_irq_stat(irq_call_count); + generic_smp_call_function_interrupt(); + trace_call_function_exit(CALL_FUNCTION_VECTOR); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single) +{ + ack_APIC_irq(); + trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); + inc_irq_stat(irq_call_count); + generic_smp_call_function_single_interrupt(); + trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); +} + +static int __init nonmi_ipi_setup(char *str) +{ + smp_no_nmi_ipi = true; + return 1; +} + +__setup("nonmi_ipi", nonmi_ipi_setup); + +struct smp_ops smp_ops = { + .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, + .smp_prepare_cpus = native_smp_prepare_cpus, + .smp_cpus_done = native_smp_cpus_done, + + .stop_other_cpus = native_stop_other_cpus, +#if defined(CONFIG_KEXEC_CORE) + .crash_stop_other_cpus = kdump_nmi_shootdown_cpus, +#endif + .smp_send_reschedule = native_smp_send_reschedule, + + .cpu_up = native_cpu_up, + .cpu_die = native_cpu_die, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, + + .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_single_ipi = native_send_call_func_single_ipi, +}; +EXPORT_SYMBOL_GPL(smp_ops); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c new file mode 100644 index 000000000..d2403da17 --- /dev/null +++ b/arch/x86/kernel/smpboot.c @@ -0,0 +1,2122 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + /* + * x86 SMP booting functions + * + * (c) 1995 Alan Cox, Building #3 + * (c) 1998, 1999, 2000, 2009 Ingo Molnar + * Copyright 2001 Andi Kleen, SuSE Labs. + * + * Much of the core SMP work is based on previous work by Thomas Radke, to + * whom a great many thanks are extended. + * + * Thanks to Intel for making available several different Pentium, + * Pentium Pro and Pentium-II/Xeon MP machines. + * Original development of Linux SMP code supported by Caldera. + * + * Fixes + * Felix Koop : NR_CPUS used properly + * Jose Renau : Handle single CPU case. + * Alan Cox : By repeated request 8) - Total BogoMIPS report. + * Greg Wright : Fix for kernel stacks panic. + * Erich Boleyn : MP v1.4 and additional changes. + * Matthias Sattler : Changes for 2.1 kernel map. + * Michel Lespinasse : Changes for 2.1 kernel map. + * Michael Chastain : Change trampoline.S to gnu as. + * Alan Cox : Dumb bug: 'B' step PPro's are fine + * Ingo Molnar : Added APIC timers, based on code + * from Jose Renau + * Ingo Molnar : various cleanups and rewrites + * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. + * Maciej W. Rozycki : Bits for genuine 82489DX APICs + * Andi Kleen : Changed for SMP boot into long mode. + * Martin J. Bligh : Added support for multi-quad systems + * Dave Jones : Report invalid combinations of Athlon CPUs. + * Rusty Russell : Hacked into shape for new "hotplug" boot process. + * Andi Kleen : Converted to new state machine. + * Ashok Raj : CPU hotplug support + * Glauber Costa : i386 and x86_64 integration + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* representing HT siblings of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); +EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); + +/* representing HT and core siblings of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); +EXPORT_PER_CPU_SYMBOL(cpu_core_map); + +/* representing HT, core, and die siblings of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); +EXPORT_PER_CPU_SYMBOL(cpu_die_map); + +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); + +/* Per CPU bogomips and other parameters */ +DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); +EXPORT_PER_CPU_SYMBOL(cpu_info); + +struct mwait_cpu_dead { + unsigned int control; + unsigned int status; +}; + +/* + * Cache line aligned data for mwait_play_dead(). Separate on purpose so + * that it's unlikely to be touched by other CPUs. + */ +static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); + +/* Logical package management. We might want to allocate that dynamically */ +unsigned int __max_logical_packages __read_mostly; +EXPORT_SYMBOL(__max_logical_packages); +static unsigned int logical_packages __read_mostly; +static unsigned int logical_die __read_mostly; + +/* Maximum number of SMT threads on any online core */ +int __read_mostly __max_smt_threads = 1; + +/* Flag to indicate if a complete sched domain rebuild is required */ +bool x86_topology_update; + +int arch_update_cpu_topology(void) +{ + int retval = x86_topology_update; + + x86_topology_update = false; + return retval; +} + +static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) +{ + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); + CMOS_WRITE(0xa, 0xf); + spin_unlock_irqrestore(&rtc_lock, flags); + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + start_eip >> 4; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + start_eip & 0xf; +} + +static inline void smpboot_restore_warm_reset_vector(void) +{ + unsigned long flags; + + /* + * Paranoid: Set warm reset code and vector here back + * to default values. + */ + spin_lock_irqsave(&rtc_lock, flags); + CMOS_WRITE(0, 0xf); + spin_unlock_irqrestore(&rtc_lock, flags); + + *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; +} + +static void init_freq_invariance(bool secondary); + +/* + * Report back to the Boot Processor during boot time or to the caller processor + * during CPU online. + */ +static void smp_callin(void) +{ + int cpuid; + + /* + * If waken up by an INIT in an 82489DX configuration + * cpu_callout_mask guarantees we don't get here before + * an INIT_deassert IPI reaches our local APIC, so it is + * now safe to touch our local APIC. + */ + cpuid = smp_processor_id(); + + /* + * the boot CPU has finished the init stage and is spinning + * on callin_map until we finish. We are free to set up this + * CPU, first the APIC. (this is probably redundant on most + * boards) + */ + apic_ap_setup(); + + /* + * Save our processor parameters. Note: this information + * is needed for clock calibration. + */ + smp_store_cpu_info(cpuid); + + /* + * The topology information must be up to date before + * calibrate_delay() and notify_cpu_starting(). + */ + set_cpu_sibling_map(raw_smp_processor_id()); + + init_freq_invariance(true); + + /* + * Get our bogomips. + * Update loops_per_jiffy in cpu_data. Previous call to + * smp_store_cpu_info() stored a value that is close but not as + * accurate as the value just calculated. + */ + calibrate_delay(); + cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; + pr_debug("Stack at about %p\n", &cpuid); + + wmb(); + + notify_cpu_starting(cpuid); + + /* + * Allow the master to continue. + */ + cpumask_set_cpu(cpuid, cpu_callin_mask); +} + +static int cpu0_logical_apicid; +static int enable_start_cpu0; +/* + * Activate a secondary processor. + */ +static void notrace start_secondary(void *unused) +{ + /* + * Don't put *anything* except direct CPU state initialization + * before cpu_init(), SMP booting is too fragile that we want to + * limit the things done here to the most necessary things. + */ + cr4_init(); + +#ifdef CONFIG_X86_32 + /* switch away from the initial page table */ + load_cr3(swapper_pg_dir); + __flush_tlb_all(); +#endif + cpu_init_secondary(); + rcu_cpu_starting(raw_smp_processor_id()); + x86_cpuinit.early_percpu_clock_init(); + smp_callin(); + + enable_start_cpu0 = 0; + + /* otherwise gcc will move up smp_processor_id before the cpu_init */ + barrier(); + /* + * Check TSC synchronization with the boot CPU: + */ + check_tsc_sync_target(); + + speculative_store_bypass_ht_init(); + + /* + * Lock vector_lock, set CPU online and bring the vector + * allocator online. Online must be set with vector_lock held + * to prevent a concurrent irq setup/teardown from seeing a + * half valid vector space. + */ + lock_vector_lock(); + set_cpu_online(smp_processor_id(), true); + lapic_online(); + unlock_vector_lock(); + cpu_set_state_online(smp_processor_id()); + x86_platform.nmi_init(); + + /* enable local interrupts */ + local_irq_enable(); + + x86_cpuinit.setup_percpu_clockev(); + + wmb(); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); +} + +/** + * topology_is_primary_thread - Check whether CPU is the primary SMT thread + * @cpu: CPU to check + */ +bool topology_is_primary_thread(unsigned int cpu) +{ + return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu)); +} + +/** + * topology_smt_supported - Check whether SMT is supported by the CPUs + */ +bool topology_smt_supported(void) +{ + return smp_num_siblings > 1; +} + +/** + * topology_phys_to_logical_pkg - Map a physical package id to a logical + * + * Returns logical package id or -1 if not found + */ +int topology_phys_to_logical_pkg(unsigned int phys_pkg) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && c->phys_proc_id == phys_pkg) + return c->logical_proc_id; + } + return -1; +} +EXPORT_SYMBOL(topology_phys_to_logical_pkg); +/** + * topology_phys_to_logical_die - Map a physical die id to logical + * + * Returns logical die id or -1 if not found + */ +int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) +{ + int cpu; + int proc_id = cpu_data(cur_cpu).phys_proc_id; + + for_each_possible_cpu(cpu) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && c->cpu_die_id == die_id && + c->phys_proc_id == proc_id) + return c->logical_die_id; + } + return -1; +} +EXPORT_SYMBOL(topology_phys_to_logical_die); + +/** + * topology_update_package_map - Update the physical to logical package map + * @pkg: The physical package id as retrieved via CPUID + * @cpu: The cpu for which this is updated + */ +int topology_update_package_map(unsigned int pkg, unsigned int cpu) +{ + int new; + + /* Already available somewhere? */ + new = topology_phys_to_logical_pkg(pkg); + if (new >= 0) + goto found; + + new = logical_packages++; + if (new != pkg) { + pr_info("CPU %u Converting physical %u to logical package %u\n", + cpu, pkg, new); + } +found: + cpu_data(cpu).logical_proc_id = new; + return 0; +} +/** + * topology_update_die_map - Update the physical to logical die map + * @die: The die id as retrieved via CPUID + * @cpu: The cpu for which this is updated + */ +int topology_update_die_map(unsigned int die, unsigned int cpu) +{ + int new; + + /* Already available somewhere? */ + new = topology_phys_to_logical_die(die, cpu); + if (new >= 0) + goto found; + + new = logical_die++; + if (new != die) { + pr_info("CPU %u Converting physical %u to logical die %u\n", + cpu, die, new); + } +found: + cpu_data(cpu).logical_die_id = new; + return 0; +} + +void __init smp_store_boot_cpu_info(void) +{ + int id = 0; /* CPU 0 */ + struct cpuinfo_x86 *c = &cpu_data(id); + + *c = boot_cpu_data; + c->cpu_index = id; + topology_update_package_map(c->phys_proc_id, id); + topology_update_die_map(c->cpu_die_id, id); + c->initialized = true; +} + +/* + * The bootstrap kernel entry code has set these up. Save them for + * a given CPU + */ +void smp_store_cpu_info(int id) +{ + struct cpuinfo_x86 *c = &cpu_data(id); + + /* Copy boot_cpu_data only on the first bringup */ + if (!c->initialized) + *c = boot_cpu_data; + c->cpu_index = id; + /* + * During boot time, CPU0 has this setup already. Save the info when + * bringing up AP or offlined CPU0. + */ + identify_secondary_cpu(c); + c->initialized = true; +} + +static bool +topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + return (cpu_to_node(cpu1) == cpu_to_node(cpu2)); +} + +static bool +topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + return !WARN_ONCE(!topology_same_node(c, o), + "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " + "[node: %d != %d]. Ignoring dependency.\n", + cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); +} + +#define link_mask(mfunc, c1, c2) \ +do { \ + cpumask_set_cpu((c1), mfunc(c2)); \ + cpumask_set_cpu((c2), mfunc(c1)); \ +} while (0) + +static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id && + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { + if (c->cpu_core_id == o->cpu_core_id) + return topology_sane(c, o, "smt"); + + if ((c->cu_id != 0xff) && + (o->cu_id != 0xff) && + (c->cu_id == o->cu_id)) + return topology_sane(c, o, "smt"); + } + + } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id && + c->cpu_core_id == o->cpu_core_id) { + return topology_sane(c, o, "smt"); + } + + return false; +} + +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id) + return true; + return false; +} + +/* + * Unlike the other levels, we do not enforce keeping a + * multicore group inside a NUMA node. If this happens, we will + * discard the MC level of the topology later. + */ +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id) + return true; + return false; +} + +/* + * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs. + * + * Any Intel CPU that has multiple nodes per package and does not + * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology. + * + * When in SNC mode, these CPUs enumerate an LLC that is shared + * by multiple NUMA nodes. The LLC is shared for off-package data + * access but private to the NUMA node (half of the package) for + * on-package access. CPUID (the source of the information about + * the LLC) can only enumerate the cache as shared or unshared, + * but not this particular configuration. + */ + +static const struct x86_cpu_id intel_cod_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */ + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */ + X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */ + {} +}; + +static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu); + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + bool intel_snc = id && id->driver_data; + + /* Do not match if we do not have a valid APICID for cpu: */ + if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID) + return false; + + /* Do not match if LLC id does not match: */ + if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2)) + return false; + + /* + * Allow the SNC topology without warning. Return of false + * means 'c' does not share the LLC of 'o'. This will be + * reflected to userspace. + */ + if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc) + return false; + + return topology_sane(c, o, "llc"); +} + + +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) +static inline int x86_sched_itmt_flags(void) +{ + return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0; +} + +#ifdef CONFIG_SCHED_MC +static int x86_core_flags(void) +{ + return cpu_core_flags() | x86_sched_itmt_flags(); +} +#endif +#ifdef CONFIG_SCHED_SMT +static int x86_smt_flags(void) +{ + return cpu_smt_flags() | x86_sched_itmt_flags(); +} +#endif +#endif + +static struct sched_domain_topology_level x86_numa_in_package_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, +#endif + { NULL, }, +}; + +static struct sched_domain_topology_level x86_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +/* + * Set if a package/die has multiple NUMA nodes inside. + * AMD Magny-Cours, Intel Cluster-on-Die, and Intel + * Sub-NUMA Clustering have this. + */ +static bool x86_has_numa_in_package; + +void set_cpu_sibling_map(int cpu) +{ + bool has_smt = smp_num_siblings > 1; + bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1; + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct cpuinfo_x86 *o; + int i, threads; + + cpumask_set_cpu(cpu, cpu_sibling_setup_mask); + + if (!has_mp) { + cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); + cpumask_set_cpu(cpu, topology_die_cpumask(cpu)); + c->booted_cores = 1; + return; + } + + for_each_cpu(i, cpu_sibling_setup_mask) { + o = &cpu_data(i); + + if (match_pkg(c, o) && !topology_same_node(c, o)) + x86_has_numa_in_package = true; + + if ((i == cpu) || (has_smt && match_smt(c, o))) + link_mask(topology_sibling_cpumask, cpu, i); + + if ((i == cpu) || (has_mp && match_llc(c, o))) + link_mask(cpu_llc_shared_mask, cpu, i); + + if ((i == cpu) || (has_mp && match_die(c, o))) + link_mask(topology_die_cpumask, cpu, i); + } + + threads = cpumask_weight(topology_sibling_cpumask(cpu)); + if (threads > __max_smt_threads) + __max_smt_threads = threads; + + /* + * This needs a separate iteration over the cpus because we rely on all + * topology_sibling_cpumask links to be set-up. + */ + for_each_cpu(i, cpu_sibling_setup_mask) { + o = &cpu_data(i); + + if ((i == cpu) || (has_mp && match_pkg(c, o))) { + link_mask(topology_core_cpumask, cpu, i); + + /* + * Does this new cpu bringup a new core? + */ + if (threads == 1) { + /* + * for each core in package, increment + * the booted_cores for this new cpu + */ + if (cpumask_first( + topology_sibling_cpumask(i)) == i) + c->booted_cores++; + /* + * increment the core count for all + * the other cpus in this package + */ + if (i != cpu) + cpu_data(i).booted_cores++; + } else if (i != cpu && !c->booted_cores) + c->booted_cores = cpu_data(i).booted_cores; + } + } +} + +/* maps the cpu to the sched domain representing multi-core */ +const struct cpumask *cpu_coregroup_mask(int cpu) +{ + return cpu_llc_shared_mask(cpu); +} + +static void impress_friends(void) +{ + int cpu; + unsigned long bogosum = 0; + /* + * Allow the user to impress friends. + */ + pr_debug("Before bogomips\n"); + for_each_possible_cpu(cpu) + if (cpumask_test_cpu(cpu, cpu_callout_mask)) + bogosum += cpu_data(cpu).loops_per_jiffy; + pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n", + num_online_cpus(), + bogosum/(500000/HZ), + (bogosum/(5000/HZ))%100); + + pr_debug("Before bogocount - setting activated=1\n"); +} + +void __inquire_remote_apic(int apicid) +{ + unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; + const char * const names[] = { "ID", "VERSION", "SPIV" }; + int timeout; + u32 status; + + pr_info("Inquiring remote APIC 0x%x...\n", apicid); + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + pr_info("... APIC 0x%x %s: ", apicid, names[i]); + + /* + * Wait for idle. + */ + status = safe_apic_wait_icr_idle(); + if (status) + pr_cont("a previous APIC delivery may have failed\n"); + + apic_icr_write(APIC_DM_REMRD | regs[i], apicid); + + timeout = 0; + do { + udelay(100); + status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; + } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); + + switch (status) { + case APIC_ICR_RR_VALID: + status = apic_read(APIC_RRR); + pr_cont("%08x\n", status); + break; + default: + pr_cont("failed\n"); + } + } +} + +/* + * The Multiprocessor Specification 1.4 (1997) example code suggests + * that there should be a 10ms delay between the BSP asserting INIT + * and de-asserting INIT, when starting a remote processor. + * But that slows boot and resume on modern processors, which include + * many cores and don't require that delay. + * + * Cmdline "init_cpu_udelay=" is available to over-ride this delay. + * Modern processor families are quirked to remove the delay entirely. + */ +#define UDELAY_10MS_DEFAULT 10000 + +static unsigned int init_udelay = UINT_MAX; + +static int __init cpu_init_udelay(char *str) +{ + get_option(&str, &init_udelay); + + return 0; +} +early_param("cpu_init_udelay", cpu_init_udelay); + +static void __init smp_quirk_init_udelay(void) +{ + /* if cmdline changed it from default, leave it alone */ + if (init_udelay != UINT_MAX) + return; + + /* if modern processor, use no delay */ + if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || + ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) || + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { + init_udelay = 0; + return; + } + /* else, use legacy delay */ + init_udelay = UDELAY_10MS_DEFAULT; +} + +/* + * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal + * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this + * won't ... remember to clear down the APIC, etc later. + */ +int +wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) +{ + unsigned long send_status, accept_status = 0; + int maxlvt; + + /* Target chip */ + /* Boot on the stack */ + /* Kick the second */ + apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid); + + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); + if (APIC_INTEGRATED(boot_cpu_apic_version)) { + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + accept_status = (apic_read(APIC_ESR) & 0xEF); + } + pr_debug("NMI sent\n"); + + if (send_status) + pr_err("APIC never delivered???\n"); + if (accept_status) + pr_err("APIC delivery error (%lx)\n", accept_status); + + return (send_status | accept_status); +} + +static int +wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) +{ + unsigned long send_status = 0, accept_status = 0; + int maxlvt, num_starts, j; + + maxlvt = lapic_get_maxlvt(); + + /* + * Be paranoid about clearing APIC errors. + */ + if (APIC_INTEGRATED(boot_cpu_apic_version)) { + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } + + pr_debug("Asserting INIT\n"); + + /* + * Turn INIT on target chip + */ + /* + * Send IPI + */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, + phys_apicid); + + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + udelay(init_udelay); + + pr_debug("Deasserting INIT\n"); + + /* Target chip */ + /* Send IPI */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); + + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + mb(); + + /* + * Should we send STARTUP IPIs ? + * + * Determine this based on the APIC version. + * If we don't have an integrated APIC, don't send the STARTUP IPIs. + */ + if (APIC_INTEGRATED(boot_cpu_apic_version)) + num_starts = 2; + else + num_starts = 0; + + /* + * Run STARTUP IPI loop. + */ + pr_debug("#startup loops: %d\n", num_starts); + + for (j = 1; j <= num_starts; j++) { + pr_debug("Sending STARTUP #%d\n", j); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + pr_debug("After apic_write\n"); + + /* + * STARTUP IPI + */ + + /* Target chip */ + /* Boot on the stack */ + /* Kick the second */ + apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), + phys_apicid); + + /* + * Give the other CPU some time to accept the IPI. + */ + if (init_udelay == 0) + udelay(10); + else + udelay(300); + + pr_debug("Startup point 1\n"); + + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + /* + * Give the other CPU some time to accept the IPI. + */ + if (init_udelay == 0) + udelay(10); + else + udelay(200); + + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + accept_status = (apic_read(APIC_ESR) & 0xEF); + if (send_status || accept_status) + break; + } + pr_debug("After Startup\n"); + + if (send_status) + pr_err("APIC never delivered???\n"); + if (accept_status) + pr_err("APIC delivery error (%lx)\n", accept_status); + + return (send_status | accept_status); +} + +/* reduce the number of lines printed when booting a large cpu count system */ +static void announce_cpu(int cpu, int apicid) +{ + static int current_node = NUMA_NO_NODE; + int node = early_cpu_to_node(cpu); + static int width, node_width; + + if (!width) + width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */ + + if (!node_width) + node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */ + + if (cpu == 1) + printk(KERN_INFO "x86: Booting SMP configuration:\n"); + + if (system_state < SYSTEM_RUNNING) { + if (node != current_node) { + if (current_node > (-1)) + pr_cont("\n"); + current_node = node; + + printk(KERN_INFO ".... node %*s#%d, CPUs: ", + node_width - num_digits(node), " ", node); + } + + /* Add padding for the BSP */ + if (cpu == 1) + pr_cont("%*s", width + 1, " "); + + pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu); + + } else + pr_info("Booting Node %d Processor %d APIC 0x%x\n", + node, cpu, apicid); +} + +static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs) +{ + int cpu; + + cpu = smp_processor_id(); + if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0) + return NMI_HANDLED; + + return NMI_DONE; +} + +/* + * Wake up AP by INIT, INIT, STARTUP sequence. + * + * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS + * boot-strap code which is not a desired behavior for waking up BSP. To + * void the boot-strap code, wake up CPU0 by NMI instead. + * + * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined + * (i.e. physically hot removed and then hot added), NMI won't wake it up. + * We'll change this code in the future to wake up hard offlined CPU0 if + * real platform and request are available. + */ +static int +wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, + int *cpu0_nmi_registered) +{ + int id; + int boot_error; + + preempt_disable(); + + /* + * Wake up AP by INIT, INIT, STARTUP sequence. + */ + if (cpu) { + boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); + goto out; + } + + /* + * Wake up BSP by nmi. + * + * Register a NMI handler to help wake up CPU0. + */ + boot_error = register_nmi_handler(NMI_LOCAL, + wakeup_cpu0_nmi, 0, "wake_cpu0"); + + if (!boot_error) { + enable_start_cpu0 = 1; + *cpu0_nmi_registered = 1; + if (apic->dest_logical == APIC_DEST_LOGICAL) + id = cpu0_logical_apicid; + else + id = apicid; + boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); + } + +out: + preempt_enable(); + + return boot_error; +} + +int common_cpu_up(unsigned int cpu, struct task_struct *idle) +{ + int ret; + + /* Just in case we booted with a single CPU. */ + alternatives_enable_smp(); + + per_cpu(current_task, cpu) = idle; + cpu_init_stack_canary(cpu, idle); + + /* Initialize the interrupt stack(s) */ + ret = irq_init_percpu_irqstack(cpu); + if (ret) + return ret; + +#ifdef CONFIG_X86_32 + /* Stack for startup_32 can be just as for start_secondary onwards */ + per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); +#else + initial_gs = per_cpu_offset(cpu); +#endif + return 0; +} + +/* + * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad + * (ie clustered apic addressing mode), this is a LOGICAL apic ID. + * Returns zero if CPU booted OK, else error code from + * ->wakeup_secondary_cpu. + */ +static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, + int *cpu0_nmi_registered) +{ + /* start_ip had better be page-aligned! */ + unsigned long start_ip = real_mode_header->trampoline_start; + + unsigned long boot_error = 0; + unsigned long timeout; + + idle->thread.sp = (unsigned long)task_pt_regs(idle); + early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); + initial_code = (unsigned long)start_secondary; + initial_stack = idle->thread.sp; + + /* Enable the espfix hack for this CPU */ + init_espfix_ap(cpu); + + /* So we see what's up */ + announce_cpu(cpu, apicid); + + /* + * This grunge runs the startup process for + * the targeted processor. + */ + + if (x86_platform.legacy.warm_reset) { + + pr_debug("Setting warm reset code and vector.\n"); + + smpboot_setup_warm_reset_vector(start_ip); + /* + * Be paranoid about clearing APIC errors. + */ + if (APIC_INTEGRATED(boot_cpu_apic_version)) { + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } + } + + /* + * AP might wait on cpu_callout_mask in cpu_init() with + * cpu_initialized_mask set if previous attempt to online + * it timed-out. Clear cpu_initialized_mask so that after + * INIT/SIPI it could start with a clean state. + */ + cpumask_clear_cpu(cpu, cpu_initialized_mask); + smp_mb(); + + /* + * Wake up a CPU in difference cases: + * - Use the method in the APIC driver if it's defined + * Otherwise, + * - Use an INIT boot APIC message for APs or NMI for BSP. + */ + if (apic->wakeup_secondary_cpu) + boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); + else + boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, + cpu0_nmi_registered); + + if (!boot_error) { + /* + * Wait 10s total for first sign of life from AP + */ + boot_error = -1; + timeout = jiffies + 10*HZ; + while (time_before(jiffies, timeout)) { + if (cpumask_test_cpu(cpu, cpu_initialized_mask)) { + /* + * Tell AP to proceed with initialization + */ + cpumask_set_cpu(cpu, cpu_callout_mask); + boot_error = 0; + break; + } + schedule(); + } + } + + if (!boot_error) { + /* + * Wait till AP completes initial initialization + */ + while (!cpumask_test_cpu(cpu, cpu_callin_mask)) { + /* + * Allow other tasks to run while we wait for the + * AP to come online. This also gives a chance + * for the MTRR work(triggered by the AP coming online) + * to be completed in the stop machine context. + */ + schedule(); + } + } + + if (x86_platform.legacy.warm_reset) { + /* + * Cleanup possible dangling ends... + */ + smpboot_restore_warm_reset_vector(); + } + + return boot_error; +} + +int native_cpu_up(unsigned int cpu, struct task_struct *tidle) +{ + int apicid = apic->cpu_present_to_apicid(cpu); + int cpu0_nmi_registered = 0; + unsigned long flags; + int err, ret = 0; + + lockdep_assert_irqs_enabled(); + + pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); + + if (apicid == BAD_APICID || + !physid_isset(apicid, phys_cpu_present_map) || + !apic->apic_id_valid(apicid)) { + pr_err("%s: bad cpu %d\n", __func__, cpu); + return -EINVAL; + } + + /* + * Already booted CPU? + */ + if (cpumask_test_cpu(cpu, cpu_callin_mask)) { + pr_debug("do_boot_cpu %d Already started\n", cpu); + return -ENOSYS; + } + + /* + * Save current MTRR state in case it was changed since early boot + * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: + */ + mtrr_save_state(); + + /* x86 CPUs take themselves offline, so delayed offline is OK. */ + err = cpu_check_up_prepare(cpu); + if (err && err != -EBUSY) + return err; + + /* the FPU context is blank, nobody can own it */ + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; + + err = common_cpu_up(cpu, tidle); + if (err) + return err; + + err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); + if (err) { + pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); + ret = -EIO; + goto unreg_nmi; + } + + /* + * Check TSC synchronization with the AP (keep irqs disabled + * while doing so): + */ + local_irq_save(flags); + check_tsc_sync_source(cpu); + local_irq_restore(flags); + + while (!cpu_online(cpu)) { + cpu_relax(); + touch_nmi_watchdog(); + } + +unreg_nmi: + /* + * Clean up the nmi handler. Do this after the callin and callout sync + * to avoid impact of possible long unregister time. + */ + if (cpu0_nmi_registered) + unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); + + return ret; +} + +/** + * arch_disable_smp_support() - disables SMP support for x86 at runtime + */ +void arch_disable_smp_support(void) +{ + disable_ioapic_support(); +} + +/* + * Fall back to non SMP mode after errors. + * + * RED-PEN audit/test this more. I bet there is more state messed up here. + */ +static __init void disable_smp(void) +{ + pr_info("SMP disabled\n"); + + disable_ioapic_support(); + + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); + + if (smp_found_config) + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); + else + physid_set_mask_of_physid(0, &phys_cpu_present_map); + cpumask_set_cpu(0, topology_sibling_cpumask(0)); + cpumask_set_cpu(0, topology_core_cpumask(0)); + cpumask_set_cpu(0, topology_die_cpumask(0)); +} + +/* + * Various sanity checks. + */ +static void __init smp_sanity_check(void) +{ + preempt_disable(); + +#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32) + if (def_to_bigsmp && nr_cpu_ids > 8) { + unsigned int cpu; + unsigned nr; + + pr_warn("More than 8 CPUs detected - skipping them\n" + "Use CONFIG_X86_BIGSMP\n"); + + nr = 0; + for_each_present_cpu(cpu) { + if (nr >= 8) + set_cpu_present(cpu, false); + nr++; + } + + nr = 0; + for_each_possible_cpu(cpu) { + if (nr >= 8) + set_cpu_possible(cpu, false); + nr++; + } + + nr_cpu_ids = 8; + } +#endif + + if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { + pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n", + hard_smp_processor_id()); + + physid_set(hard_smp_processor_id(), phys_cpu_present_map); + } + + /* + * Should not be necessary because the MP table should list the boot + * CPU too, but we do it for the sake of robustness anyway. + */ + if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { + pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n", + boot_cpu_physical_apicid); + physid_set(hard_smp_processor_id(), phys_cpu_present_map); + } + preempt_enable(); +} + +static void __init smp_cpu_index_default(void) +{ + int i; + struct cpuinfo_x86 *c; + + for_each_possible_cpu(i) { + c = &cpu_data(i); + /* mark all to hotplug */ + c->cpu_index = nr_cpu_ids; + } +} + +static void __init smp_get_logical_apicid(void) +{ + if (x2apic_mode) + cpu0_logical_apicid = apic_read(APIC_LDR); + else + cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); +} + +/* + * Prepare for SMP bootup. + * @max_cpus: configured maximum number of CPUs, It is a legacy parameter + * for common interface support. + */ +void __init native_smp_prepare_cpus(unsigned int max_cpus) +{ + unsigned int i; + + smp_cpu_index_default(); + + /* + * Setup boot CPU information + */ + smp_store_boot_cpu_info(); /* Final full version of the data */ + cpumask_copy(cpu_callin_mask, cpumask_of(0)); + mb(); + + for_each_possible_cpu(i) { + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); + } + + /* + * Set 'default' x86 topology, this matches default_topology() in that + * it has NUMA nodes as a topology level. See also + * native_smp_cpus_done(). + * + * Must be done before set_cpus_sibling_map() is ran. + */ + set_sched_topology(x86_topology); + + set_cpu_sibling_map(0); + init_freq_invariance(false); + smp_sanity_check(); + + switch (apic_intr_mode) { + case APIC_PIC: + case APIC_VIRTUAL_WIRE_NO_CONFIG: + disable_smp(); + return; + case APIC_SYMMETRIC_IO_NO_ROUTING: + disable_smp(); + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); + return; + case APIC_VIRTUAL_WIRE: + case APIC_SYMMETRIC_IO: + break; + } + + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); + + smp_get_logical_apicid(); + + pr_info("CPU0: "); + print_cpu_info(&cpu_data(0)); + + uv_system_init(); + + set_mtrr_aps_delayed_init(); + + smp_quirk_init_udelay(); + + speculative_store_bypass_ht_init(); +} + +void arch_thaw_secondary_cpus_begin(void) +{ + set_mtrr_aps_delayed_init(); +} + +void arch_thaw_secondary_cpus_end(void) +{ + mtrr_aps_init(); +} + +/* + * Early setup to make printk work. + */ +void __init native_smp_prepare_boot_cpu(void) +{ + int me = smp_processor_id(); + switch_to_new_gdt(me); + /* already set me in cpu_online_mask in boot_cpu_init() */ + cpumask_set_cpu(me, cpu_callout_mask); + cpu_set_state_online(me); + native_pv_lock_init(); +} + +void __init calculate_max_logical_packages(void) +{ + int ncpus; + + /* + * Today neither Intel nor AMD support heterogenous systems so + * extrapolate the boot cpu's data to all packages. + */ + ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); + __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); + pr_info("Max logical packages: %u\n", __max_logical_packages); +} + +void __init native_smp_cpus_done(unsigned int max_cpus) +{ + pr_debug("Boot done\n"); + + calculate_max_logical_packages(); + + if (x86_has_numa_in_package) + set_sched_topology(x86_numa_in_package_topology); + + nmi_selftest(); + impress_friends(); + mtrr_aps_init(); +} + +static int __initdata setup_possible_cpus = -1; +static int __init _setup_possible_cpus(char *str) +{ + get_option(&str, &setup_possible_cpus); + return 0; +} +early_param("possible_cpus", _setup_possible_cpus); + + +/* + * cpu_possible_mask should be static, it cannot change as cpu's + * are onlined, or offlined. The reason is per-cpu data-structures + * are allocated by some modules at init time, and don't expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_mask on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * - Ashok Raj + * + * Three ways to find out the number of additional hotplug CPUs: + * - If the BIOS specified disabled CPUs in ACPI/mptables use that. + * - The user can overwrite it with possible_cpus=NUM + * - Otherwise don't reserve additional CPUs. + * We do this because additional CPUs waste a lot of memory. + * -AK + */ +__init void prefill_possible_map(void) +{ + int i, possible; + + /* No boot processor was found in mptable or ACPI MADT */ + if (!num_processors) { + if (boot_cpu_has(X86_FEATURE_APIC)) { + int apicid = boot_cpu_physical_apicid; + int cpu = hard_smp_processor_id(); + + pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu); + + /* Make sure boot cpu is enumerated */ + if (apic->cpu_present_to_apicid(0) == BAD_APICID && + apic->apic_id_valid(apicid)) + generic_processor_info(apicid, boot_cpu_apic_version); + } + + if (!num_processors) + num_processors = 1; + } + + i = setup_max_cpus ?: 1; + if (setup_possible_cpus == -1) { + possible = num_processors; +#ifdef CONFIG_HOTPLUG_CPU + if (setup_max_cpus) + possible += disabled_cpus; +#else + if (possible > i) + possible = i; +#endif + } else + possible = setup_possible_cpus; + + total_cpus = max_t(int, possible, num_processors + disabled_cpus); + + /* nr_cpu_ids could be reduced via nr_cpus= */ + if (possible > nr_cpu_ids) { + pr_warn("%d Processors exceeds NR_CPUS limit of %u\n", + possible, nr_cpu_ids); + possible = nr_cpu_ids; + } + +#ifdef CONFIG_HOTPLUG_CPU + if (!setup_max_cpus) +#endif + if (possible > i) { + pr_warn("%d Processors exceeds max_cpus limit of %u\n", + possible, setup_max_cpus); + possible = i; + } + + nr_cpu_ids = possible; + + pr_info("Allowing %d CPUs, %d hotplug CPUs\n", + possible, max_t(int, possible - num_processors, 0)); + + reset_cpu_possible_mask(); + + for (i = 0; i < possible; i++) + set_cpu_possible(i, true); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* Recompute SMT state for all CPUs on offline */ +static void recompute_smt_state(void) +{ + int max_threads, cpu; + + max_threads = 0; + for_each_online_cpu (cpu) { + int threads = cpumask_weight(topology_sibling_cpumask(cpu)); + + if (threads > max_threads) + max_threads = threads; + } + __max_smt_threads = max_threads; +} + +static void remove_siblinginfo(int cpu) +{ + int sibling; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + for_each_cpu(sibling, topology_core_cpumask(cpu)) { + cpumask_clear_cpu(cpu, topology_core_cpumask(sibling)); + /*/ + * last thread sibling in this cpu core going down + */ + if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1) + cpu_data(sibling).booted_cores--; + } + + for_each_cpu(sibling, topology_die_cpumask(cpu)) + cpumask_clear_cpu(cpu, topology_die_cpumask(sibling)); + for_each_cpu(sibling, topology_sibling_cpumask(cpu)) + cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) + cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); + cpumask_clear(cpu_llc_shared_mask(cpu)); + cpumask_clear(topology_sibling_cpumask(cpu)); + cpumask_clear(topology_core_cpumask(cpu)); + cpumask_clear(topology_die_cpumask(cpu)); + c->cpu_core_id = 0; + c->booted_cores = 0; + cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); + recompute_smt_state(); +} + +static void remove_cpu_from_maps(int cpu) +{ + set_cpu_online(cpu, false); + cpumask_clear_cpu(cpu, cpu_callout_mask); + cpumask_clear_cpu(cpu, cpu_callin_mask); + /* was set by cpu_init() */ + cpumask_clear_cpu(cpu, cpu_initialized_mask); + numa_remove_cpu(cpu); +} + +void cpu_disable_common(void) +{ + int cpu = smp_processor_id(); + + remove_siblinginfo(cpu); + + /* It's now safe to remove this processor from the online map */ + lock_vector_lock(); + remove_cpu_from_maps(cpu); + unlock_vector_lock(); + fixup_irqs(); + lapic_offline(); +} + +int native_cpu_disable(void) +{ + int ret; + + ret = lapic_can_unplug_cpu(); + if (ret) + return ret; + + cpu_disable_common(); + + /* + * Disable the local APIC. Otherwise IPI broadcasts will reach + * it. It still responds normally to INIT, NMI, SMI, and SIPI + * messages. + * + * Disabling the APIC must happen after cpu_disable_common() + * which invokes fixup_irqs(). + * + * Disabling the APIC preserves already set bits in IRR, but + * an interrupt arriving after disabling the local APIC does not + * set the corresponding IRR bit. + * + * fixup_irqs() scans IRR for set bits so it can raise a not + * yet handled interrupt on the new destination CPU via an IPI + * but obviously it can't do so for IRR bits which are not set. + * IOW, interrupts arriving after disabling the local APIC will + * be lost. + */ + apic_soft_disable(); + + return 0; +} + +int common_cpu_die(unsigned int cpu) +{ + int ret = 0; + + /* We don't do anything here: idle task is faking death itself. */ + + /* They ack this in play_dead() by setting CPU_DEAD */ + if (cpu_wait_death(cpu, 5)) { + if (system_state == SYSTEM_RUNNING) + pr_info("CPU %u is now offline\n", cpu); + } else { + pr_err("CPU %u didn't die...\n", cpu); + ret = -1; + } + + return ret; +} + +void native_cpu_die(unsigned int cpu) +{ + common_cpu_die(cpu); +} + +void play_dead_common(void) +{ + idle_task_exit(); + + /* Ack it */ + (void)cpu_report_death(); + + /* + * With physical CPU hotplug, we should halt the cpu + */ + local_irq_disable(); +} + +/** + * cond_wakeup_cpu0 - Wake up CPU0 if needed. + * + * If NMI wants to wake up CPU0, start CPU0. + */ +void cond_wakeup_cpu0(void) +{ + if (smp_processor_id() == 0 && enable_start_cpu0) + start_cpu0(); +} +EXPORT_SYMBOL_GPL(cond_wakeup_cpu0); + +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up. + */ +static inline void mwait_play_dead(void) +{ + struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); + unsigned int eax, ebx, ecx, edx; + unsigned int highest_cstate = 0; + unsigned int highest_subcstate = 0; + int i; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + return; + if (!this_cpu_has(X86_FEATURE_MWAIT)) + return; + if (!this_cpu_has(X86_FEATURE_CLFLUSH)) + return; + if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) + return; + + eax = CPUID_MWAIT_LEAF; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + /* + * eax will be 0 if EDX enumeration is not valid. + * Initialized below to cstate, sub_cstate value when EDX is valid. + */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { + eax = 0; + } else { + edx >>= MWAIT_SUBSTATE_SIZE; + for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { + if (edx & MWAIT_SUBSTATE_MASK) { + highest_cstate = i; + highest_subcstate = edx & MWAIT_SUBSTATE_MASK; + } + } + eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | + (highest_subcstate - 1); + } + + wbinvd(); + + while (1) { + /* + * The CLFLUSH is a workaround for erratum AAI65 for + * the Xeon 7400 series. It's not clear it is actually + * needed, but it should be harmless in either case. + * The WBINVD is insufficient due to the spurious-wakeup + * case where we return around the loop. + */ + mb(); + clflush(md); + mb(); + __monitor(md, 0, 0); + mb(); + __mwait(eax, 0); + + cond_wakeup_cpu0(); + } +} + +void hlt_play_dead(void) +{ + if (__this_cpu_read(cpu_info.x86) >= 4) + wbinvd(); + + while (1) { + native_halt(); + + cond_wakeup_cpu0(); + } +} + +void native_play_dead(void) +{ + play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); + + mwait_play_dead(); /* Only returns on failure */ + if (cpuidle_play_dead()) + hlt_play_dead(); +} + +#else /* ... !CONFIG_HOTPLUG_CPU */ +int native_cpu_disable(void) +{ + return -ENOSYS; +} + +void native_cpu_die(unsigned int cpu) +{ + /* We said "no" in __cpu_disable */ + BUG(); +} + +void native_play_dead(void) +{ + BUG(); +} + +#endif + +#ifdef CONFIG_X86_64 +/* + * APERF/MPERF frequency ratio computation. + * + * The scheduler wants to do frequency invariant accounting and needs a <1 + * ratio to account for the 'current' frequency, corresponding to + * freq_curr / freq_max. + * + * Since the frequency freq_curr on x86 is controlled by micro-controller and + * our P-state setting is little more than a request/hint, we need to observe + * the effective frequency 'BusyMHz', i.e. the average frequency over a time + * interval after discarding idle time. This is given by: + * + * BusyMHz = delta_APERF / delta_MPERF * freq_base + * + * where freq_base is the max non-turbo P-state. + * + * The freq_max term has to be set to a somewhat arbitrary value, because we + * can't know which turbo states will be available at a given point in time: + * it all depends on the thermal headroom of the entire package. We set it to + * the turbo level with 4 cores active. + * + * Benchmarks show that's a good compromise between the 1C turbo ratio + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, + * which would ignore the entire turbo range (a conspicuous part, making + * freq_curr/freq_max always maxed out). + * + * An exception to the heuristic above is the Atom uarch, where we choose the + * highest turbo level for freq_max since Atom's are generally oriented towards + * power efficiency. + * + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. + */ + +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); + +static DEFINE_PER_CPU(u64, arch_prev_aperf); +static DEFINE_PER_CPU(u64, arch_prev_mperf); +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; + +void arch_set_max_freq_ratio(bool turbo_disabled) +{ + arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : + arch_turbo_freq_ratio; +} +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); + +static bool turbo_disabled(void) +{ + u64 misc_en; + int err; + + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + if (err) + return false; + + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); +} + +static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ + + return true; +} + +#include +#include + +#define X86_MATCH(model) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) + +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { + X86_MATCH(XEON_PHI_KNL), + X86_MATCH(XEON_PHI_KNM), + {} +}; + +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { + X86_MATCH(SKYLAKE_X), + {} +}; + +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { + X86_MATCH(ATOM_GOLDMONT), + X86_MATCH(ATOM_GOLDMONT_D), + X86_MATCH(ATOM_GOLDMONT_PLUS), + {} +}; + +static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, + int num_delta_fratio) +{ + int fratio, delta_fratio, found; + int err, i; + u64 msr; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + fratio = (msr >> 8) & 0xFF; + i = 16; + found = 0; + do { + if (found >= num_delta_fratio) { + *turbo_freq = fratio; + return true; + } + + delta_fratio = (msr >> (i + 5)) & 0x7; + + if (delta_fratio) { + found += 1; + fratio -= delta_fratio; + } + + i += 8; + } while (i < 64); + + return true; +} + +static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) +{ + u64 ratios, counts; + u32 group_size; + int err, i; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + if (err) + return false; + + for (i = 0; i < 64; i += 8) { + group_size = (counts >> i) & 0xFF; + if (group_size >= size) { + *turbo_freq = (ratios >> i) & 0xFF; + return true; + } + } + + return false; +} + +static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + u64 msr; + int err; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ + + /* The CPU may have less than 4 cores */ + if (!*turbo_freq) + *turbo_freq = msr & 0xFF; /* 1C turbo */ + + return true; +} + +static bool intel_set_max_freq_ratio(void) +{ + u64 base_freq, turbo_freq; + u64 turbo_ratio; + + if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + if (x86_match_cpu(has_glm_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_knl_turbo_ratio_limits) && + knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_skx_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) + goto out; + + if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + return false; + +out: + /* + * Some hypervisors advertise X86_FEATURE_APERFMPERF + * but then fill all MSR's with zeroes. + * Some CPUs have turbo boost but don't declare any turbo ratio + * in MSR_TURBO_RATIO_LIMIT. + */ + if (!base_freq || !turbo_freq) { + pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); + return false; + } + + turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); + if (!turbo_ratio) { + pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); + return false; + } + + arch_turbo_freq_ratio = turbo_ratio; + arch_set_max_freq_ratio(turbo_disabled()); + + return true; +} + +static void init_counter_refs(void) +{ + u64 aperf, mperf; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + this_cpu_write(arch_prev_aperf, aperf); + this_cpu_write(arch_prev_mperf, mperf); +} + +static void init_freq_invariance(bool secondary) +{ + bool ret = false; + + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) + return; + + if (secondary) { + if (static_branch_likely(&arch_scale_freq_key)) { + init_counter_refs(); + } + return; + } + + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + ret = intel_set_max_freq_ratio(); + + if (ret) { + init_counter_refs(); + static_branch_enable(&arch_scale_freq_key); + } else { + pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); + } +} + +static void disable_freq_invariance_workfn(struct work_struct *work) +{ + static_branch_disable(&arch_scale_freq_key); +} + +static DECLARE_WORK(disable_freq_invariance_work, + disable_freq_invariance_workfn); + +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; + +void arch_scale_freq_tick(void) +{ + u64 freq_scale = SCHED_CAPACITY_SCALE; + u64 aperf, mperf; + u64 acnt, mcnt; + + if (!arch_scale_freq_invariant()) + return; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + acnt = aperf - this_cpu_read(arch_prev_aperf); + mcnt = mperf - this_cpu_read(arch_prev_mperf); + + this_cpu_write(arch_prev_aperf, aperf); + this_cpu_write(arch_prev_mperf, mperf); + + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) + goto error; + + if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + goto error; + + freq_scale = div64_u64(acnt, mcnt); + if (!freq_scale) + goto error; + + if (freq_scale > SCHED_CAPACITY_SCALE) + freq_scale = SCHED_CAPACITY_SCALE; + + this_cpu_write(arch_freq_scale, freq_scale); + return; + +error: + pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); + schedule_work(&disable_freq_invariance_work); +} +#else +static inline void init_freq_invariance(bool secondary) +{ +} +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c new file mode 100644 index 000000000..8627fda8d --- /dev/null +++ b/arch/x86/kernel/stacktrace.c @@ -0,0 +1,136 @@ +/* + * Stack trace management functions + * + * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar + */ +#include +#include +#include +#include +#include +#include +#include +#include + +void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, + struct task_struct *task, struct pt_regs *regs) +{ + struct unwind_state state; + unsigned long addr; + + if (regs && !consume_entry(cookie, regs->ip)) + return; + + for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr || !consume_entry(cookie, addr)) + break; + } +} + +/* + * This function returns an error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is reliable. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task) +{ + struct unwind_state state; + struct pt_regs *regs; + unsigned long addr; + + for (unwind_start(&state, task, NULL, NULL); + !unwind_done(&state) && !unwind_error(&state); + unwind_next_frame(&state)) { + + regs = unwind_get_entry_regs(&state, NULL); + if (regs) { + /* Success path for user tasks */ + if (user_mode(regs)) + return 0; + + /* + * Kernel mode registers on the stack indicate an + * in-kernel interrupt or exception (e.g., preemption + * or a page fault), which can make frame pointers + * unreliable. + */ + if (IS_ENABLED(CONFIG_FRAME_POINTER)) + return -EINVAL; + } + + addr = unwind_get_return_address(&state); + + /* + * A NULL or invalid return address probably means there's some + * generated code which __kernel_text_address() doesn't know + * about. + */ + if (!addr) + return -EINVAL; + + if (!consume_entry(cookie, addr)) + return -EINVAL; + } + + /* Check for stack corruption */ + if (unwind_error(&state)) + return -EINVAL; + + return 0; +} + +/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ + +struct stack_frame_user { + const void __user *next_fp; + unsigned long ret_addr; +}; + +static int +copy_stack_frame(const struct stack_frame_user __user *fp, + struct stack_frame_user *frame) +{ + int ret; + + if (__range_not_ok(fp, sizeof(*frame), TASK_SIZE)) + return 0; + + ret = 1; + pagefault_disable(); + if (__get_user(frame->next_fp, &fp->next_fp) || + __get_user(frame->ret_addr, &fp->ret_addr)) + ret = 0; + pagefault_enable(); + + return ret; +} + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + const void __user *fp = (const void __user *)regs->bp; + + if (!consume_entry(cookie, regs->ip)) + return; + + while (1) { + struct stack_frame_user frame; + + frame.next_fp = NULL; + frame.ret_addr = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + if (!frame.ret_addr) + break; + if (!consume_entry(cookie, frame.ret_addr)) + break; + fp = frame.next_fp; + } +} + diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c new file mode 100644 index 000000000..759b986b7 --- /dev/null +++ b/arch/x86/kernel/static_call.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +enum insn_type { + CALL = 0, /* site call */ + NOP = 1, /* site cond-call */ + JMP = 2, /* tramp / site tail-call */ + RET = 3, /* tramp / site cond-tail-call */ +}; + +/* + * ud1 %esp, %ecx - a 3 byte #UD that is unique to trampolines, chosen such + * that there is no false-positive trampoline identification while also being a + * speculation stop. + */ +static const u8 tramp_ud[] = { 0x0f, 0xb9, 0xcc }; + +static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc }; + +static void __ref __static_call_transform(void *insn, enum insn_type type, + void *func, bool modinit) +{ + int size = CALL_INSN_SIZE; + const void *code; + + switch (type) { + case CALL: + code = text_gen_insn(CALL_INSN_OPCODE, insn, func); + break; + + case NOP: + code = ideal_nops[NOP_ATOMIC5]; + break; + + case JMP: + code = text_gen_insn(JMP32_INSN_OPCODE, insn, func); + break; + + case RET: + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + code = text_gen_insn(JMP32_INSN_OPCODE, insn, &__x86_return_thunk); + else + code = &retinsn; + break; + } + + if (memcmp(insn, code, size) == 0) + return; + + if (system_state == SYSTEM_BOOTING || modinit) + return text_poke_early(insn, code, size); + + text_poke_bp(insn, code, size, NULL); +} + +static void __static_call_validate(void *insn, bool tail) +{ + u8 opcode = *(u8 *)insn; + + if (tail) { + if (opcode == JMP32_INSN_OPCODE || + opcode == RET_INSN_OPCODE) + return; + } else { + if (opcode == CALL_INSN_OPCODE || + !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5)) + return; + } + + /* + * If we ever trigger this, our text is corrupt, we'll probably not live long. + */ + WARN_ONCE(1, "unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn); +} + +static inline enum insn_type __sc_insn(bool null, bool tail) +{ + /* + * Encode the following table without branches: + * + * tail null insn + * -----+-------+------ + * 0 | 0 | CALL + * 0 | 1 | NOP + * 1 | 0 | JMP + * 1 | 1 | RET + */ + return 2*tail + null; +} + +void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) +{ + mutex_lock(&text_mutex); + + if (tramp) { + __static_call_validate(tramp, true); + __static_call_transform(tramp, __sc_insn(!func, true), func, false); + } + + if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) { + __static_call_validate(site, tail); + __static_call_transform(site, __sc_insn(!func, tail), func, false); + } + + mutex_unlock(&text_mutex); +} +EXPORT_SYMBOL_GPL(arch_static_call_transform); + +#ifdef CONFIG_RETHUNK +/* + * This is called by apply_returns() to fix up static call trampolines, + * specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as + * having a return trampoline. + * + * The problem is that static_call() is available before determining + * X86_FEATURE_RETHUNK and, by implication, running alternatives. + * + * This means that __static_call_transform() above can have overwritten the + * return trampoline and we now need to fix things up to be consistent. + */ +bool __static_call_fixup(void *tramp, u8 op, void *dest) +{ + unsigned long addr = (unsigned long)tramp; + /* + * Not all .return_sites are a static_call trampoline (most are not). + * Check if the 3 bytes after the return are still kernel text, if not, + * then this definitely is not a trampoline and we need not worry + * further. + * + * This avoids the memcmp() below tripping over pagefaults etc.. + */ + if (((addr >> PAGE_SHIFT) != ((addr + 7) >> PAGE_SHIFT)) && + !kernel_text_address(addr + 7)) + return false; + + if (memcmp(tramp+5, tramp_ud, 3)) { + /* Not a trampoline site, not our problem. */ + return false; + } + + mutex_lock(&text_mutex); + if (op == RET_INSN_OPCODE || dest == &__x86_return_thunk) + __static_call_transform(tramp, RET, NULL, true); + mutex_unlock(&text_mutex); + + return true; +} +#endif diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c new file mode 100644 index 000000000..2f97d1a10 --- /dev/null +++ b/arch/x86/kernel/step.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86 single-step support code, common to 32-bit and 64-bit. + */ +#include +#include +#include +#include +#include +#include + +unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) +{ + unsigned long addr, seg; + + addr = regs->ip; + seg = regs->cs; + if (v8086_mode(regs)) { + addr = (addr & 0xffff) + (seg << 4); + return addr; + } + +#ifdef CONFIG_MODIFY_LDT_SYSCALL + /* + * We'll assume that the code segments in the GDT + * are all zero-based. That is largely true: the + * TLS segments are used for data, and the PNPBIOS + * and APM bios ones we just ignore here. + */ + if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { + struct desc_struct *desc; + unsigned long base; + + seg >>= 3; + + mutex_lock(&child->mm->context.lock); + if (unlikely(!child->mm->context.ldt || + seg >= child->mm->context.ldt->nr_entries)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = &child->mm->context.ldt->entries[seg]; + base = get_desc_base(desc); + + /* 16-bit code segment? */ + if (!desc->d) + addr &= 0xffff; + addr += base; + } + mutex_unlock(&child->mm->context.lock); + } +#endif + + return addr; +} + +static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) +{ + int i, copied; + unsigned char opcode[15]; + unsigned long addr = convert_ip_to_linear(child, regs); + + copied = access_process_vm(child, addr, opcode, sizeof(opcode), + FOLL_FORCE); + for (i = 0; i < copied; i++) { + switch (opcode[i]) { + /* popf and iret */ + case 0x9d: case 0xcf: + return 1; + + /* CHECKME: 64 65 */ + + /* opcode and address size prefixes */ + case 0x66: case 0x67: + continue; + /* irrelevant prefixes (segment overrides and repeats) */ + case 0x26: case 0x2e: + case 0x36: case 0x3e: + case 0x64: case 0x65: + case 0xf0: case 0xf2: case 0xf3: + continue; + +#ifdef CONFIG_X86_64 + case 0x40 ... 0x4f: + if (!user_64bit_mode(regs)) + /* 32-bit mode: register increment */ + return 0; + /* 64-bit mode: REX prefix */ + continue; +#endif + + /* CHECKME: f2, f3 */ + + /* + * pushf: NOTE! We should probably not let + * the user see the TF bit being set. But + * it's more pain than it's worth to avoid + * it, and a debugger could emulate this + * all in user space if it _really_ cares. + */ + case 0x9c: + default: + return 0; + } + } + return 0; +} + +/* + * Enable single-stepping. Return nonzero if user mode is not using TF itself. + */ +static int enable_single_step(struct task_struct *child) +{ + struct pt_regs *regs = task_pt_regs(child); + unsigned long oflags; + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state so we don't wrongly set TIF_FORCED_TF below. + * If enable_single_step() was used last and that is what + * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are + * already set and our bookkeeping is fine. + */ + if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP))) + regs->flags |= X86_EFLAGS_TF; + + /* + * Always set TIF_SINGLESTEP - this guarantees that + * we single-step system calls etc.. This will also + * cause us to set TF when returning to user mode. + */ + set_tsk_thread_flag(child, TIF_SINGLESTEP); + + oflags = regs->flags; + + /* Set TF on the kernel stack.. */ + regs->flags |= X86_EFLAGS_TF; + + /* + * ..but if TF is changed by the instruction we will trace, + * don't mark it as being "us" that set it, so that we + * won't clear it by hand later. + * + * Note that if we don't actually execute the popf because + * of a signal arriving right now or suchlike, we will lose + * track of the fact that it really was "us" that set it. + */ + if (is_setting_trap_flag(child, regs)) { + clear_tsk_thread_flag(child, TIF_FORCED_TF); + return 0; + } + + /* + * If TF was already set, check whether it was us who set it. + * If not, we should never attempt a block step. + */ + if (oflags & X86_EFLAGS_TF) + return test_tsk_thread_flag(child, TIF_FORCED_TF); + + set_tsk_thread_flag(child, TIF_FORCED_TF); + + return 1; +} + +void set_task_blockstep(struct task_struct *task, bool on) +{ + unsigned long debugctl; + + /* + * Ensure irq/preemption can't change debugctl in between. + * Note also that both TIF_BLOCKSTEP and debugctl should + * be changed atomically wrt preemption. + * + * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if + * task is current or it can't be running, otherwise we can race + * with __switch_to_xtra(). We rely on ptrace_freeze_traced(). + */ + local_irq_disable(); + debugctl = get_debugctlmsr(); + if (on) { + debugctl |= DEBUGCTLMSR_BTF; + set_tsk_thread_flag(task, TIF_BLOCKSTEP); + } else { + debugctl &= ~DEBUGCTLMSR_BTF; + clear_tsk_thread_flag(task, TIF_BLOCKSTEP); + } + if (task == current) + update_debugctlmsr(debugctl); + local_irq_enable(); +} + +/* + * Enable single or block step. + */ +static void enable_step(struct task_struct *child, bool block) +{ + /* + * Make sure block stepping (BTF) is not enabled unless it should be. + * Note that we don't try to worry about any is_setting_trap_flag() + * instructions after the first when using block stepping. + * So no one should try to use debugger block stepping in a program + * that uses user-mode single stepping itself. + */ + if (enable_single_step(child) && block) + set_task_blockstep(child, true); + else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) + set_task_blockstep(child, false); +} + +void user_enable_single_step(struct task_struct *child) +{ + enable_step(child, 0); +} + +void user_enable_block_step(struct task_struct *child) +{ + enable_step(child, 1); +} + +void user_disable_single_step(struct task_struct *child) +{ + /* + * Make sure block stepping (BTF) is disabled. + */ + if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) + set_task_blockstep(child, false); + + /* Always clear TIF_SINGLESTEP... */ + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* But touch TF only if it was set by us.. */ + if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) + task_pt_regs(child)->flags &= ~X86_EFLAGS_TF; +} diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c new file mode 100644 index 000000000..6cf65397d --- /dev/null +++ b/arch/x86/kernel/sys_ia32.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on + * sys_sparc32 + * + * Copyright (C) 2000 VA Linux Co + * Copyright (C) 2000 Don Dugger + * Copyright (C) 1999 Arun Sharma + * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 2000 Hewlett-Packard Co. + * Copyright (C) 2000 David Mosberger-Tang + * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) + * + * These routines maintain argument size conversion between 32bit and 64bit + * environment. In 2.5 most of this should be moved to a generic directory. + * + * This file assumes that there is a hole at the end of user address space. + * + * Some of the functions are LE specific currently. These are + * hopefully all marked. This should be fixed. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define AA(__x) ((unsigned long)(__x)) + +SYSCALL_DEFINE3(ia32_truncate64, const char __user *, filename, + unsigned long, offset_low, unsigned long, offset_high) +{ + return ksys_truncate(filename, + ((loff_t) offset_high << 32) | offset_low); +} + +SYSCALL_DEFINE3(ia32_ftruncate64, unsigned int, fd, + unsigned long, offset_low, unsigned long, offset_high) +{ + return ksys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); +} + +/* warning: next two assume little endian */ +SYSCALL_DEFINE5(ia32_pread64, unsigned int, fd, char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) +{ + return ksys_pread64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + +SYSCALL_DEFINE5(ia32_pwrite64, unsigned int, fd, const char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) +{ + return ksys_pwrite64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + + +/* + * Some system calls that need sign extended arguments. This could be + * done by a generic wrapper. + */ +SYSCALL_DEFINE6(ia32_fadvise64_64, int, fd, __u32, offset_low, + __u32, offset_high, __u32, len_low, __u32, len_high, + int, advice) +{ + return ksys_fadvise64_64(fd, + (((u64)offset_high)<<32) | offset_low, + (((u64)len_high)<<32) | len_low, + advice); +} + +SYSCALL_DEFINE4(ia32_readahead, int, fd, unsigned int, off_lo, + unsigned int, off_hi, size_t, count) +{ + return ksys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); +} + +SYSCALL_DEFINE6(ia32_sync_file_range, int, fd, unsigned int, off_low, + unsigned int, off_hi, unsigned int, n_low, + unsigned int, n_hi, int, flags) +{ + return ksys_sync_file_range(fd, + ((u64)off_hi << 32) | off_low, + ((u64)n_hi << 32) | n_low, flags); +} + +SYSCALL_DEFINE5(ia32_fadvise64, int, fd, unsigned int, offset_lo, + unsigned int, offset_hi, size_t, len, int, advice) +{ + return ksys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, + len, advice); +} + +SYSCALL_DEFINE6(ia32_fallocate, int, fd, int, mode, + unsigned int, offset_lo, unsigned int, offset_hi, + unsigned int, len_lo, unsigned int, len_hi) +{ + return ksys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, + ((u64)len_hi << 32) | len_lo); +} + +#ifdef CONFIG_IA32_EMULATION +/* + * Another set for IA32/LFS -- x86_64 struct stat is different due to + * support for 64bit inode numbers. + */ +static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) +{ + typeof(ubuf->st_uid) uid = 0; + typeof(ubuf->st_gid) gid = 0; + SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid)); + if (!user_write_access_begin(ubuf, sizeof(struct stat64))) + return -EFAULT; + unsafe_put_user(huge_encode_dev(stat->dev), &ubuf->st_dev, Efault); + unsafe_put_user(stat->ino, &ubuf->__st_ino, Efault); + unsafe_put_user(stat->ino, &ubuf->st_ino, Efault); + unsafe_put_user(stat->mode, &ubuf->st_mode, Efault); + unsafe_put_user(stat->nlink, &ubuf->st_nlink, Efault); + unsafe_put_user(uid, &ubuf->st_uid, Efault); + unsafe_put_user(gid, &ubuf->st_gid, Efault); + unsafe_put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev, Efault); + unsafe_put_user(stat->size, &ubuf->st_size, Efault); + unsafe_put_user(stat->atime.tv_sec, &ubuf->st_atime, Efault); + unsafe_put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec, Efault); + unsafe_put_user(stat->mtime.tv_sec, &ubuf->st_mtime, Efault); + unsafe_put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec, Efault); + unsafe_put_user(stat->ctime.tv_sec, &ubuf->st_ctime, Efault); + unsafe_put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec, Efault); + unsafe_put_user(stat->blksize, &ubuf->st_blksize, Efault); + unsafe_put_user(stat->blocks, &ubuf->st_blocks, Efault); + user_access_end(); + return 0; +Efault: + user_write_access_end(); + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE2(ia32_stat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_stat(filename, &stat); + + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE2(ia32_lstat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_lstat(filename, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE2(ia32_fstat64, unsigned int, fd, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_fstat(fd, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE4(ia32_fstatat64, unsigned int, dfd, + const char __user *, filename, + struct stat64 __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_stat64(statbuf, &stat); +} + +/* + * Linux/i386 didn't use to be able to handle more than + * 4 system call parameters, so these system calls used a memory + * block for parameter passing.. + */ + +struct mmap_arg_struct32 { + unsigned int addr; + unsigned int len; + unsigned int prot; + unsigned int flags; + unsigned int fd; + unsigned int offset; +}; + +COMPAT_SYSCALL_DEFINE1(ia32_mmap, struct mmap_arg_struct32 __user *, arg) +{ + struct mmap_arg_struct32 a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + + if (a.offset & ~PAGE_MASK) + return -EINVAL; + + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset>>PAGE_SHIFT); +} + +/* + * The 32-bit clone ABI is CONFIG_CLONE_BACKWARDS + */ +COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags, + unsigned long, newsp, int __user *, parent_tidptr, + unsigned long, tls_val, int __user *, child_tidptr) +{ + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .pidfd = parent_tidptr, + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = newsp, + .tls = tls_val, + }; + + return kernel_clone(&args); +} +#endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c new file mode 100644 index 000000000..3fd1c81eb --- /dev/null +++ b/arch/x86/kernel/sys_x86_64.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Align a virtual address to avoid aliasing in the I$ on AMD F15h. + */ +static unsigned long get_align_mask(void) +{ + /* handle 32- and 64-bit case with a single conditional */ + if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) + return 0; + + if (!(current->flags & PF_RANDOMIZE)) + return 0; + + return va_align.mask; +} + +/* + * To avoid aliasing in the I$ on AMD F15h, the bits defined by the + * va_align.bits, [12:upper_bit), are set to a random value instead of + * zeroing them. This random value is computed once per boot. This form + * of ASLR is known as "per-boot ASLR". + * + * To achieve this, the random value is added to the info.align_offset + * value before calling vm_unmapped_area() or ORed directly to the + * address. + */ +static unsigned long get_align_bits(void) +{ + return va_align.bits & get_align_mask(); +} + +unsigned long align_vdso_addr(unsigned long addr) +{ + unsigned long align_mask = get_align_mask(); + addr = (addr + align_mask) & ~align_mask; + return addr | get_align_bits(); +} + +static int __init control_va_addr_alignment(char *str) +{ + /* guard against enabling this on other CPU families */ + if (va_align.flags < 0) + return 1; + + if (*str == 0) + return 1; + + if (!strcmp(str, "32")) + va_align.flags = ALIGN_VA_32; + else if (!strcmp(str, "64")) + va_align.flags = ALIGN_VA_64; + else if (!strcmp(str, "off")) + va_align.flags = 0; + else if (!strcmp(str, "on")) + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + else + pr_warn("invalid option value: 'align_va_addr=%s'\n", str); + + return 1; +} +__setup("align_va_addr=", control_va_addr_alignment); + +SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, off) +{ + long error; + error = -EINVAL; + if (off & ~PAGE_MASK) + goto out; + + error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); +out: + return error; +} + +static void find_start_end(unsigned long addr, unsigned long flags, + unsigned long *begin, unsigned long *end) +{ + if (!in_32bit_syscall() && (flags & MAP_32BIT)) { + /* This is usually used needed to map code in small + model, so it needs to be in the first 31bit. Limit + it to that. This means we need to move the + unmapped base down for this case. This can give + conflicts with the heap, but we assume that glibc + malloc knows how to fall back to mmap. Give it 1GB + of playground for now. -AK */ + *begin = 0x40000000; + *end = 0x80000000; + if (current->flags & PF_RANDOMIZE) { + *begin = randomize_page(*begin, 0x02000000); + } + return; + } + + *begin = get_mmap_base(1); + if (in_32bit_syscall()) + *end = task_size_32bit(); + else + *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); +} + +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct vm_unmapped_area_info info; + unsigned long begin, end; + + if (flags & MAP_FIXED) + return addr; + + find_start_end(addr, flags, &begin, &end); + + if (len > end) + return -ENOMEM; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (end - len >= addr && + (!vma || addr + len <= vm_start_gap(vma))) + return addr; + } + + info.flags = 0; + info.length = len; + info.low_limit = begin; + info.high_limit = end; + info.align_mask = 0; + info.align_offset = pgoff << PAGE_SHIFT; + if (filp) { + info.align_mask = get_align_mask(); + info.align_offset += get_align_bits(); + } + return vm_unmapped_area(&info); +} + +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + const unsigned long len, const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE) + return -ENOMEM; + + /* No address checking. See comment at mmap_address_hint_valid() */ + if (flags & MAP_FIXED) + return addr; + + /* for MAP_32BIT mappings we force the legacy mmap base */ + if (!in_32bit_syscall() && (flags & MAP_32BIT)) + goto bottomup; + + /* requesting a specific address */ + if (addr) { + addr &= PAGE_MASK; + if (!mmap_address_hint_valid(addr, len)) + goto get_unmapped_area; + + vma = find_vma(mm, addr); + if (!vma || addr + len <= vm_start_gap(vma)) + return addr; + } +get_unmapped_area: + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + * + * !in_32bit_syscall() check to avoid high addresses for x32 + * (and make it no op on native i386). + */ + if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + + info.align_mask = 0; + info.align_offset = pgoff << PAGE_SHIFT; + if (filp) { + info.align_mask = get_align_mask(); + info.align_offset += get_align_bits(); + } + addr = vm_unmapped_area(&info); + if (!(addr & ~PAGE_MASK)) + return addr; + VM_BUG_ON(addr != -ENOMEM); + +bottomup: + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); +} diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c new file mode 100644 index 000000000..014ebd8ca --- /dev/null +++ b/arch/x86/kernel/sysfb.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann + */ + +/* + * Simple-Framebuffer support for x86 systems + * Create a platform-device for any available boot framebuffer. The + * simple-framebuffer platform device is already available on DT systems, so + * this module parses the global "screen_info" object and creates a suitable + * platform device compatible with the "simple-framebuffer" DT object. If + * the framebuffer is incompatible, we instead create a legacy + * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and + * pass the screen_info as platform_data. This allows legacy drivers + * to pick these devices up without messing with simple-framebuffer drivers. + * The global "screen_info" is still valid at all times. + * + * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer" + * platform devices, but only use legacy framebuffer devices for + * backwards compatibility. + * + * TODO: We set the dev_id field of all platform-devices to 0. This allows + * other x86 OF/DT parsers to create such devices, too. However, they must + * start at offset 1 for this to work. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static __init int sysfb_init(void) +{ + struct screen_info *si = &screen_info; + struct simplefb_platform_data mode; + struct platform_device *pd; + const char *name; + bool compatible; + int ret; + + sysfb_apply_efi_quirks(); + + /* try to create a simple-framebuffer device */ + compatible = parse_mode(si, &mode); + if (compatible) { + ret = create_simplefb(si, &mode); + if (!ret) + return 0; + } + + /* if the FB is incompatible, create a legacy framebuffer device */ + if (si->orig_video_isVGA == VIDEO_TYPE_EFI) + name = "efi-framebuffer"; + else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB) + name = "vesa-framebuffer"; + else + name = "platform-framebuffer"; + + pd = platform_device_register_resndata(NULL, name, 0, + NULL, 0, si, sizeof(*si)); + return PTR_ERR_OR_ZERO(pd); +} + +/* must execute after PCI subsystem for EFI quirks */ +device_initcall(sysfb_init); diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c new file mode 100644 index 000000000..fff04d285 --- /dev/null +++ b/arch/x86/kernel/sysfb_efi.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann + * + * EFI Quirks Copyright (c) 2006 Edgar Hucek + */ + +/* + * EFI Quirks + * Several EFI systems do not correctly advertise their boot framebuffers. + * Hence, we use this static table of known broken machines and fix up the + * information so framebuffer drivers can load corectly. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include