summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig128
-rw-r--r--arch/x86/Kconfig.cpu8
-rw-r--r--arch/x86/Makefile22
-rw-r--r--arch/x86/Makefile.postlink3
-rw-r--r--arch/x86/boot/Makefile2
-rw-r--r--arch/x86/boot/compressed/Makefile4
-rw-r--r--arch/x86/boot/compressed/acpi.c16
-rw-r--r--arch/x86/boot/compressed/cmdline.c4
-rw-r--r--arch/x86/boot/compressed/efi_mixed.S29
-rw-r--r--arch/x86/boot/compressed/head_64.S5
-rw-r--r--arch/x86/boot/compressed/ident_map_64.c12
-rw-r--r--arch/x86/boot/compressed/idt_64.c1
-rw-r--r--arch/x86/boot/compressed/idt_handlers_64.S1
-rw-r--r--arch/x86/boot/compressed/kaslr.c26
-rw-r--r--arch/x86/boot/compressed/mem.c8
-rw-r--r--arch/x86/boot/compressed/misc.c40
-rw-r--r--arch/x86/boot/compressed/misc.h2
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c9
-rw-r--r--arch/x86/boot/compressed/sev.c7
-rw-r--r--arch/x86/boot/compressed/tdx.c6
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S6
-rw-r--r--arch/x86/boot/header.S211
-rw-r--r--arch/x86/boot/pm.c7
-rw-r--r--arch/x86/boot/setup.ld14
-rw-r--r--arch/x86/boot/string.c2
-rw-r--r--arch/x86/boot/tools/build.c273
-rw-r--r--arch/x86/coco/core.c42
-rw-r--r--arch/x86/coco/tdx/tdcall.S235
-rw-r--r--arch/x86/coco/tdx/tdx-shared.c34
-rw-r--r--arch/x86/coco/tdx/tdx.c163
-rw-r--r--arch/x86/configs/hardening.config14
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/crypto/Kconfig8
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S6
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S6
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c52
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S2
-rw-r--r--arch/x86/crypto/nh-avx2-x86_64.S1
-rw-r--r--arch/x86/crypto/nhpoly1305-avx2-glue.c9
-rw-r--r--arch/x86/crypto/nhpoly1305-sse2-glue.c9
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c7
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S1
-rw-r--r--arch/x86/crypto/sha256_ssse3_glue.c39
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S2
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S1
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S2
-rw-r--r--arch/x86/crypto/sm4-aesni-avx-asm_64.S52
-rw-r--r--arch/x86/crypto/sm4-aesni-avx2-asm_64.S55
-rw-r--r--arch/x86/crypto/sm4-avx.h4
-rw-r--r--arch/x86/crypto/sm4_aesni_avx2_glue.c26
-rw-r--r--arch/x86/crypto/sm4_aesni_avx_glue.c130
-rw-r--r--arch/x86/entry/calling.h12
-rw-r--r--arch/x86/entry/common.c121
-rw-r--r--arch/x86/entry/entry.S25
-rw-r--r--arch/x86/entry/entry_32.S5
-rw-r--r--arch/x86/entry/entry_64.S189
-rw-r--r--arch/x86/entry/entry_64_compat.S28
-rw-r--r--arch/x86/entry/syscall_32.c21
-rw-r--r--arch/x86/entry/syscall_64.c19
-rw-r--r--arch/x86/entry/syscall_x32.c10
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl11
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl10
-rw-r--r--arch/x86/entry/thunk_32.S2
-rw-r--r--arch/x86/entry/thunk_64.S2
-rw-r--r--arch/x86/entry/vdso/Makefile38
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c10
-rw-r--r--arch/x86/entry/vdso/vdso32-setup.c1
-rw-r--r--arch/x86/entry/vdso/vsgx.S1
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c28
-rw-r--r--arch/x86/events/amd/brs.c2
-rw-r--r--arch/x86/events/amd/core.c25
-rw-r--r--arch/x86/events/amd/ibs.c3
-rw-r--r--arch/x86/events/amd/lbr.c6
-rw-r--r--arch/x86/events/amd/uncore.c1060
-rw-r--r--arch/x86/events/core.c11
-rw-r--r--arch/x86/events/intel/core.c622
-rw-r--r--arch/x86/events/intel/cstate.c161
-rw-r--r--arch/x86/events/intel/ds.c16
-rw-r--r--arch/x86/events/intel/lbr.c85
-rw-r--r--arch/x86/events/intel/pt.c8
-rw-r--r--arch/x86/events/intel/uncore.c14
-rw-r--r--arch/x86/events/intel/uncore.h10
-rw-r--r--arch/x86/events/intel/uncore_discovery.c5
-rw-r--r--arch/x86/events/intel/uncore_discovery.h2
-rw-r--r--arch/x86/events/intel/uncore_nhmex.c2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c214
-rw-r--r--arch/x86/events/perf_event.h49
-rw-r--r--arch/x86/events/perf_event_flags.h2
-rw-r--r--arch/x86/events/rapl.c22
-rw-r--r--arch/x86/hyperv/hv_apic.c2
-rw-r--r--arch/x86/hyperv/hv_vtl.c28
-rw-r--r--arch/x86/hyperv/irqdomain.c2
-rw-r--r--arch/x86/hyperv/ivm.c79
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/alternative.h30
-rw-r--r--arch/x86/include/asm/amd_nb.h2
-rw-r--r--arch/x86/include/asm/apic.h49
-rw-r--r--arch/x86/include/asm/apicdef.h276
-rw-r--r--arch/x86/include/asm/asm-prototypes.h1
-rw-r--r--arch/x86/include/asm/asm.h14
-rw-r--r--arch/x86/include/asm/barrier.h21
-rw-r--r--arch/x86/include/asm/bitops.h20
-rw-r--r--arch/x86/include/asm/boot.h3
-rw-r--r--arch/x86/include/asm/cacheinfo.h3
-rw-r--r--arch/x86/include/asm/cfi.h126
-rw-r--r--arch/x86/include/asm/cmpxchg.h6
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h2
-rw-r--r--arch/x86/include/asm/coco.h14
-rw-r--r--arch/x86/include/asm/cpu.h24
-rw-r--r--arch/x86/include/asm/cpufeature.h10
-rw-r--r--arch/x86/include/asm/cpufeatures.h32
-rw-r--r--arch/x86/include/asm/crash_core.h44
-rw-r--r--arch/x86/include/asm/current.h1
-rw-r--r--arch/x86/include/asm/debugreg.h1
-rw-r--r--arch/x86/include/asm/desc_defs.h78
-rw-r--r--arch/x86/include/asm/disabled-features.h3
-rw-r--r--arch/x86/include/asm/elf.h3
-rw-r--r--arch/x86/include/asm/entry-common.h1
-rw-r--r--arch/x86/include/asm/extable_fixup_types.h2
-rw-r--r--arch/x86/include/asm/fb.h10
-rw-r--r--arch/x86/include/asm/fpu/types.h4
-rw-r--r--arch/x86/include/asm/hw_irq.h6
-rw-r--r--arch/x86/include/asm/ia32.h11
-rw-r--r--arch/x86/include/asm/init.h2
-rw-r--r--arch/x86/include/asm/intel-family.h2
-rw-r--r--arch/x86/include/asm/io.h10
-rw-r--r--arch/x86/include/asm/iosf_mbi.h2
-rw-r--r--arch/x86/include/asm/irq_work.h1
-rw-r--r--arch/x86/include/asm/jump_label.h6
-rw-r--r--arch/x86/include/asm/kmsan.h17
-rw-r--r--arch/x86/include/asm/kprobes.h2
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h101
-rw-r--r--arch/x86/include/asm/local.h33
-rw-r--r--arch/x86/include/asm/mce.h6
-rw-r--r--arch/x86/include/asm/mem_encrypt.h27
-rw-r--r--arch/x86/include/asm/microcode.h21
-rw-r--r--arch/x86/include/asm/mpspec.h2
-rw-r--r--arch/x86/include/asm/mshyperv.h4
-rw-r--r--arch/x86/include/asm/msr-index.h42
-rw-r--r--arch/x86/include/asm/mwait.h9
-rw-r--r--arch/x86/include/asm/nospec-branch.h132
-rw-r--r--arch/x86/include/asm/page.h6
-rw-r--r--arch/x86/include/asm/paravirt.h83
-rw-r--r--arch/x86/include/asm/paravirt_types.h88
-rw-r--r--arch/x86/include/asm/percpu.h112
-rw-r--r--arch/x86/include/asm/perf_event.h13
-rw-r--r--arch/x86/include/asm/pgtable.h15
-rw-r--r--arch/x86/include/asm/pgtable_64.h2
-rw-r--r--arch/x86/include/asm/pgtable_types.h5
-rw-r--r--arch/x86/include/asm/preempt.h5
-rw-r--r--arch/x86/include/asm/processor.h82
-rw-r--r--arch/x86/include/asm/prom.h5
-rw-r--r--arch/x86/include/asm/proto.h3
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h4
-rw-r--r--arch/x86/include/asm/required-features.h3
-rw-r--r--arch/x86/include/asm/rmwcc.h2
-rw-r--r--arch/x86/include/asm/set_memory.h1
-rw-r--r--arch/x86/include/asm/setup.h3
-rw-r--r--arch/x86/include/asm/sev.h14
-rw-r--r--arch/x86/include/asm/shared/tdx.h96
-rw-r--r--arch/x86/include/asm/smp.h4
-rw-r--r--arch/x86/include/asm/sparsemem.h2
-rw-r--r--arch/x86/include/asm/spec-ctrl.h11
-rw-r--r--arch/x86/include/asm/special_insns.h11
-rw-r--r--arch/x86/include/asm/suspend_32.h10
-rw-r--r--arch/x86/include/asm/svm.h6
-rw-r--r--arch/x86/include/asm/syscall.h17
-rw-r--r--arch/x86/include/asm/tdx.h51
-rw-r--r--arch/x86/include/asm/text-patching.h12
-rw-r--r--arch/x86/include/asm/topology.h12
-rw-r--r--arch/x86/include/asm/traps.h1
-rw-r--r--arch/x86/include/asm/uaccess.h10
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h2
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h4
-rw-r--r--arch/x86/include/asm/vsyscall.h10
-rw-r--r--arch/x86/include/asm/x86_init.h5
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h5
-rw-r--r--arch/x86/include/asm/xen/interface_64.h2
-rw-r--r--arch/x86/include/uapi/asm/amd_hsmp.h111
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h1
-rw-r--r--arch/x86/include/uapi/asm/kvm.h3
-rw-r--r--arch/x86/include/uapi/asm/signal.h1
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c38
-rw-r--r--arch/x86/kernel/acpi/cppc.c2
-rw-r--r--arch/x86/kernel/alternative.c223
-rw-r--r--arch/x86/kernel/amd_gart_64.c2
-rw-r--r--arch/x86/kernel/amd_nb.c9
-rw-r--r--arch/x86/kernel/aperture_64.c3
-rw-r--r--arch/x86/kernel/apic/Makefile2
-rw-r--r--arch/x86/kernel/apic/apic.c42
-rw-r--r--arch/x86/kernel/apic/apic_common.c4
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c12
-rw-r--r--arch/x86/kernel/apic/apic_noop.c9
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c18
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c7
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/apic/ipi.c13
-rw-r--r--arch/x86/kernel/apic/local.h7
-rw-r--r--arch/x86/kernel/apic/probe_32.c13
-rw-r--r--arch/x86/kernel/apic/vector.c4
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c8
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c9
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets.c33
-rw-r--r--arch/x86/kernel/callthunks.c22
-rw-r--r--arch/x86/kernel/cfi.c4
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/amd.c342
-rw-r--r--arch/x86/kernel/cpu/bugs.c326
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c49
-rw-r--r--arch/x86/kernel/cpu/common.c293
-rw-r--r--arch/x86/kernel/cpu/cpu.h3
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c6
-rw-r--r--arch/x86/kernel/cpu/debugfs.c58
-rw-r--r--arch/x86/kernel/cpu/hygon.c45
-rw-r--r--arch/x86/kernel/cpu/intel.c192
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c2
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c148
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c4
-rw-r--r--arch/x86/kernel/cpu/mce/core.c127
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c324
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h70
-rw-r--r--arch/x86/kernel/cpu/mce/threshold.c115
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c190
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c689
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c679
-rw-r--r--arch/x86/kernel/cpu/microcode/internal.h49
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c5
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c14
-rw-r--r--arch/x86/kernel/cpu/proc.c8
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c21
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c14
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h39
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c48
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c295
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c2
-rw-r--r--arch/x86/kernel/cpu/topology.c13
-rw-r--r--arch/x86/kernel/cpu/zhaoxin.c14
-rw-r--r--arch/x86/kernel/crash.c16
-rw-r--r--arch/x86/kernel/devicetree.c6
-rw-r--r--arch/x86/kernel/e820.c8
-rw-r--r--arch/x86/kernel/early-quirks.c4
-rw-r--r--arch/x86/kernel/eisa.c3
-rw-r--r--arch/x86/kernel/fpu/bugs.c1
-rw-r--r--arch/x86/kernel/fpu/core.c2
-rw-r--r--arch/x86/kernel/fpu/signal.c13
-rw-r--r--arch/x86/kernel/fpu/xstate.c6
-rw-r--r--arch/x86/kernel/fpu/xstate.h14
-rw-r--r--arch/x86/kernel/ftrace_32.S2
-rw-r--r--arch/x86/kernel/ftrace_64.S2
-rw-r--r--arch/x86/kernel/head32.c120
-rw-r--r--arch/x86/kernel/head64.c21
-rw-r--r--arch/x86/kernel/head_32.S12
-rw-r--r--arch/x86/kernel/head_64.S44
-rw-r--r--arch/x86/kernel/hpet.c8
-rw-r--r--arch/x86/kernel/idt.c7
-rw-r--r--arch/x86/kernel/irqflags.S2
-rw-r--r--arch/x86/kernel/itmt.c1
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c23
-rw-r--r--arch/x86/kernel/kprobes/core.c11
-rw-r--r--arch/x86/kernel/kvm.c15
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/ldt.c6
-rw-r--r--arch/x86/kernel/machine_kexec_64.c7
-rw-r--r--arch/x86/kernel/module.c20
-rw-r--r--arch/x86/kernel/nmi.c14
-rw-r--r--arch/x86/kernel/paravirt.c54
-rw-r--r--arch/x86/kernel/probe_roms.c10
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/setup.c164
-rw-r--r--arch/x86/kernel/setup_percpu.c4
-rw-r--r--arch/x86/kernel/sev-shared.c37
-rw-r--r--arch/x86/kernel/sev.c47
-rw-r--r--arch/x86/kernel/signal.c1
-rw-r--r--arch/x86/kernel/smpboot.c114
-rw-r--r--arch/x86/kernel/topology.c33
-rw-r--r--arch/x86/kernel/traps.c5
-rw-r--r--arch/x86/kernel/tsc_sync.c6
-rw-r--r--arch/x86/kernel/unwind_orc.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S20
-rw-r--r--arch/x86/kernel/vsmp_64.c2
-rw-r--r--arch/x86/kernel/x86_init.c2
-rw-r--r--arch/x86/kvm/Kconfig59
-rw-r--r--arch/x86/kvm/Makefile16
-rw-r--r--arch/x86/kvm/cpuid.c46
-rw-r--r--arch/x86/kvm/cpuid.h26
-rw-r--r--arch/x86/kvm/debugfs.c3
-rw-r--r--arch/x86/kvm/emulate.c27
-rw-r--r--arch/x86/kvm/governed_features.h1
-rw-r--r--arch/x86/kvm/hyperv.c52
-rw-r--r--arch/x86/kvm/hyperv.h88
-rw-r--r--arch/x86/kvm/irq.c2
-rw-r--r--arch/x86/kvm/irq_comm.c9
-rw-r--r--arch/x86/kvm/kvm_emulate.h9
-rw-r--r--arch/x86/kvm/kvm_onhyperv.h20
-rw-r--r--arch/x86/kvm/lapic.c13
-rw-r--r--arch/x86/kvm/mmu.h15
-rw-r--r--arch/x86/kvm/mmu/mmu.c399
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h3
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h2
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c2
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c116
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h3
-rw-r--r--arch/x86/kvm/mtrr.c2
-rw-r--r--arch/x86/kvm/pmu.c109
-rw-r--r--arch/x86/kvm/pmu.h29
-rw-r--r--arch/x86/kvm/reverse_cpuid.h38
-rw-r--r--arch/x86/kvm/smm.c1
-rw-r--r--arch/x86/kvm/svm/hyperv.h9
-rw-r--r--arch/x86/kvm/svm/nested.c34
-rw-r--r--arch/x86/kvm/svm/pmu.c1
-rw-r--r--arch/x86/kvm/svm/sev.c94
-rw-r--r--arch/x86/kvm/svm/svm.c81
-rw-r--r--arch/x86/kvm/svm/svm.h4
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.c10
-rw-r--r--arch/x86/kvm/svm/svm_ops.h6
-rw-r--r--arch/x86/kvm/svm/vmenter.S10
-rw-r--r--arch/x86/kvm/trace.h10
-rw-r--r--arch/x86/kvm/vmx/hyperv.c447
-rw-r--r--arch/x86/kvm/vmx/hyperv.h204
-rw-r--r--arch/x86/kvm/vmx/hyperv_evmcs.c315
-rw-r--r--arch/x86/kvm/vmx/hyperv_evmcs.h166
-rw-r--r--arch/x86/kvm/vmx/nested.c162
-rw-r--r--arch/x86/kvm/vmx/nested.h3
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c20
-rw-r--r--arch/x86/kvm/vmx/run_flags.h7
-rw-r--r--arch/x86/kvm/vmx/sgx.c1
-rw-r--r--arch/x86/kvm/vmx/vmenter.S13
-rw-r--r--arch/x86/kvm/vmx/vmx.c177
-rw-r--r--arch/x86/kvm/vmx/vmx.h14
-rw-r--r--arch/x86/kvm/vmx/vmx_onhyperv.c36
-rw-r--r--arch/x86/kvm/vmx/vmx_onhyperv.h125
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h8
-rw-r--r--arch/x86/kvm/x86.c473
-rw-r--r--arch/x86/kvm/x86.h3
-rw-r--r--arch/x86/kvm/xen.c72
-rw-r--r--arch/x86/kvm/xen.h18
-rw-r--r--arch/x86/lib/Makefile13
-rw-r--r--arch/x86/lib/cache-smp.c1
-rw-r--r--arch/x86/lib/checksum_32.S2
-rw-r--r--arch/x86/lib/clear_page_64.S2
-rw-r--r--arch/x86/lib/cmpxchg8b_emu.S2
-rw-r--r--arch/x86/lib/copy_page_64.S2
-rw-r--r--arch/x86/lib/copy_user_64.S2
-rw-r--r--arch/x86/lib/copy_user_uncached_64.S2
-rw-r--r--arch/x86/lib/csum-wrappers_64.c5
-rw-r--r--arch/x86/lib/delay.c2
-rw-r--r--arch/x86/lib/getuser.S26
-rw-r--r--arch/x86/lib/hweight.S22
-rw-r--r--arch/x86/lib/memcpy_64.S2
-rw-r--r--arch/x86/lib/memmove_32.S2
-rw-r--r--arch/x86/lib/memmove_64.S2
-rw-r--r--arch/x86/lib/memset_64.S2
-rw-r--r--arch/x86/lib/putuser.S23
-rw-r--r--arch/x86/lib/retpoline.S191
-rw-r--r--arch/x86/lib/x86-opcode-map.txt10
-rw-r--r--arch/x86/mm/fault.c48
-rw-r--r--arch/x86/mm/init_64.c10
-rw-r--r--arch/x86/mm/kasan_init_64.c2
-rw-r--r--arch/x86/mm/maccess.c10
-rw-r--r--arch/x86/mm/mem_encrypt.c34
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c54
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c110
-rw-r--r--arch/x86/mm/numa.c66
-rw-r--r--arch/x86/mm/pat/memtype.c51
-rw-r--r--arch/x86/mm/pat/set_memory.c102
-rw-r--r--arch/x86/mm/pgtable.c5
-rw-r--r--arch/x86/mm/pti.c62
-rw-r--r--arch/x86/mm/tlb.c2
-rw-r--r--arch/x86/net/bpf_jit_comp.c492
-rw-r--r--arch/x86/net/bpf_jit_comp32.c2
-rw-r--r--arch/x86/pci/acpi.c3
-rw-r--r--arch/x86/pci/fixup.c48
-rw-r--r--arch/x86/pci/mmconfig-shared.c171
-rw-r--r--arch/x86/pci/mmconfig_32.c2
-rw-r--r--arch/x86/pci/mmconfig_64.c42
-rw-r--r--arch/x86/pci/pcbios.c28
-rw-r--r--arch/x86/pci/sta2x11-fixup.c1
-rw-r--r--arch/x86/platform/intel-quark/imr_selftest.c2
-rw-r--r--arch/x86/platform/pvh/enlighten.c3
-rw-r--r--arch/x86/platform/pvh/head.S9
-rw-r--r--arch/x86/platform/uv/uv_irq.c2
-rw-r--r--arch/x86/platform/uv/uv_nmi.c106
-rw-r--r--arch/x86/platform/uv/uv_time.c4
-rw-r--r--arch/x86/power/hibernate.c2
-rw-r--r--arch/x86/purgatory/Makefile3
-rw-r--r--arch/x86/realmode/init.c2
-rw-r--r--arch/x86/realmode/rm/reboot.S3
-rw-r--r--arch/x86/tools/Makefile2
-rw-r--r--arch/x86/tools/chkobjdump.awk34
-rw-r--r--arch/x86/tools/objdump_reformat.awk6
-rw-r--r--arch/x86/tools/relocs.c19
-rw-r--r--arch/x86/um/asm/elf.h4
-rw-r--r--arch/x86/um/asm/processor_64.h3
-rw-r--r--arch/x86/um/os-Linux/Makefile1
-rw-r--r--arch/x86/um/os-Linux/prctl.c12
-rw-r--r--arch/x86/um/ptrace_32.c24
-rw-r--r--arch/x86/um/ptrace_64.c26
-rw-r--r--arch/x86/um/shared/sysdep/ptrace_32.h4
-rw-r--r--arch/x86/um/shared/sysdep/ptrace_user.h12
-rw-r--r--arch/x86/um/shared/sysdep/stub_32.h39
-rw-r--r--arch/x86/um/shared/sysdep/stub_64.h17
-rw-r--r--arch/x86/um/syscalls_64.c62
-rw-r--r--arch/x86/um/sysrq_64.c1
-rw-r--r--arch/x86/um/tls_64.c2
-rw-r--r--arch/x86/um/vdso/Makefile12
-rw-r--r--arch/x86/video/fbdev.c15
-rw-r--r--arch/x86/virt/Makefile2
-rw-r--r--arch/x86/virt/vmx/Makefile2
-rw-r--r--arch/x86/virt/vmx/tdx/Makefile2
-rw-r--r--arch/x86/virt/vmx/tdx/seamcall.S61
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.c1492
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.h121
-rw-r--r--arch/x86/virt/vmx/tdx/tdxcall.S226
-rw-r--r--arch/x86/xen/apic.c10
-rw-r--r--arch/x86/xen/enlighten.c32
-rw-r--r--arch/x86/xen/enlighten_pvh.c68
-rw-r--r--arch/x86/xen/irq.c2
-rw-r--r--arch/x86/xen/mmu_pv.c6
-rw-r--r--arch/x86/xen/setup.c44
-rw-r--r--arch/x86/xen/smp.c12
-rw-r--r--arch/x86/xen/xen-ops.h14
428 files changed, 12452 insertions, 7683 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fe3292e31..bfccf1213 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,7 +28,6 @@ config X86_64
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_SUPPORTS_PER_VMA_LOCK
- select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
@@ -60,8 +59,10 @@ config X86
#
select ACPI_LEGACY_TABLES_LOOKUP if ACPI
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
+ select ACPI_HOTPLUG_CPU if ACPI_PROCESSOR && HOTPLUG_CPU
select ARCH_32BIT_OFF_T if X86_32
select ARCH_CLOCKSOURCE_INIT
+ select ARCH_CONFIGURES_CPU_MITIGATIONS
select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64
@@ -72,6 +73,7 @@ config X86
select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
select ARCH_HAS_CPU_FINALIZE_INIT
+ select ARCH_HAS_CPU_PASID if IOMMU_SVA
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
@@ -89,6 +91,7 @@ config X86
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PTE_DEVMAP if X86_64
select ARCH_HAS_PTE_SPECIAL
+ select ARCH_HAS_HW_PTE_YOUNG
select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_COPY_MC if X86_64
@@ -118,6 +121,7 @@ config X86
select ARCH_SUPPORTS_LTO_CLANG
select ARCH_SUPPORTS_LTO_CLANG_THIN
select ARCH_USE_BUILTIN_BSWAP
+ select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64
select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
@@ -147,6 +151,7 @@ config X86
select GENERIC_CLOCKEVENTS_MIN_ADJUST
select GENERIC_CMOS_UPDATE
select GENERIC_CPU_AUTOPROBE
+ select GENERIC_CPU_DEVICES
select GENERIC_CPU_VULNERABILITIES
select GENERIC_EARLY_IOREMAP
select GENERIC_ENTRY
@@ -169,7 +174,7 @@ config X86
select HAS_IOPORT
select HAVE_ACPI_APEI if ACPI
select HAVE_ACPI_APEI_NMI if ACPI
- select HAVE_ALIGNED_STRUCT_PAGE if SLUB
+ select HAVE_ALIGNED_STRUCT_PAGE
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
select HAVE_ARCH_HUGE_VMALLOC if X86_64
@@ -384,10 +389,6 @@ config HAVE_INTEL_TXT
def_bool y
depends on INTEL_IOMMU && ACPI
-config X86_32_SMP
- def_bool y
- depends on X86_32 && SMP
-
config X86_64_SMP
def_bool y
depends on X86_64 && SMP
@@ -1313,16 +1314,41 @@ config MICROCODE
def_bool y
depends on CPU_SUP_AMD || CPU_SUP_INTEL
+config MICROCODE_INITRD32
+ def_bool y
+ depends on MICROCODE && X86_32 && BLK_DEV_INITRD
+
config MICROCODE_LATE_LOADING
bool "Late microcode loading (DANGEROUS)"
default n
- depends on MICROCODE
+ depends on MICROCODE && SMP
help
Loading microcode late, when the system is up and executing instructions
is a tricky business and should be avoided if possible. Just the sequence
of synchronizing all cores and SMT threads is one fragile dance which does
not guarantee that cores might not softlock after the loading. Therefore,
- use this at your own risk. Late loading taints the kernel too.
+ use this at your own risk. Late loading taints the kernel unless the
+ microcode header indicates that it is safe for late loading via the
+ minimal revision check. This minimal revision check can be enforced on
+ the kernel command line with "microcode.minrev=Y".
+
+config MICROCODE_LATE_FORCE_MINREV
+ bool "Enforce late microcode loading minimal revision check"
+ default n
+ depends on MICROCODE_LATE_LOADING
+ help
+ To prevent that users load microcode late which modifies already
+ in use features, newer microcode patches have a minimum revision field
+ in the microcode header, which tells the kernel which minimum
+ revision must be active in the CPU to safely load that new microcode
+ late into the running system. If disabled the check will not
+ be enforced but the kernel will be tainted when the minimal
+ revision check fails.
+
+ This minimal revision check can also be controlled via the
+ "microcode.minrev" parameter on the kernel command line.
+
+ If unsure say Y.
config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support"
@@ -1390,7 +1416,7 @@ config HIGHMEM4G
config HIGHMEM64G
bool "64GB"
- depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6
+ depends on X86_HAVE_PAE
select X86_PAE
help
Select this if you have a 32-bit processor and more than 4
@@ -1447,7 +1473,7 @@ config HIGHMEM
config X86_PAE
bool "PAE (Physical Address Extension) Support"
- depends on X86_32 && !HIGHMEM4G
+ depends on X86_32 && X86_HAVE_PAE
select PHYS_ADDR_T_64BIT
select SWIOTLB
help
@@ -1514,19 +1540,6 @@ config AMD_MEM_ENCRYPT
This requires an AMD processor that supports Secure Memory
Encryption (SME).
-config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
- bool "Activate AMD Secure Memory Encryption (SME) by default"
- depends on AMD_MEM_ENCRYPT
- help
- Say yes to have system memory encrypted by default if running on
- an AMD processor that supports Secure Memory Encryption (SME).
-
- If set to Y, then the encryption of system memory can be
- deactivated with the mem_encrypt=off command line option.
-
- If set to N, then the encryption of system memory can be
- activated with the mem_encrypt=on command line option.
-
# Common NUMA Features
config NUMA
bool "NUMA Memory Allocation and Scheduler Support"
@@ -1534,6 +1547,7 @@ config NUMA
depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP)
default y if X86_BIGSMP
select USE_PERCPU_NUMA_NODE_ID
+ select OF_NUMA if OF
help
Enable NUMA (Non-Uniform Memory Access) support.
@@ -1939,6 +1953,23 @@ config X86_USER_SHADOW_STACK
If unsure, say N.
+config INTEL_TDX_HOST
+ bool "Intel Trust Domain Extensions (TDX) host support"
+ depends on CPU_SUP_INTEL
+ depends on X86_64
+ depends on KVM_INTEL
+ depends on X86_X2APIC
+ select ARCH_KEEP_MEMBLOCK
+ depends on CONTIG_ALLOC
+ depends on !KEXEC_CORE
+ depends on X86_MCE
+ help
+ Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
+ host and certain physical attacks. This option enables necessary TDX
+ support in the host kernel to run confidential VMs.
+
+ If unsure, say N.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
@@ -2062,6 +2093,9 @@ config ARCH_SUPPORTS_CRASH_DUMP
config ARCH_SUPPORTS_CRASH_HOTPLUG
def_bool y
+config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ def_bool CRASH_CORE
+
config PHYSICAL_START
hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
default "0x1000000"
@@ -2434,17 +2468,21 @@ config PREFIX_SYMBOLS
def_bool y
depends on CALL_PADDING && !CFI_CLANG
-menuconfig SPECULATION_MITIGATIONS
- bool "Mitigations for speculative execution vulnerabilities"
+menuconfig CPU_MITIGATIONS
+ bool "Mitigations for CPU vulnerabilities"
default y
help
- Say Y here to enable options which enable mitigations for
- speculative execution hardware vulnerabilities.
+ Say Y here to enable options which enable mitigations for hardware
+ vulnerabilities (usually related to speculative execution).
+ Mitigations can be disabled or restricted to SMT systems at runtime
+ via the "mitigations" kernel parameter.
- If you say N, all mitigations will be disabled. You really
- should know what you are doing to say so.
+ If you say N, all mitigations will be disabled. This CANNOT be
+ overridden at runtime.
-if SPECULATION_MITIGATIONS
+ Say 'Y', unless you really know what you are doing.
+
+if CPU_MITIGATIONS
config PAGE_TABLE_ISOLATION
bool "Remove the kernel mapping in user mode"
@@ -2568,6 +2606,27 @@ config GDS_FORCE_MITIGATION
If in doubt, say N.
+config MITIGATION_RFDS
+ bool "RFDS Mitigation"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for Register File Data Sampling (RFDS) by default.
+ RFDS is a hardware vulnerability which affects Intel Atom CPUs. It
+ allows unprivileged speculative access to stale data previously
+ stored in floating point, vector and integer registers.
+ See also <file:Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst>
+
+config MITIGATION_SPECTRE_BHI
+ bool "Mitigate Spectre-BHB (Branch History Injection)"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable BHI mitigations. BHI attacks are a form of Spectre V2 attacks
+ where the branch history buffer is poisoned to speculatively steer
+ indirect branches.
+ See <file:Documentation/admin-guide/hw-vuln/spectre.rst>
+
endif
config ARCH_HAS_ADD_PAGES
@@ -2954,6 +3013,15 @@ config IA32_EMULATION
64-bit kernel. You should likely turn this on, unless you're
100% sure that you don't have any 32-bit programs left.
+config IA32_EMULATION_DEFAULT_DISABLED
+ bool "IA32 emulation disabled by default"
+ default n
+ depends on IA32_EMULATION
+ help
+ Make IA32 emulation disabled by default. This prevents loading 32-bit
+ processes and access to 32-bit syscalls. If unsure, leave it to its
+ default value.
+
config X86_X32_ABI
bool "x32 ABI for 64-bit mode"
depends on X86_64
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 00468adf1..2a7279d80 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -362,9 +362,13 @@ config X86_TSC
def_bool y
depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64
+config X86_HAVE_PAE
+ def_bool y
+ depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC7 || MCORE2 || MATOM || X86_64
+
config X86_CMPXCHG64
def_bool y
- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8
+ depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7
# this should be set for all -march=.. options where the compiler
# generates cmov.
@@ -375,7 +379,7 @@ config X86_CMOV
config X86_MINIMUM_CPU_FAMILY
int
default "64" if X86_64
- default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8)
+ default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8)
default "5" if X86_32 && X86_CMPXCHG64
default "4"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5bfe5caaa..da8f3caf2 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -43,7 +43,7 @@ endif
# How to compile the 16-bit code. Note we always compile for -march=i386;
# that way we can complain to the user if the CPU is insufficient.
-REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
+REALMODE_CFLAGS := -std=gnu11 -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
-Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-mno-mmx -mno-sse $(call cc-option,-fcf-protection=none)
@@ -81,6 +81,7 @@ ifeq ($(CONFIG_X86_KERNEL_IBT),y)
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104816
#
KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch -fno-jump-tables)
+KBUILD_RUSTFLAGS += -Zcf-protection=branch -Zno-jump-tables
else
KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
endif
@@ -111,13 +112,13 @@ ifeq ($(CONFIG_X86_32),y)
# temporary until string.h is fixed
KBUILD_CFLAGS += -ffreestanding
- ifeq ($(CONFIG_STACKPROTECTOR),y)
- ifeq ($(CONFIG_SMP),y)
+ ifeq ($(CONFIG_STACKPROTECTOR),y)
+ ifeq ($(CONFIG_SMP),y)
KBUILD_CFLAGS += -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard
- else
+ else
KBUILD_CFLAGS += -mstack-protector-guard=global
- endif
- endif
+ endif
+ endif
else
BITS := 64
UTS_MACHINE := x86_64
@@ -252,6 +253,8 @@ archheaders:
libs-y += arch/x86/lib/
+core-y += arch/x86/virt/
+
# drivers-y are linked after core-y
drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
drivers-$(CONFIG_PCI) += arch/x86/pci/
@@ -291,9 +294,10 @@ PHONY += install
install:
$(call cmd,install)
-PHONY += vdso_install
-vdso_install:
- $(Q)$(MAKE) $(build)=arch/x86/entry/vdso $@
+vdso-install-$(CONFIG_X86_64) += arch/x86/entry/vdso/vdso64.so.dbg
+vdso-install-$(CONFIG_X86_X32_ABI) += arch/x86/entry/vdso/vdsox32.so.dbg
+vdso-install-$(CONFIG_X86_32) += arch/x86/entry/vdso/vdso32.so.dbg
+vdso-install-$(CONFIG_IA32_EMULATION) += arch/x86/entry/vdso/vdso32.so.dbg
archprepare: checkbin
checkbin:
diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink
index 936093d29..fef2e977c 100644
--- a/arch/x86/Makefile.postlink
+++ b/arch/x86/Makefile.postlink
@@ -34,9 +34,6 @@ ifeq ($(CONFIG_X86_NEED_RELOCS),y)
$(call cmd,strip_relocs)
endif
-%.ko: FORCE
- @true
-
clean:
@rm -f $(OUT_RELOCS)/vmlinux.relocs
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index f33e45ed1..3cece19b7 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -89,7 +89,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|efi32_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|z_.*\)$$/\#define ZO_\2 0x\1/p'
quiet_cmd_zoffset = ZOFFSET $@
cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 71fc531b9..e9522c689 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -53,7 +53,7 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS
KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
-# sev.c indirectly inludes inat-table.h which is generated during
+# sev.c indirectly includes inat-table.h which is generated during
# compilation and stored in $(objtree). Add the directory to the includes so
# that the compiler finds it even with out-of-tree builds (make O=/some/path).
CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/
@@ -84,7 +84,7 @@ LDFLAGS_vmlinux += -T
hostprogs := mkpiggy
HOST_EXTRACFLAGS += -I$(srctree)/tools/include
-sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
quiet_cmd_voffset = VOFFSET $@
cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index 9caf89063..18d15d1ce 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -30,13 +30,13 @@ __efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len)
* Search EFI system tables for RSDP. Preferred is ACPI_20_TABLE_GUID to
* ACPI_TABLE_GUID because it has more features.
*/
- rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len,
+ rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len,
ACPI_20_TABLE_GUID);
if (rsdp_addr)
return (acpi_physical_address)rsdp_addr;
/* No ACPI_20_TABLE_GUID found, fallback to ACPI_TABLE_GUID. */
- rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len,
+ rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len,
ACPI_TABLE_GUID);
if (rsdp_addr)
return (acpi_physical_address)rsdp_addr;
@@ -56,15 +56,15 @@ static acpi_physical_address efi_get_rsdp_addr(void)
enum efi_type et;
int ret;
- et = efi_get_type(boot_params);
+ et = efi_get_type(boot_params_ptr);
if (et == EFI_TYPE_NONE)
return 0;
- systab_pa = efi_get_system_table(boot_params);
+ systab_pa = efi_get_system_table(boot_params_ptr);
if (!systab_pa)
error("EFI support advertised, but unable to locate system table.");
- ret = efi_get_conf_table(boot_params, &cfg_tbl_pa, &cfg_tbl_len);
+ ret = efi_get_conf_table(boot_params_ptr, &cfg_tbl_pa, &cfg_tbl_len);
if (ret || !cfg_tbl_pa)
error("EFI config table not found.");
@@ -156,7 +156,7 @@ acpi_physical_address get_rsdp_addr(void)
{
acpi_physical_address pa;
- pa = boot_params->acpi_rsdp_addr;
+ pa = boot_params_ptr->acpi_rsdp_addr;
if (!pa)
pa = efi_get_rsdp_addr();
@@ -178,7 +178,7 @@ static unsigned long get_cmdline_acpi_rsdp(void)
{
unsigned long addr = 0;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
char val[MAX_ADDR_LEN] = { };
int ret;
@@ -210,7 +210,7 @@ static unsigned long get_acpi_srat_table(void)
rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp();
if (!rsdp)
rsdp = (struct acpi_table_rsdp *)(long)
- boot_params->acpi_rsdp_addr;
+ boot_params_ptr->acpi_rsdp_addr;
if (!rsdp)
return 0;
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index f1add5d85..c1bb18097 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -14,9 +14,9 @@ static inline char rdfs8(addr_t addr)
#include "../cmdline.c"
unsigned long get_cmd_line_ptr(void)
{
- unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr;
+ unsigned long cmd_line_ptr = boot_params_ptr->hdr.cmd_line_ptr;
- cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32;
+ cmd_line_ptr |= (u64)boot_params_ptr->ext_cmd_line_ptr << 32;
return cmd_line_ptr;
}
diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S
index f4e22ef77..876fc6d46 100644
--- a/arch/x86/boot/compressed/efi_mixed.S
+++ b/arch/x86/boot/compressed/efi_mixed.S
@@ -15,10 +15,12 @@
*/
#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/page_types.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
+#include <asm/setup.h>
.code64
.text
@@ -49,6 +51,11 @@ SYM_FUNC_START(startup_64_mixed_mode)
lea efi32_boot_args(%rip), %rdx
mov 0(%rdx), %edi
mov 4(%rdx), %esi
+
+ /* Switch to the firmware's stack */
+ movl efi32_boot_sp(%rip), %esp
+ andl $~7, %esp
+
#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
mov 8(%rdx), %edx // saved bootparams pointer
test %edx, %edx
@@ -144,6 +151,7 @@ SYM_FUNC_END(__efi64_thunk)
SYM_FUNC_START(efi32_stub_entry)
call 1f
1: popl %ecx
+ leal (efi32_boot_args - 1b)(%ecx), %ebx
/* Clear BSS */
xorl %eax, %eax
@@ -158,6 +166,7 @@ SYM_FUNC_START(efi32_stub_entry)
popl %ecx
popl %edx
popl %esi
+ movl %esi, 8(%ebx)
jmp efi32_entry
SYM_FUNC_END(efi32_stub_entry)
#endif
@@ -234,8 +243,6 @@ SYM_FUNC_END(efi_enter32)
*
* Arguments: %ecx image handle
* %edx EFI system table pointer
- * %esi struct bootparams pointer (or NULL when not using
- * the EFI handover protocol)
*
* Since this is the point of no return for ordinary execution, no registers
* are considered live except for the function parameters. [Note that the EFI
@@ -254,13 +261,25 @@ SYM_FUNC_START_LOCAL(efi32_entry)
/* Store firmware IDT descriptor */
sidtl (efi32_boot_idt - 1b)(%ebx)
+ /* Store firmware stack pointer */
+ movl %esp, (efi32_boot_sp - 1b)(%ebx)
+
/* Store boot arguments */
leal (efi32_boot_args - 1b)(%ebx), %ebx
movl %ecx, 0(%ebx)
movl %edx, 4(%ebx)
- movl %esi, 8(%ebx)
movb $0x0, 12(%ebx) // efi_is64
+ /*
+ * Allocate some memory for a temporary struct boot_params, which only
+ * needs the minimal pieces that startup_32() relies on.
+ */
+ subl $PARAM_SIZE, %esp
+ movl %esp, %esi
+ movl $PAGE_SIZE, BP_kernel_alignment(%esi)
+ movl $_end - 1b, BP_init_size(%esi)
+ subl $startup_32 - 1b, BP_init_size(%esi)
+
/* Disable paging */
movl %cr0, %eax
btrl $X86_CR0_PG_BIT, %eax
@@ -286,8 +305,7 @@ SYM_FUNC_START(efi32_pe_entry)
movl 8(%ebp), %ecx // image_handle
movl 12(%ebp), %edx // sys_table
- xorl %esi, %esi
- jmp efi32_entry // pass %ecx, %edx, %esi
+ jmp efi32_entry // pass %ecx, %edx
// no other registers remain live
2: popl %edi // restore callee-save registers
@@ -318,5 +336,6 @@ SYM_DATA_END(efi32_boot_idt)
SYM_DATA_LOCAL(efi32_boot_cs, .word 0)
SYM_DATA_LOCAL(efi32_boot_ds, .word 0)
+SYM_DATA_LOCAL(efi32_boot_sp, .long 0)
SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0)
SYM_DATA(efi_is64, .byte 1)
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index bf4a10a57..1dcb794c5 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -398,6 +398,11 @@ SYM_CODE_START(startup_64)
call sev_enable
#endif
+ /* Preserve only the CR4 bits that must be preserved, and clear the rest */
+ movq %cr4, %rax
+ andl $(X86_CR4_PAE | X86_CR4_MCE | X86_CR4_LA57), %eax
+ movq %rax, %cr4
+
/*
* configure_5level_paging() updates the number of paging levels using
* a trampoline in 32-bit addressable memory if the current number does
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index 08f93b040..d040080d7 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -159,8 +159,9 @@ void initialize_identity_maps(void *rmode)
* or does not touch all the pages covering them.
*/
kernel_add_identity_map((unsigned long)_head, (unsigned long)_end);
- boot_params = rmode;
- kernel_add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1));
+ boot_params_ptr = rmode;
+ kernel_add_identity_map((unsigned long)boot_params_ptr,
+ (unsigned long)(boot_params_ptr + 1));
cmdline = get_cmd_line_ptr();
kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);
@@ -168,7 +169,7 @@ void initialize_identity_maps(void *rmode)
* Also map the setup_data entries passed via boot_params in case they
* need to be accessed by uncompressed kernel via the identity mapping.
*/
- sd = (struct setup_data *)boot_params->hdr.setup_data;
+ sd = (struct setup_data *)boot_params_ptr->hdr.setup_data;
while (sd) {
unsigned long sd_addr = (unsigned long)sd;
@@ -385,3 +386,8 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
*/
kernel_add_identity_map(address, end);
}
+
+void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code)
+{
+ /* Empty handler to ignore NMI during early boot */
+}
diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
index 3cdf94b41..d100284bb 100644
--- a/arch/x86/boot/compressed/idt_64.c
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -61,6 +61,7 @@ void load_stage2_idt(void)
boot_idt_desc.address = (unsigned long)boot_idt;
set_idt_entry(X86_TRAP_PF, boot_page_fault);
+ set_idt_entry(X86_TRAP_NMI, boot_nmi_trap);
#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S
index 22890e199..4d03c8562 100644
--- a/arch/x86/boot/compressed/idt_handlers_64.S
+++ b/arch/x86/boot/compressed/idt_handlers_64.S
@@ -70,6 +70,7 @@ SYM_FUNC_END(\name)
.code64
EXCEPTION_HANDLER boot_page_fault do_boot_page_fault error_code=1
+EXCEPTION_HANDLER boot_nmi_trap do_boot_nmi_trap error_code=0
#ifdef CONFIG_AMD_MEM_ENCRYPT
EXCEPTION_HANDLER boot_stage1_vc do_vc_no_ghcb error_code=1
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 9193acf0e..dec961c6d 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -63,7 +63,7 @@ static unsigned long get_boot_seed(void)
unsigned long hash = 0;
hash = rotate_xor(hash, build_str, sizeof(build_str));
- hash = rotate_xor(hash, boot_params, sizeof(*boot_params));
+ hash = rotate_xor(hash, boot_params_ptr, sizeof(*boot_params_ptr));
return hash;
}
@@ -383,7 +383,7 @@ static void handle_mem_options(void)
static void mem_avoid_init(unsigned long input, unsigned long input_size,
unsigned long output)
{
- unsigned long init_size = boot_params->hdr.init_size;
+ unsigned long init_size = boot_params_ptr->hdr.init_size;
u64 initrd_start, initrd_size;
unsigned long cmd_line, cmd_line_size;
@@ -395,10 +395,10 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
/* Avoid initrd. */
- initrd_start = (u64)boot_params->ext_ramdisk_image << 32;
- initrd_start |= boot_params->hdr.ramdisk_image;
- initrd_size = (u64)boot_params->ext_ramdisk_size << 32;
- initrd_size |= boot_params->hdr.ramdisk_size;
+ initrd_start = (u64)boot_params_ptr->ext_ramdisk_image << 32;
+ initrd_start |= boot_params_ptr->hdr.ramdisk_image;
+ initrd_size = (u64)boot_params_ptr->ext_ramdisk_size << 32;
+ initrd_size |= boot_params_ptr->hdr.ramdisk_size;
mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
/* No need to set mapping for initrd, it will be handled in VO. */
@@ -413,8 +413,8 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
}
/* Avoid boot parameters. */
- mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
- mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params);
+ mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params_ptr;
+ mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params_ptr);
/* We don't need to set a mapping for setup_data. */
@@ -447,7 +447,7 @@ static bool mem_avoid_overlap(struct mem_vector *img,
}
/* Avoid all entries in the setup_data linked list. */
- ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data;
+ ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data;
while (ptr) {
struct mem_vector avoid;
@@ -706,7 +706,7 @@ static inline bool memory_type_is_free(efi_memory_desc_t *md)
static bool
process_efi_entries(unsigned long minimum, unsigned long image_size)
{
- struct efi_info *e = &boot_params->efi_info;
+ struct efi_info *e = &boot_params_ptr->efi_info;
bool efi_mirror_found = false;
struct mem_vector region;
efi_memory_desc_t *md;
@@ -777,8 +777,8 @@ static void process_e820_entries(unsigned long minimum,
struct boot_e820_entry *entry;
/* Verify potential e820 positions, appending to slots list. */
- for (i = 0; i < boot_params->e820_entries; i++) {
- entry = &boot_params->e820_table[i];
+ for (i = 0; i < boot_params_ptr->e820_entries; i++) {
+ entry = &boot_params_ptr->e820_table[i];
/* Skip non-RAM entries. */
if (entry->type != E820_TYPE_RAM)
continue;
@@ -852,7 +852,7 @@ void choose_random_location(unsigned long input,
return;
}
- boot_params->hdr.loadflags |= KASLR_FLAG;
+ boot_params_ptr->hdr.loadflags |= KASLR_FLAG;
if (IS_ENABLED(CONFIG_X86_32))
mem_limit = KERNEL_IMAGE_SIZE;
diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c
index 3c1609245..dbba332e4 100644
--- a/arch/x86/boot/compressed/mem.c
+++ b/arch/x86/boot/compressed/mem.c
@@ -8,7 +8,7 @@
/*
* accept_memory() and process_unaccepted_memory() called from EFI stub which
- * runs before decompresser and its early_tdx_detect().
+ * runs before decompressor and its early_tdx_detect().
*
* Enumerate TDX directly from the early users.
*/
@@ -54,17 +54,17 @@ bool init_unaccepted_memory(void)
enum efi_type et;
int ret;
- et = efi_get_type(boot_params);
+ et = efi_get_type(boot_params_ptr);
if (et == EFI_TYPE_NONE)
return false;
- ret = efi_get_conf_table(boot_params, &cfg_table_pa, &cfg_table_len);
+ ret = efi_get_conf_table(boot_params_ptr, &cfg_table_pa, &cfg_table_len);
if (ret) {
warn("EFI config table not found.");
return false;
}
- table = (void *)efi_find_vendor_table(boot_params, cfg_table_pa,
+ table = (void *)efi_find_vendor_table(boot_params_ptr, cfg_table_pa,
cfg_table_len, guid);
if (!table)
return false;
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index f711f2a85..ee0fac468 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -46,7 +46,7 @@ void *memmove(void *dest, const void *src, size_t n);
/*
* This is set up by the setup-routine at boot-time
*/
-struct boot_params *boot_params;
+struct boot_params *boot_params_ptr;
struct port_io_ops pio_ops;
@@ -132,8 +132,8 @@ void __putstr(const char *s)
if (lines == 0 || cols == 0)
return;
- x = boot_params->screen_info.orig_x;
- y = boot_params->screen_info.orig_y;
+ x = boot_params_ptr->screen_info.orig_x;
+ y = boot_params_ptr->screen_info.orig_y;
while ((c = *s++) != '\0') {
if (c == '\n') {
@@ -154,8 +154,8 @@ void __putstr(const char *s)
}
}
- boot_params->screen_info.orig_x = x;
- boot_params->screen_info.orig_y = y;
+ boot_params_ptr->screen_info.orig_x = x;
+ boot_params_ptr->screen_info.orig_y = y;
pos = (x + cols * y) * 2; /* Update cursor position */
outb(14, vidport);
@@ -330,6 +330,7 @@ static size_t parse_elf(void *output)
return ehdr.e_entry - LOAD_PHYSICAL_ADDR;
}
+const unsigned long kernel_text_size = VO___start_rodata - VO__text;
const unsigned long kernel_total_size = VO__end - VO__text;
static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4);
@@ -358,6 +359,19 @@ unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
}
/*
+ * Set the memory encryption xloadflag based on the mem_encrypt= command line
+ * parameter, if provided.
+ */
+static void parse_mem_encrypt(struct setup_header *hdr)
+{
+ int on = cmdline_find_option_bool("mem_encrypt=on");
+ int off = cmdline_find_option_bool("mem_encrypt=off");
+
+ if (on > off)
+ hdr->xloadflags |= XLF_MEM_ENCRYPTION;
+}
+
+/*
* The compressed kernel image (ZO), has been moved so that its position
* is against the end of the buffer used to hold the uncompressed kernel
* image (VO) and the execution environment (.bss, .brk), which makes sure
@@ -382,14 +396,16 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
size_t entry_offset;
/* Retain x86 boot parameters pointer passed from startup_32/64. */
- boot_params = rmode;
+ boot_params_ptr = rmode;
/* Clear flags intended for solely in-kernel use. */
- boot_params->hdr.loadflags &= ~KASLR_FLAG;
+ boot_params_ptr->hdr.loadflags &= ~KASLR_FLAG;
+
+ parse_mem_encrypt(&boot_params_ptr->hdr);
- sanitize_boot_params(boot_params);
+ sanitize_boot_params(boot_params_ptr);
- if (boot_params->screen_info.orig_video_mode == 7) {
+ if (boot_params_ptr->screen_info.orig_video_mode == 7) {
vidmem = (char *) 0xb0000;
vidport = 0x3b4;
} else {
@@ -397,8 +413,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
vidport = 0x3d4;
}
- lines = boot_params->screen_info.orig_video_lines;
- cols = boot_params->screen_info.orig_video_cols;
+ lines = boot_params_ptr->screen_info.orig_video_lines;
+ cols = boot_params_ptr->screen_info.orig_video_cols;
init_default_io_ops();
@@ -417,7 +433,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
* so that early debugging output from the RSDP parsing code can be
* collected.
*/
- boot_params->acpi_rsdp_addr = get_rsdp_addr();
+ boot_params_ptr->acpi_rsdp_addr = get_rsdp_addr();
debug_putstr("early console in extract_kernel\n");
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index cc70d3fb9..bc2f0f17f 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -61,7 +61,6 @@ extern memptr free_mem_ptr;
extern memptr free_mem_end_ptr;
void *malloc(int size);
void free(void *where);
-extern struct boot_params *boot_params;
void __putstr(const char *s);
void __puthex(unsigned long value);
#define error_putstr(__x) __putstr(__x)
@@ -197,6 +196,7 @@ static inline void cleanup_exception_handling(void) { }
/* IDT Entry Points */
void boot_page_fault(void);
+void boot_nmi_trap(void);
void boot_stage1_vc(void);
void boot_stage2_vc(void);
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 7939eb6e6..51f957b24 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -28,7 +28,6 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
*/
unsigned long *trampoline_32bit __section(".data");
-extern struct boot_params *boot_params;
int cmdline_find_option_bool(const char *option);
static unsigned long find_trampoline_placement(void)
@@ -49,7 +48,7 @@ static unsigned long find_trampoline_placement(void)
*
* Only look for values in the legacy ROM for non-EFI system.
*/
- signature = (char *)&boot_params->efi_info.efi_loader_signature;
+ signature = (char *)&boot_params_ptr->efi_info.efi_loader_signature;
if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) {
ebda_start = *(unsigned short *)0x40e << 4;
@@ -65,10 +64,10 @@ static unsigned long find_trampoline_placement(void)
bios_start = round_down(bios_start, PAGE_SIZE);
/* Find the first usable memory region under bios_start. */
- for (i = boot_params->e820_entries - 1; i >= 0; i--) {
+ for (i = boot_params_ptr->e820_entries - 1; i >= 0; i--) {
unsigned long new = bios_start;
- entry = &boot_params->e820_table[i];
+ entry = &boot_params_ptr->e820_table[i];
/* Skip all entries above bios_start. */
if (bios_start <= entry->addr)
@@ -107,7 +106,7 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
bool l5_required = false;
/* Initialize boot_params. Required for cmdline_find_option_bool(). */
- boot_params = bp;
+ boot_params_ptr = bp;
/*
* Check if LA57 is desired and supported.
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 80d76aea1..4c9b75281 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -25,7 +25,7 @@
#include "error.h"
#include "../msr.h"
-struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
+static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
struct ghcb *boot_ghcb;
/*
@@ -116,6 +116,9 @@ static bool fault_in_kernel_space(unsigned long address)
#undef __init
#define __init
+#undef __head
+#define __head
+
#define __BOOT_COMPRESSED
/* Basic instruction decoding support needed */
@@ -615,7 +618,7 @@ void sev_prep_identity_maps(unsigned long top_level_pgt)
* accessed after switchover.
*/
if (sev_snp_enabled()) {
- unsigned long cc_info_pa = boot_params->cc_blob_address;
+ unsigned long cc_info_pa = boot_params_ptr->cc_blob_address;
struct cc_blob_sev_info *cc_info;
kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info));
diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c
index 8841b945a..8451d6a10 100644
--- a/arch/x86/boot/compressed/tdx.c
+++ b/arch/x86/boot/compressed/tdx.c
@@ -18,7 +18,7 @@ void __tdx_hypercall_failed(void)
static inline unsigned int tdx_io_in(int size, u16 port)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
.r12 = size,
@@ -26,7 +26,7 @@ static inline unsigned int tdx_io_in(int size, u16 port)
.r14 = port,
};
- if (__tdx_hypercall_ret(&args))
+ if (__tdx_hypercall(&args))
return UINT_MAX;
return args.r11;
@@ -34,7 +34,7 @@ static inline unsigned int tdx_io_in(int size, u16 port)
static inline void tdx_io_out(int size, u16 port, u32 value)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
.r12 = size,
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index b22f34b86..083ec6d77 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -43,11 +43,13 @@ SECTIONS
*(.rodata.*)
_erodata = . ;
}
- .data : {
+ .data : ALIGN(0x1000) {
_data = . ;
*(.data)
*(.data.*)
- *(.bss.efistub)
+
+ /* Add 4 bytes of extra space for a CRC-32 checksum */
+ . = ALIGN(. + 4, 0x200);
_edata = . ;
}
. = ALIGN(L1_CACHE_BYTES);
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index b04ca8e2b..a1bbedd98 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -36,66 +36,20 @@ SYSSEG = 0x1000 /* historical load address >> 4 */
#define ROOT_RDONLY 1
#endif
+ .set salign, 0x1000
+ .set falign, 0x200
+
.code16
.section ".bstext", "ax"
-
- .global bootsect_start
-bootsect_start:
#ifdef CONFIG_EFI_STUB
# "MZ", MS-DOS header
.word MZ_MAGIC
-#endif
-
- # Normalize the start address
- ljmp $BOOTSEG, $start2
-
-start2:
- movw %cs, %ax
- movw %ax, %ds
- movw %ax, %es
- movw %ax, %ss
- xorw %sp, %sp
- sti
- cld
-
- movw $bugger_off_msg, %si
-
-msg_loop:
- lodsb
- andb %al, %al
- jz bs_die
- movb $0xe, %ah
- movw $7, %bx
- int $0x10
- jmp msg_loop
-
-bs_die:
- # Allow the user to press a key, then reboot
- xorw %ax, %ax
- int $0x16
- int $0x19
-
- # int 0x19 should never return. In case it does anyway,
- # invoke the BIOS reset code...
- ljmp $0xf000,$0xfff0
-
-#ifdef CONFIG_EFI_STUB
.org 0x38
#
# Offset to the PE header.
#
.long LINUX_PE_MAGIC
.long pe_header
-#endif /* CONFIG_EFI_STUB */
-
- .section ".bsdata", "a"
-bugger_off_msg:
- .ascii "Use a boot loader.\r\n"
- .ascii "\n"
- .ascii "Remove disk and press any key to reboot...\r\n"
- .byte 0
-
-#ifdef CONFIG_EFI_STUB
pe_header:
.long PE_MAGIC
@@ -124,30 +78,26 @@ optional_header:
.byte 0x02 # MajorLinkerVersion
.byte 0x14 # MinorLinkerVersion
- # Filled in by build.c
- .long 0 # SizeOfCode
+ .long ZO__data # SizeOfCode
- .long 0 # SizeOfInitializedData
+ .long ZO__end - ZO__data # SizeOfInitializedData
.long 0 # SizeOfUninitializedData
- # Filled in by build.c
- .long 0x0000 # AddressOfEntryPoint
+ .long setup_size + ZO_efi_pe_entry # AddressOfEntryPoint
- .long 0x0200 # BaseOfCode
+ .long setup_size # BaseOfCode
#ifdef CONFIG_X86_32
.long 0 # data
#endif
extra_header_fields:
- # PE specification requires ImageBase to be 64k aligned
- .set image_base, (LOAD_PHYSICAL_ADDR + 0xffff) & ~0xffff
#ifdef CONFIG_X86_32
- .long image_base # ImageBase
+ .long 0 # ImageBase
#else
- .quad image_base # ImageBase
+ .quad 0 # ImageBase
#endif
- .long 0x20 # SectionAlignment
- .long 0x20 # FileAlignment
+ .long salign # SectionAlignment
+ .long falign # FileAlignment
.word 0 # MajorOperatingSystemVersion
.word 0 # MinorOperatingSystemVersion
.word LINUX_EFISTUB_MAJOR_VERSION # MajorImageVersion
@@ -156,12 +106,9 @@ extra_header_fields:
.word 0 # MinorSubsystemVersion
.long 0 # Win32VersionValue
- #
- # The size of the bzImage is written in tools/build.c
- #
- .long 0 # SizeOfImage
+ .long setup_size + ZO__end # SizeOfImage
- .long 0x200 # SizeOfHeaders
+ .long salign # SizeOfHeaders
.long 0 # CheckSum
.word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application)
#ifdef CONFIG_EFI_DXE_MEM_ATTRIBUTES
@@ -192,87 +139,77 @@ extra_header_fields:
# Section table
section_table:
- #
- # The offset & size fields are filled in by build.c.
- #
.ascii ".setup"
.byte 0
.byte 0
- .long 0
- .long 0x0 # startup_{32,64}
- .long 0 # Size of initialized data
- # on disk
- .long 0x0 # startup_{32,64}
- .long 0 # PointerToRelocations
- .long 0 # PointerToLineNumbers
- .word 0 # NumberOfRelocations
- .word 0 # NumberOfLineNumbers
- .long IMAGE_SCN_CNT_CODE | \
- IMAGE_SCN_MEM_READ | \
- IMAGE_SCN_MEM_EXECUTE | \
- IMAGE_SCN_ALIGN_16BYTES # Characteristics
+ .long pecompat_fstart - salign # VirtualSize
+ .long salign # VirtualAddress
+ .long pecompat_fstart - salign # SizeOfRawData
+ .long salign # PointerToRawData
- #
- # The EFI application loader requires a relocation section
- # because EFI applications must be relocatable. The .reloc
- # offset & size fields are filled in by build.c.
- #
- .ascii ".reloc"
- .byte 0
- .byte 0
- .long 0
- .long 0
- .long 0 # SizeOfRawData
- .long 0 # PointerToRawData
- .long 0 # PointerToRelocations
- .long 0 # PointerToLineNumbers
- .word 0 # NumberOfRelocations
- .word 0 # NumberOfLineNumbers
+ .long 0, 0, 0
.long IMAGE_SCN_CNT_INITIALIZED_DATA | \
IMAGE_SCN_MEM_READ | \
- IMAGE_SCN_MEM_DISCARDABLE | \
- IMAGE_SCN_ALIGN_1BYTES # Characteristics
+ IMAGE_SCN_MEM_DISCARDABLE # Characteristics
#ifdef CONFIG_EFI_MIXED
- #
- # The offset & size fields are filled in by build.c.
- #
.asciz ".compat"
- .long 0
- .long 0x0
- .long 0 # Size of initialized data
- # on disk
- .long 0x0
- .long 0 # PointerToRelocations
- .long 0 # PointerToLineNumbers
- .word 0 # NumberOfRelocations
- .word 0 # NumberOfLineNumbers
+
+ .long pecompat_fsize # VirtualSize
+ .long pecompat_fstart # VirtualAddress
+ .long pecompat_fsize # SizeOfRawData
+ .long pecompat_fstart # PointerToRawData
+
+ .long 0, 0, 0
.long IMAGE_SCN_CNT_INITIALIZED_DATA | \
IMAGE_SCN_MEM_READ | \
- IMAGE_SCN_MEM_DISCARDABLE | \
- IMAGE_SCN_ALIGN_1BYTES # Characteristics
+ IMAGE_SCN_MEM_DISCARDABLE # Characteristics
+
+ /*
+ * Put the IA-32 machine type and the associated entry point address in
+ * the .compat section, so loaders can figure out which other execution
+ * modes this image supports.
+ */
+ .pushsection ".pecompat", "a", @progbits
+ .balign salign
+ .globl pecompat_fstart
+pecompat_fstart:
+ .byte 0x1 # Version
+ .byte 8 # Size
+ .word IMAGE_FILE_MACHINE_I386 # PE machine type
+ .long setup_size + ZO_efi32_pe_entry # Entrypoint
+ .byte 0x0 # Sentinel
+ .popsection
+#else
+ .set pecompat_fstart, setup_size
#endif
-
- #
- # The offset & size fields are filled in by build.c.
- #
.ascii ".text"
.byte 0
.byte 0
.byte 0
- .long 0
- .long 0x0 # startup_{32,64}
- .long 0 # Size of initialized data
+ .long ZO__data
+ .long setup_size
+ .long ZO__data # Size of initialized data
# on disk
- .long 0x0 # startup_{32,64}
+ .long setup_size
.long 0 # PointerToRelocations
.long 0 # PointerToLineNumbers
.word 0 # NumberOfRelocations
.word 0 # NumberOfLineNumbers
.long IMAGE_SCN_CNT_CODE | \
IMAGE_SCN_MEM_READ | \
- IMAGE_SCN_MEM_EXECUTE | \
- IMAGE_SCN_ALIGN_16BYTES # Characteristics
+ IMAGE_SCN_MEM_EXECUTE # Characteristics
+
+ .ascii ".data\0\0\0"
+ .long ZO__end - ZO__data # VirtualSize
+ .long setup_size + ZO__data # VirtualAddress
+ .long ZO__edata - ZO__data # SizeOfRawData
+ .long setup_size + ZO__data # PointerToRawData
+
+ .long 0, 0, 0
+ .long IMAGE_SCN_CNT_INITIALIZED_DATA | \
+ IMAGE_SCN_MEM_READ | \
+ IMAGE_SCN_MEM_WRITE # Characteristics
.set section_count, (. - section_table) / 40
#endif /* CONFIG_EFI_STUB */
@@ -286,12 +223,12 @@ sentinel: .byte 0xff, 0xff /* Used to detect broken loaders */
.globl hdr
hdr:
-setup_sects: .byte 0 /* Filled in by build.c */
+ .byte setup_sects - 1
root_flags: .word ROOT_RDONLY
-syssize: .long 0 /* Filled in by build.c */
+syssize: .long ZO__edata / 16
ram_size: .word 0 /* Obsolete */
vid_mode: .word SVGA_MODE
-root_dev: .word 0 /* Filled in by build.c */
+root_dev: .word 0 /* Default to major/minor 0/0 */
boot_flag: .word 0xAA55
# offset 512, entry point
@@ -579,9 +516,25 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
# define INIT_SIZE VO_INIT_SIZE
#endif
+ .macro __handover_offset
+#ifndef CONFIG_EFI_HANDOVER_PROTOCOL
+ .long 0
+#elif !defined(CONFIG_X86_64)
+ .long ZO_efi32_stub_entry
+#else
+ /* Yes, this is really how we defined it :( */
+ .long ZO_efi64_stub_entry - 0x200
+#ifdef CONFIG_EFI_MIXED
+ .if ZO_efi32_stub_entry != ZO_efi64_stub_entry - 0x200
+ .error "32-bit and 64-bit EFI entry points do not match"
+ .endif
+#endif
+#endif
+ .endm
+
init_size: .long INIT_SIZE # kernel initialization size
-handover_offset: .long 0 # Filled in by build.c
-kernel_info_offset: .long 0 # Filled in by build.c
+handover_offset: __handover_offset
+kernel_info_offset: .long ZO_kernel_info
# End of setup header #####################################################
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 40031a614..5941f930f 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -11,6 +11,7 @@
*/
#include "boot.h"
+#include <asm/desc_defs.h>
#include <asm/segment.h>
/*
@@ -67,13 +68,13 @@ static void setup_gdt(void)
being 8-byte unaligned. Intel recommends 16 byte alignment. */
static const u64 boot_gdt[] __attribute__((aligned(16))) = {
/* CS: code, read/execute, 4 GB, base 0 */
- [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
+ [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32, 0, 0xfffff),
/* DS: data, read/write, 4 GB, base 0 */
- [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
+ [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(DESC_DATA32, 0, 0xfffff),
/* TSS: 32-bit tss, 104 bytes, base 4096 */
/* We only have a TSS here to keep Intel VT happy;
we don't actually use it for anything. */
- [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103),
+ [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(DESC_TSS32, 4096, 103),
};
/* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead
of the gdt_ptr contents. Thus, make it static so it will
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index 49546c247..3a2d1360a 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -10,10 +10,11 @@ ENTRY(_start)
SECTIONS
{
. = 0;
- .bstext : { *(.bstext) }
- .bsdata : { *(.bsdata) }
+ .bstext : {
+ *(.bstext)
+ . = 495;
+ } =0xffffffff
- . = 495;
.header : { *(.header) }
.entrytext : { *(.entrytext) }
.inittext : { *(.inittext) }
@@ -23,6 +24,9 @@ SECTIONS
.text : { *(.text .text.*) }
.text32 : { *(.text32) }
+ .pecompat : { *(.pecompat) }
+ PROVIDE(pecompat_fsize = setup_size - pecompat_fstart);
+
. = ALIGN(16);
.rodata : { *(.rodata*) }
@@ -38,8 +42,10 @@ SECTIONS
.signature : {
setup_sig = .;
LONG(0x5a5aaa55)
- }
+ setup_size = ALIGN(ABSOLUTE(.), 4096);
+ setup_sects = ABSOLUTE(setup_size / 512);
+ }
. = ALIGN(16);
.bss :
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 1c8541ae3..c23f3b9c8 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -49,7 +49,7 @@ int strcmp(const char *str1, const char *str2)
{
const unsigned char *s1 = (const unsigned char *)str1;
const unsigned char *s2 = (const unsigned char *)str2;
- int delta = 0;
+ int delta;
while (*s1 || *s2) {
delta = *s1 - *s2;
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index bd247692b..10311d77c 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -40,10 +40,6 @@ typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;
-#define DEFAULT_MAJOR_ROOT 0
-#define DEFAULT_MINOR_ROOT 0
-#define DEFAULT_ROOT_DEV (DEFAULT_MAJOR_ROOT << 8 | DEFAULT_MINOR_ROOT)
-
/* Minimal number of setup sectors */
#define SETUP_SECT_MIN 5
#define SETUP_SECT_MAX 64
@@ -51,22 +47,7 @@ typedef unsigned int u32;
/* This must be large enough to hold the entire setup */
u8 buf[SETUP_SECT_MAX*512];
-#define PECOFF_RELOC_RESERVE 0x20
-
-#ifdef CONFIG_EFI_MIXED
-#define PECOFF_COMPAT_RESERVE 0x20
-#else
-#define PECOFF_COMPAT_RESERVE 0x0
-#endif
-
-static unsigned long efi32_stub_entry;
-static unsigned long efi64_stub_entry;
-static unsigned long efi_pe_entry;
-static unsigned long efi32_pe_entry;
-static unsigned long kernel_info;
-static unsigned long startup_64;
-static unsigned long _ehead;
-static unsigned long _end;
+static unsigned long _edata;
/*----------------------------------------------------------------------*/
@@ -152,180 +133,6 @@ static void usage(void)
die("Usage: build setup system zoffset.h image");
}
-#ifdef CONFIG_EFI_STUB
-
-static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset)
-{
- unsigned int pe_header;
- unsigned short num_sections;
- u8 *section;
-
- pe_header = get_unaligned_le32(&buf[0x3c]);
- num_sections = get_unaligned_le16(&buf[pe_header + 6]);
-
-#ifdef CONFIG_X86_32
- section = &buf[pe_header + 0xa8];
-#else
- section = &buf[pe_header + 0xb8];
-#endif
-
- while (num_sections > 0) {
- if (strncmp((char*)section, section_name, 8) == 0) {
- /* section header size field */
- put_unaligned_le32(size, section + 0x8);
-
- /* section header vma field */
- put_unaligned_le32(vma, section + 0xc);
-
- /* section header 'size of initialised data' field */
- put_unaligned_le32(datasz, section + 0x10);
-
- /* section header 'file offset' field */
- put_unaligned_le32(offset, section + 0x14);
-
- break;
- }
- section += 0x28;
- num_sections--;
- }
-}
-
-static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
-{
- update_pecoff_section_header_fields(section_name, offset, size, size, offset);
-}
-
-static void update_pecoff_setup_and_reloc(unsigned int size)
-{
- u32 setup_offset = 0x200;
- u32 reloc_offset = size - PECOFF_RELOC_RESERVE - PECOFF_COMPAT_RESERVE;
-#ifdef CONFIG_EFI_MIXED
- u32 compat_offset = reloc_offset + PECOFF_RELOC_RESERVE;
-#endif
- u32 setup_size = reloc_offset - setup_offset;
-
- update_pecoff_section_header(".setup", setup_offset, setup_size);
- update_pecoff_section_header(".reloc", reloc_offset, PECOFF_RELOC_RESERVE);
-
- /*
- * Modify .reloc section contents with a single entry. The
- * relocation is applied to offset 10 of the relocation section.
- */
- put_unaligned_le32(reloc_offset + 10, &buf[reloc_offset]);
- put_unaligned_le32(10, &buf[reloc_offset + 4]);
-
-#ifdef CONFIG_EFI_MIXED
- update_pecoff_section_header(".compat", compat_offset, PECOFF_COMPAT_RESERVE);
-
- /*
- * Put the IA-32 machine type (0x14c) and the associated entry point
- * address in the .compat section, so loaders can figure out which other
- * execution modes this image supports.
- */
- buf[compat_offset] = 0x1;
- buf[compat_offset + 1] = 0x8;
- put_unaligned_le16(0x14c, &buf[compat_offset + 2]);
- put_unaligned_le32(efi32_pe_entry + size, &buf[compat_offset + 4]);
-#endif
-}
-
-static void update_pecoff_text(unsigned int text_start, unsigned int file_sz,
- unsigned int init_sz)
-{
- unsigned int pe_header;
- unsigned int text_sz = file_sz - text_start;
- unsigned int bss_sz = init_sz - file_sz;
-
- pe_header = get_unaligned_le32(&buf[0x3c]);
-
- /*
- * The PE/COFF loader may load the image at an address which is
- * misaligned with respect to the kernel_alignment field in the setup
- * header.
- *
- * In order to avoid relocating the kernel to correct the misalignment,
- * add slack to allow the buffer to be aligned within the declared size
- * of the image.
- */
- bss_sz += CONFIG_PHYSICAL_ALIGN;
- init_sz += CONFIG_PHYSICAL_ALIGN;
-
- /*
- * Size of code: Subtract the size of the first sector (512 bytes)
- * which includes the header.
- */
- put_unaligned_le32(file_sz - 512 + bss_sz, &buf[pe_header + 0x1c]);
-
- /* Size of image */
- put_unaligned_le32(init_sz, &buf[pe_header + 0x50]);
-
- /*
- * Address of entry point for PE/COFF executable
- */
- put_unaligned_le32(text_start + efi_pe_entry, &buf[pe_header + 0x28]);
-
- update_pecoff_section_header_fields(".text", text_start, text_sz + bss_sz,
- text_sz, text_start);
-}
-
-static int reserve_pecoff_reloc_section(int c)
-{
- /* Reserve 0x20 bytes for .reloc section */
- memset(buf+c, 0, PECOFF_RELOC_RESERVE);
- return PECOFF_RELOC_RESERVE;
-}
-
-static void efi_stub_defaults(void)
-{
- /* Defaults for old kernel */
-#ifdef CONFIG_X86_32
- efi_pe_entry = 0x10;
-#else
- efi_pe_entry = 0x210;
- startup_64 = 0x200;
-#endif
-}
-
-static void efi_stub_entry_update(void)
-{
- unsigned long addr = efi32_stub_entry;
-
-#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
-#ifdef CONFIG_X86_64
- /* Yes, this is really how we defined it :( */
- addr = efi64_stub_entry - 0x200;
-#endif
-
-#ifdef CONFIG_EFI_MIXED
- if (efi32_stub_entry != addr)
- die("32-bit and 64-bit EFI entry points do not match\n");
-#endif
-#endif
- put_unaligned_le32(addr, &buf[0x264]);
-}
-
-#else
-
-static inline void update_pecoff_setup_and_reloc(unsigned int size) {}
-static inline void update_pecoff_text(unsigned int text_start,
- unsigned int file_sz,
- unsigned int init_sz) {}
-static inline void efi_stub_defaults(void) {}
-static inline void efi_stub_entry_update(void) {}
-
-static inline int reserve_pecoff_reloc_section(int c)
-{
- return 0;
-}
-#endif /* CONFIG_EFI_STUB */
-
-static int reserve_pecoff_compat_section(int c)
-{
- /* Reserve 0x20 bytes for .compat section */
- memset(buf+c, 0, PECOFF_COMPAT_RESERVE);
- return PECOFF_COMPAT_RESERVE;
-}
-
/*
* Parse zoffset.h and find the entry points. We could just #include zoffset.h
* but that would mean tools/build would have to be rebuilt every time. It's
@@ -354,14 +161,7 @@ static void parse_zoffset(char *fname)
p = (char *)buf;
while (p && *p) {
- PARSE_ZOFS(p, efi32_stub_entry);
- PARSE_ZOFS(p, efi64_stub_entry);
- PARSE_ZOFS(p, efi_pe_entry);
- PARSE_ZOFS(p, efi32_pe_entry);
- PARSE_ZOFS(p, kernel_info);
- PARSE_ZOFS(p, startup_64);
- PARSE_ZOFS(p, _ehead);
- PARSE_ZOFS(p, _end);
+ PARSE_ZOFS(p, _edata);
p = strchr(p, '\n');
while (p && (*p == '\r' || *p == '\n'))
@@ -371,17 +171,14 @@ static void parse_zoffset(char *fname)
int main(int argc, char ** argv)
{
- unsigned int i, sz, setup_sectors, init_sz;
+ unsigned int i, sz, setup_sectors;
int c;
- u32 sys_size;
struct stat sb;
FILE *file, *dest;
int fd;
void *kernel;
u32 crc = 0xffffffffUL;
- efi_stub_defaults();
-
if (argc != 5)
usage();
parse_zoffset(argv[3]);
@@ -403,72 +200,27 @@ int main(int argc, char ** argv)
die("Boot block hasn't got boot flag (0xAA55)");
fclose(file);
- c += reserve_pecoff_compat_section(c);
- c += reserve_pecoff_reloc_section(c);
-
/* Pad unused space with zeros */
- setup_sectors = (c + 511) / 512;
+ setup_sectors = (c + 4095) / 4096;
+ setup_sectors *= 8;
if (setup_sectors < SETUP_SECT_MIN)
setup_sectors = SETUP_SECT_MIN;
i = setup_sectors*512;
memset(buf+c, 0, i-c);
- update_pecoff_setup_and_reloc(i);
-
- /* Set the default root device */
- put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]);
-
/* Open and stat the kernel file */
fd = open(argv[2], O_RDONLY);
if (fd < 0)
die("Unable to open `%s': %m", argv[2]);
if (fstat(fd, &sb))
die("Unable to stat `%s': %m", argv[2]);
- sz = sb.st_size;
+ if (_edata != sb.st_size)
+ die("Unexpected file size `%s': %u != %u", argv[2], _edata,
+ sb.st_size);
+ sz = _edata - 4;
kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
if (kernel == MAP_FAILED)
die("Unable to mmap '%s': %m", argv[2]);
- /* Number of 16-byte paragraphs, including space for a 4-byte CRC */
- sys_size = (sz + 15 + 4) / 16;
-#ifdef CONFIG_EFI_STUB
- /*
- * COFF requires minimum 32-byte alignment of sections, and
- * adding a signature is problematic without that alignment.
- */
- sys_size = (sys_size + 1) & ~1;
-#endif
-
- /* Patch the setup code with the appropriate size parameters */
- buf[0x1f1] = setup_sectors-1;
- put_unaligned_le32(sys_size, &buf[0x1f4]);
-
- init_sz = get_unaligned_le32(&buf[0x260]);
-#ifdef CONFIG_EFI_STUB
- /*
- * The decompression buffer will start at ImageBase. When relocating
- * the compressed kernel to its end, we must ensure that the head
- * section does not get overwritten. The head section occupies
- * [i, i + _ehead), and the destination is [init_sz - _end, init_sz).
- *
- * At present these should never overlap, because 'i' is at most 32k
- * because of SETUP_SECT_MAX, '_ehead' is less than 1k, and the
- * calculation of INIT_SIZE in boot/header.S ensures that
- * 'init_sz - _end' is at least 64k.
- *
- * For future-proofing, increase init_sz if necessary.
- */
-
- if (init_sz - _end < i + _ehead) {
- init_sz = (i + _ehead + _end + 4095) & ~4095;
- put_unaligned_le32(init_sz, &buf[0x260]);
- }
-#endif
- update_pecoff_text(setup_sectors * 512, i + (sys_size * 16), init_sz);
-
- efi_stub_entry_update();
-
- /* Update kernel_info offset. */
- put_unaligned_le32(kernel_info, &buf[0x268]);
crc = partial_crc32(buf, i, crc);
if (fwrite(buf, 1, i, dest) != i)
@@ -479,13 +231,6 @@ int main(int argc, char ** argv)
if (fwrite(kernel, 1, sz, dest) != sz)
die("Writing kernel failed");
- /* Add padding leaving 4 bytes for the checksum */
- while (sz++ < (sys_size*16) - 4) {
- crc = partial_crc32_one('\0', crc);
- if (fwrite("\0", 1, 1, dest) != 1)
- die("Writing padding failed");
- }
-
/* Write the CRC */
put_unaligned_le32(crc, buf);
if (fwrite(buf, 1, 4, dest) != 4)
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index eeec99865..ddd4efdc7 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -3,18 +3,22 @@
* Confidential Computing Platform Capability checks
*
* Copyright (C) 2021 Advanced Micro Devices, Inc.
+ * Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*
* Author: Tom Lendacky <thomas.lendacky@amd.com>
*/
#include <linux/export.h>
#include <linux/cc_platform.h>
+#include <linux/string.h>
+#include <linux/random.h>
+#include <asm/archrandom.h>
#include <asm/coco.h>
#include <asm/processor.h>
enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE;
-static u64 cc_mask __ro_after_init;
+u64 cc_mask __ro_after_init;
static bool noinstr intel_cc_platform_has(enum cc_attr attr)
{
@@ -149,7 +153,39 @@ u64 cc_mkdec(u64 val)
}
EXPORT_SYMBOL_GPL(cc_mkdec);
-__init void cc_set_mask(u64 mask)
+__init void cc_random_init(void)
{
- cc_mask = mask;
+ /*
+ * The seed is 32 bytes (in units of longs), which is 256 bits, which
+ * is the security level that the RNG is targeting.
+ */
+ unsigned long rng_seed[32 / sizeof(long)];
+ size_t i, longs;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ return;
+
+ /*
+ * Since the CoCo threat model includes the host, the only reliable
+ * source of entropy that can be neither observed nor manipulated is
+ * RDRAND. Usually, RDRAND failure is considered tolerable, but since
+ * CoCo guests have no other unobservable source of entropy, it's
+ * important to at least ensure the RNG gets some initial random seeds.
+ */
+ for (i = 0; i < ARRAY_SIZE(rng_seed); i += longs) {
+ longs = arch_get_random_longs(&rng_seed[i], ARRAY_SIZE(rng_seed) - i);
+
+ /*
+ * A zero return value means that the guest doesn't have RDRAND
+ * or the CPU is physically broken, and in both cases that
+ * means most crypto inside of the CoCo instance will be
+ * broken, defeating the purpose of CoCo in the first place. So
+ * just panic here because it's absolutely unsafe to continue
+ * executing.
+ */
+ if (longs == 0)
+ panic("RDRAND is defective.");
+ }
+ add_device_randomness(rng_seed, sizeof(rng_seed));
+ memzero_explicit(rng_seed, sizeof(rng_seed));
}
diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S
index 2eca5f437..52d9786da 100644
--- a/arch/x86/coco/tdx/tdcall.S
+++ b/arch/x86/coco/tdx/tdcall.S
@@ -1,240 +1,63 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/asm-offsets.h>
#include <asm/asm.h>
-#include <asm/frame.h>
-#include <asm/unwind_hints.h>
#include <linux/linkage.h>
-#include <linux/bits.h>
#include <linux/errno.h>
#include "../../virt/vmx/tdx/tdxcall.S"
-/*
- * Bitmasks of exposed registers (with VMM).
- */
-#define TDX_RDX BIT(2)
-#define TDX_RBX BIT(3)
-#define TDX_RSI BIT(6)
-#define TDX_RDI BIT(7)
-#define TDX_R8 BIT(8)
-#define TDX_R9 BIT(9)
-#define TDX_R10 BIT(10)
-#define TDX_R11 BIT(11)
-#define TDX_R12 BIT(12)
-#define TDX_R13 BIT(13)
-#define TDX_R14 BIT(14)
-#define TDX_R15 BIT(15)
-
-/*
- * These registers are clobbered to hold arguments for each
- * TDVMCALL. They are safe to expose to the VMM.
- * Each bit in this mask represents a register ID. Bit field
- * details can be found in TDX GHCI specification, section
- * titled "TDCALL [TDG.VP.VMCALL] leaf".
- */
-#define TDVMCALL_EXPOSE_REGS_MASK \
- ( TDX_RDX | TDX_RBX | TDX_RSI | TDX_RDI | TDX_R8 | TDX_R9 | \
- TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15 )
-
.section .noinstr.text, "ax"
/*
- * __tdx_module_call() - Used by TDX guests to request services from
- * the TDX module (does not include VMM services) using TDCALL instruction.
- *
- * Transforms function call register arguments into the TDCALL register ABI.
- * After TDCALL operation, TDX module output is saved in @out (if it is
- * provided by the user).
- *
- *-------------------------------------------------------------------------
- * TDCALL ABI:
- *-------------------------------------------------------------------------
- * Input Registers:
- *
- * RAX - TDCALL Leaf number.
- * RCX,RDX,R8-R9 - TDCALL Leaf specific input registers.
- *
- * Output Registers:
+ * __tdcall() - Used by TDX guests to request services from the TDX
+ * module (does not include VMM services) using TDCALL instruction.
*
- * RAX - TDCALL instruction error code.
- * RCX,RDX,R8-R11 - TDCALL Leaf specific output registers.
+ * __tdcall() function ABI:
*
- *-------------------------------------------------------------------------
+ * @fn (RDI) - TDCALL Leaf ID, moved to RAX
+ * @args (RSI) - struct tdx_module_args for input
*
- * __tdx_module_call() function ABI:
- *
- * @fn (RDI) - TDCALL Leaf ID, moved to RAX
- * @rcx (RSI) - Input parameter 1, moved to RCX
- * @rdx (RDX) - Input parameter 2, moved to RDX
- * @r8 (RCX) - Input parameter 3, moved to R8
- * @r9 (R8) - Input parameter 4, moved to R9
- *
- * @out (R9) - struct tdx_module_output pointer
- * stored temporarily in R12 (not
- * shared with the TDX module). It
- * can be NULL.
+ * Only RCX/RDX/R8-R11 are used as input registers.
*
* Return status of TDCALL via RAX.
*/
-SYM_FUNC_START(__tdx_module_call)
- FRAME_BEGIN
+SYM_FUNC_START(__tdcall)
TDX_MODULE_CALL host=0
- FRAME_END
- RET
-SYM_FUNC_END(__tdx_module_call)
+SYM_FUNC_END(__tdcall)
/*
- * TDX_HYPERCALL - Make hypercalls to a TDX VMM using TDVMCALL leaf of TDCALL
- * instruction
- *
- * Transforms values in function call argument struct tdx_hypercall_args @args
- * into the TDCALL register ABI. After TDCALL operation, VMM output is saved
- * back in @args, if \ret is 1.
- *
- *-------------------------------------------------------------------------
- * TD VMCALL ABI:
- *-------------------------------------------------------------------------
- *
- * Input Registers:
+ * __tdcall_ret() - Used by TDX guests to request services from the TDX
+ * module (does not include VMM services) using TDCALL instruction, with
+ * saving output registers to the 'struct tdx_module_args' used as input.
*
- * RAX - TDCALL instruction leaf number (0 - TDG.VP.VMCALL)
- * RCX - BITMAP which controls which part of TD Guest GPR
- * is passed as-is to the VMM and back.
- * R10 - Set 0 to indicate TDCALL follows standard TDX ABI
- * specification. Non zero value indicates vendor
- * specific ABI.
- * R11 - VMCALL sub function number
- * RBX, RDX, RDI, RSI - Used to pass VMCALL sub function specific arguments.
- * R8-R9, R12-R15 - Same as above.
+ * __tdcall_ret() function ABI:
*
- * Output Registers:
+ * @fn (RDI) - TDCALL Leaf ID, moved to RAX
+ * @args (RSI) - struct tdx_module_args for input and output
*
- * RAX - TDCALL instruction status (Not related to hypercall
- * output).
- * RBX, RDX, RDI, RSI - Hypercall sub function specific output values.
- * R8-R15 - Same as above.
+ * Only RCX/RDX/R8-R11 are used as input/output registers.
*
+ * Return status of TDCALL via RAX.
*/
-.macro TDX_HYPERCALL ret:req
- FRAME_BEGIN
-
- /* Save callee-saved GPRs as mandated by the x86_64 ABI */
- push %r15
- push %r14
- push %r13
- push %r12
- push %rbx
-
- /* Free RDI to be used as TDVMCALL arguments */
- movq %rdi, %rax
-
- /* Copy hypercall registers from arg struct: */
- movq TDX_HYPERCALL_r8(%rax), %r8
- movq TDX_HYPERCALL_r9(%rax), %r9
- movq TDX_HYPERCALL_r10(%rax), %r10
- movq TDX_HYPERCALL_r11(%rax), %r11
- movq TDX_HYPERCALL_r12(%rax), %r12
- movq TDX_HYPERCALL_r13(%rax), %r13
- movq TDX_HYPERCALL_r14(%rax), %r14
- movq TDX_HYPERCALL_r15(%rax), %r15
- movq TDX_HYPERCALL_rdi(%rax), %rdi
- movq TDX_HYPERCALL_rsi(%rax), %rsi
- movq TDX_HYPERCALL_rbx(%rax), %rbx
- movq TDX_HYPERCALL_rdx(%rax), %rdx
-
- push %rax
-
- /* Mangle function call ABI into TDCALL ABI: */
- /* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */
- xor %eax, %eax
-
- movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx
-
- tdcall
-
- /*
- * RAX!=0 indicates a failure of the TDVMCALL mechanism itself and that
- * something has gone horribly wrong with the TDX module.
- *
- * The return status of the hypercall operation is in a separate
- * register (in R10). Hypercall errors are a part of normal operation
- * and are handled by callers.
- */
- testq %rax, %rax
- jne .Lpanic\@
-
- pop %rax
-
- .if \ret
- movq %r8, TDX_HYPERCALL_r8(%rax)
- movq %r9, TDX_HYPERCALL_r9(%rax)
- movq %r10, TDX_HYPERCALL_r10(%rax)
- movq %r11, TDX_HYPERCALL_r11(%rax)
- movq %r12, TDX_HYPERCALL_r12(%rax)
- movq %r13, TDX_HYPERCALL_r13(%rax)
- movq %r14, TDX_HYPERCALL_r14(%rax)
- movq %r15, TDX_HYPERCALL_r15(%rax)
- movq %rdi, TDX_HYPERCALL_rdi(%rax)
- movq %rsi, TDX_HYPERCALL_rsi(%rax)
- movq %rbx, TDX_HYPERCALL_rbx(%rax)
- movq %rdx, TDX_HYPERCALL_rdx(%rax)
- .endif
-
- /* TDVMCALL leaf return code is in R10 */
- movq %r10, %rax
-
- /*
- * Zero out registers exposed to the VMM to avoid speculative execution
- * with VMM-controlled values. This needs to include all registers
- * present in TDVMCALL_EXPOSE_REGS_MASK, except RBX, and R12-R15 which
- * will be restored.
- */
- xor %r8d, %r8d
- xor %r9d, %r9d
- xor %r10d, %r10d
- xor %r11d, %r11d
- xor %rdi, %rdi
- xor %rsi, %rsi
- xor %rdx, %rdx
-
- /* Restore callee-saved GPRs as mandated by the x86_64 ABI */
- pop %rbx
- pop %r12
- pop %r13
- pop %r14
- pop %r15
-
- FRAME_END
-
- RET
-.Lpanic\@:
- call __tdx_hypercall_failed
- /* __tdx_hypercall_failed never returns */
- REACHABLE
- jmp .Lpanic\@
-.endm
+SYM_FUNC_START(__tdcall_ret)
+ TDX_MODULE_CALL host=0 ret=1
+SYM_FUNC_END(__tdcall_ret)
/*
+ * __tdcall_saved_ret() - Used by TDX guests to request services from the
+ * TDX module (including VMM services) using TDCALL instruction, with
+ * saving output registers to the 'struct tdx_module_args' used as input.
*
- * __tdx_hypercall() function ABI:
- *
- * @args (RDI) - struct tdx_hypercall_args for input
- *
- * On successful completion, return the hypercall error code.
- */
-SYM_FUNC_START(__tdx_hypercall)
- TDX_HYPERCALL ret=0
-SYM_FUNC_END(__tdx_hypercall)
-
-/*
+ * __tdcall_saved_ret() function ABI:
*
- * __tdx_hypercall_ret() function ABI:
+ * @fn (RDI) - TDCALL leaf ID, moved to RAX
+ * @args (RSI) - struct tdx_module_args for input/output
*
- * @args (RDI) - struct tdx_hypercall_args for input and output
+ * All registers in @args are used as input/output registers.
*
* On successful completion, return the hypercall error code.
*/
-SYM_FUNC_START(__tdx_hypercall_ret)
- TDX_HYPERCALL ret=1
-SYM_FUNC_END(__tdx_hypercall_ret)
+SYM_FUNC_START(__tdcall_saved_ret)
+ TDX_MODULE_CALL host=0 ret=1 saved=1
+SYM_FUNC_END(__tdcall_saved_ret)
diff --git a/arch/x86/coco/tdx/tdx-shared.c b/arch/x86/coco/tdx/tdx-shared.c
index ef20ddc37..1655aa56a 100644
--- a/arch/x86/coco/tdx/tdx-shared.c
+++ b/arch/x86/coco/tdx/tdx-shared.c
@@ -5,7 +5,7 @@ static unsigned long try_accept_one(phys_addr_t start, unsigned long len,
enum pg_level pg_level)
{
unsigned long accept_size = page_level_size(pg_level);
- u64 tdcall_rcx;
+ struct tdx_module_args args = {};
u8 page_size;
if (!IS_ALIGNED(start, accept_size))
@@ -22,20 +22,20 @@ static unsigned long try_accept_one(phys_addr_t start, unsigned long len,
*/
switch (pg_level) {
case PG_LEVEL_4K:
- page_size = 0;
+ page_size = TDX_PS_4K;
break;
case PG_LEVEL_2M:
- page_size = 1;
+ page_size = TDX_PS_2M;
break;
case PG_LEVEL_1G:
- page_size = 2;
+ page_size = TDX_PS_1G;
break;
default:
return 0;
}
- tdcall_rcx = start | page_size;
- if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL))
+ args.rcx = start | page_size;
+ if (__tdcall(TDG_MEM_PAGE_ACCEPT, &args))
return 0;
return accept_size;
@@ -45,7 +45,7 @@ bool tdx_accept_memory(phys_addr_t start, phys_addr_t end)
{
/*
* For shared->private conversion, accept the page using
- * TDX_ACCEPT_PAGE TDX module call.
+ * TDG_MEM_PAGE_ACCEPT TDX module call.
*/
while (start < end) {
unsigned long len = end - start;
@@ -69,3 +69,23 @@ bool tdx_accept_memory(phys_addr_t start, phys_addr_t end)
return true;
}
+
+noinstr u64 __tdx_hypercall(struct tdx_module_args *args)
+{
+ /*
+ * For TDVMCALL explicitly set RCX to the bitmap of shared registers.
+ * The caller isn't expected to set @args->rcx anyway.
+ */
+ args->rcx = TDVMCALL_EXPOSE_REGS_MASK;
+
+ /*
+ * Failure of __tdcall_saved_ret() indicates a failure of the TDVMCALL
+ * mechanism itself and that something has gone horribly wrong with
+ * the TDX module. __tdx_hypercall_failed() never returns.
+ */
+ if (__tdcall_saved_ret(TDG_VP_VMCALL, args))
+ __tdx_hypercall_failed();
+
+ /* TDVMCALL leaf return code is in R10 */
+ return args->r10;
+}
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index f3c75809f..c1cb90369 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -39,7 +39,7 @@
#define TDREPORT_SUBTYPE_0 0
/* Called from __tdx_hypercall() for unrecoverable failure */
-noinstr void __tdx_hypercall_failed(void)
+noinstr void __noreturn __tdx_hypercall_failed(void)
{
instrumentation_begin();
panic("TDVMCALL failed. TDX module bug?");
@@ -49,7 +49,7 @@ noinstr void __tdx_hypercall_failed(void)
long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
unsigned long p3, unsigned long p4)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = nr,
.r11 = p1,
.r12 = p2,
@@ -67,10 +67,9 @@ EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
* should only be used for calls that have no legitimate reason to fail
* or where the kernel can not survive the call failing.
*/
-static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
- struct tdx_module_output *out)
+static inline void tdcall(u64 fn, struct tdx_module_args *args)
{
- if (__tdx_module_call(fn, rcx, rdx, r8, r9, out))
+ if (__tdcall_ret(fn, args))
panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}
@@ -90,11 +89,14 @@ static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
*/
int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
{
+ struct tdx_module_args args = {
+ .rcx = virt_to_phys(tdreport),
+ .rdx = virt_to_phys(reportdata),
+ .r8 = TDREPORT_SUBTYPE_0,
+ };
u64 ret;
- ret = __tdx_module_call(TDX_GET_REPORT, virt_to_phys(tdreport),
- virt_to_phys(reportdata), TDREPORT_SUBTYPE_0,
- 0, NULL);
+ ret = __tdcall(TDG_MR_REPORT, &args);
if (ret) {
if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
return -EINVAL;
@@ -105,9 +107,30 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
}
EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
+/**
+ * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
+ * hypercall.
+ * @buf: Address of the directly mapped shared kernel buffer which
+ * contains TDREPORT. The same buffer will be used by VMM to
+ * store the generated TD Quote output.
+ * @size: size of the tdquote buffer (4KB-aligned).
+ *
+ * Refer to section titled "TDG.VP.VMCALL<GetQuote>" in the TDX GHCI
+ * v1.0 specification for more information on GetQuote hypercall.
+ * It is used in the TDX guest driver module to get the TD Quote.
+ *
+ * Return 0 on success or error code on failure.
+ */
+u64 tdx_hcall_get_quote(u8 *buf, size_t size)
+{
+ /* Since buf is a shared memory, set the shared (decrypted) bits */
+ return _tdx_hypercall(TDVMCALL_GET_QUOTE, cc_mkdec(virt_to_phys(buf)), size, 0, 0);
+}
+EXPORT_SYMBOL_GPL(tdx_hcall_get_quote);
+
static void __noreturn tdx_panic(const char *msg)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = TDVMCALL_REPORT_FATAL_ERROR,
.r12 = 0, /* Error code: 0 is Panic */
@@ -120,7 +143,7 @@ static void __noreturn tdx_panic(const char *msg)
} message;
/* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
- strncpy(message.str, msg, 64);
+ strtomem_pad(message.str, msg, '\0');
args.r8 = message.r8;
args.r9 = message.r9;
@@ -142,7 +165,7 @@ static void __noreturn tdx_panic(const char *msg)
static void tdx_parse_tdinfo(u64 *cc_mask)
{
- struct tdx_module_output out;
+ struct tdx_module_args args = {};
unsigned int gpa_width;
u64 td_attr;
@@ -153,7 +176,7 @@ static void tdx_parse_tdinfo(u64 *cc_mask)
* Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
* [TDG.VP.INFO].
*/
- tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);
+ tdcall(TDG_VP_INFO, &args);
/*
* The highest bit of a guest physical address is the "sharing" bit.
@@ -162,7 +185,7 @@ static void tdx_parse_tdinfo(u64 *cc_mask)
* The GPA width that comes out of this call is critical. TDX guests
* can not meaningfully run without it.
*/
- gpa_width = out.rcx & GENMASK(5, 0);
+ gpa_width = args.rcx & GENMASK(5, 0);
*cc_mask = BIT_ULL(gpa_width - 1);
/*
@@ -170,7 +193,7 @@ static void tdx_parse_tdinfo(u64 *cc_mask)
* memory. Ensure that no #VE will be delivered for accesses to
* TD-private memory. Only VMM-shared memory (MMIO) will #VE.
*/
- td_attr = out.rdx;
+ td_attr = args.rdx;
if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
@@ -229,7 +252,7 @@ static int ve_instr_len(struct ve_info *ve)
static u64 __cpuidle __halt(const bool irq_disabled)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_HLT),
.r12 = irq_disabled,
@@ -273,7 +296,7 @@ void __cpuidle tdx_safe_halt(void)
static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_MSR_READ),
.r12 = regs->cx,
@@ -284,7 +307,7 @@ static int read_msr(struct pt_regs *regs, struct ve_info *ve)
* can be found in TDX Guest-Host-Communication Interface
* (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
*/
- if (__tdx_hypercall_ret(&args))
+ if (__tdx_hypercall(&args))
return -EIO;
regs->ax = lower_32_bits(args.r11);
@@ -294,7 +317,7 @@ static int read_msr(struct pt_regs *regs, struct ve_info *ve)
static int write_msr(struct pt_regs *regs, struct ve_info *ve)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_MSR_WRITE),
.r12 = regs->cx,
@@ -314,7 +337,7 @@ static int write_msr(struct pt_regs *regs, struct ve_info *ve)
static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_CPUID),
.r12 = regs->ax,
@@ -338,7 +361,7 @@ static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
* ABI can be found in TDX Guest-Host-Communication Interface
* (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
*/
- if (__tdx_hypercall_ret(&args))
+ if (__tdx_hypercall(&args))
return -EIO;
/*
@@ -356,7 +379,7 @@ static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
static bool mmio_read(int size, unsigned long addr, unsigned long *val)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
.r12 = size,
@@ -365,8 +388,9 @@ static bool mmio_read(int size, unsigned long addr, unsigned long *val)
.r15 = *val,
};
- if (__tdx_hypercall_ret(&args))
+ if (__tdx_hypercall(&args))
return false;
+
*val = args.r11;
return true;
}
@@ -484,7 +508,7 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
static bool handle_in(struct pt_regs *regs, int size, int port)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
.r12 = size,
@@ -499,7 +523,7 @@ static bool handle_in(struct pt_regs *regs, int size, int port)
* in TDX Guest-Host-Communication Interface (GHCI) section titled
* "TDG.VP.VMCALL<Instruction.IO>".
*/
- success = !__tdx_hypercall_ret(&args);
+ success = !__tdx_hypercall(&args);
/* Update part of the register affected by the emulated instruction */
regs->ax &= ~mask;
@@ -578,7 +602,7 @@ __init bool tdx_early_handle_ve(struct pt_regs *regs)
void tdx_get_ve_info(struct ve_info *ve)
{
- struct tdx_module_output out;
+ struct tdx_module_args args = {};
/*
* Called during #VE handling to retrieve the #VE info from the
@@ -595,15 +619,15 @@ void tdx_get_ve_info(struct ve_info *ve)
* Note, the TDX module treats virtual NMIs as inhibited if the #VE
* valid flag is set. It means that NMI=>#VE will not result in a #DF.
*/
- tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out);
+ tdcall(TDG_VP_VEINFO_GET, &args);
/* Transfer the output parameters */
- ve->exit_reason = out.rcx;
- ve->exit_qual = out.rdx;
- ve->gla = out.r8;
- ve->gpa = out.r9;
- ve->instr_len = lower_32_bits(out.r10);
- ve->instr_info = upper_32_bits(out.r10);
+ ve->exit_reason = args.rcx;
+ ve->exit_qual = args.rdx;
+ ve->gla = args.r8;
+ ve->gpa = args.r9;
+ ve->instr_len = lower_32_bits(args.r10);
+ ve->instr_info = upper_32_bits(args.r10);
}
/*
@@ -704,14 +728,15 @@ static bool tdx_cache_flush_required(void)
}
/*
- * Inform the VMM of the guest's intent for this physical page: shared with
- * the VMM or private to the guest. The VMM is expected to change its mapping
- * of the page in response.
+ * Notify the VMM about page mapping conversion. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface (GHCI),
+ * section "TDG.VP.VMCALL<MapGPA>".
*/
-static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
+static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc)
{
- phys_addr_t start = __pa(vaddr);
- phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);
+ /* Retrying the hypercall a second time should succeed; use 3 just in case */
+ const int max_retries_per_page = 3;
+ int retry_count = 0;
if (!enc) {
/* Set the shared (decrypted) bits: */
@@ -719,12 +744,51 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
end |= cc_mkdec(0);
}
- /*
- * Notify the VMM about page mapping conversion. More info about ABI
- * can be found in TDX Guest-Host-Communication Interface (GHCI),
- * section "TDG.VP.VMCALL<MapGPA>"
- */
- if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0))
+ while (retry_count < max_retries_per_page) {
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = TDVMCALL_MAP_GPA,
+ .r12 = start,
+ .r13 = end - start };
+
+ u64 map_fail_paddr;
+ u64 ret = __tdx_hypercall(&args);
+
+ if (ret != TDVMCALL_STATUS_RETRY)
+ return !ret;
+ /*
+ * The guest must retry the operation for the pages in the
+ * region starting at the GPA specified in R11. R11 comes
+ * from the untrusted VMM. Sanity check it.
+ */
+ map_fail_paddr = args.r11;
+ if (map_fail_paddr < start || map_fail_paddr >= end)
+ return false;
+
+ /* "Consume" a retry without forward progress */
+ if (map_fail_paddr == start) {
+ retry_count++;
+ continue;
+ }
+
+ start = map_fail_paddr;
+ retry_count = 0;
+ }
+
+ return false;
+}
+
+/*
+ * Inform the VMM of the guest's intent for this physical page: shared with
+ * the VMM or private to the guest. The VMM is expected to change its mapping
+ * of the page in response.
+ */
+static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
+{
+ phys_addr_t start = __pa(vaddr);
+ phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);
+
+ if (!tdx_map_gpa(start, end, enc))
return false;
/* shared->private conversion requires memory to be accepted before use */
@@ -760,6 +824,10 @@ static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
void __init tdx_early_init(void)
{
+ struct tdx_module_args args = {
+ .rdx = TDCS_NOTIFY_ENABLES,
+ .r9 = -1ULL,
+ };
u64 cc_mask;
u32 eax, sig[3];
@@ -770,12 +838,15 @@ void __init tdx_early_init(void)
setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
+ /* TSC is the only reliable clock in TDX guest */
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
cc_vendor = CC_VENDOR_INTEL;
tdx_parse_tdinfo(&cc_mask);
cc_set_mask(cc_mask);
/* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
- tdx_module_call(TDX_WR, 0, TDCS_NOTIFY_ENABLES, 0, -1ULL, NULL);
+ tdcall(TDG_VM_WR, &args);
/*
* All bits above GPA width are reserved and kernel treats shared bit
@@ -816,7 +887,7 @@ void __init tdx_early_init(void)
* there.
*
* Intel-TDX has a secure RDMSR hypercall, but that needs to be
- * implemented seperately in the low level startup ASM code.
+ * implemented separately in the low level startup ASM code.
* Until that is in place, disable parallel bringup for TDX.
*/
x86_cpuinit.parallel_bringup = false;
diff --git a/arch/x86/configs/hardening.config b/arch/x86/configs/hardening.config
new file mode 100644
index 000000000..7b497f3b7
--- /dev/null
+++ b/arch/x86/configs/hardening.config
@@ -0,0 +1,14 @@
+# Basic kernel hardening options (specific to x86)
+
+# Modern libc no longer needs a fixed-position mapping in userspace, remove
+# it as a possible target.
+CONFIG_LEGACY_VSYSCALL_NONE=y
+
+# Enable chip-specific IOMMU support.
+CONFIG_INTEL_IOMMU=y
+CONFIG_INTEL_IOMMU_DEFAULT_ON=y
+CONFIG_INTEL_IOMMU_SVM=y
+CONFIG_AMD_IOMMU=y
+
+# Enable CET Shadow Stack for userspace.
+CONFIG_X86_USER_SHADOW_STACK=y
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 1b411bbf3..73abbbdd2 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -281,4 +281,5 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_BOOT_PARAMS=y
CONFIG_UNWINDER_FRAME_POINTER=y
+CONFIG_DEBUG_ENTRY=y
# CONFIG_64BIT is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 409e9182b..61e25f620 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -276,3 +276,4 @@ CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_BOOT_PARAMS=y
+CONFIG_DEBUG_ENTRY=y
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 9bbfd01cf..c9e59589a 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -189,7 +189,7 @@ config CRYPTO_SERPENT_AVX2_X86_64
Processes 16 blocks in parallel.
config CRYPTO_SM4_AESNI_AVX_X86_64
- tristate "Ciphers: SM4 with modes: ECB, CBC, CFB, CTR (AES-NI/AVX)"
+ tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX)"
depends on X86 && 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_SIMD
@@ -197,7 +197,7 @@ config CRYPTO_SM4_AESNI_AVX_X86_64
select CRYPTO_SM4
help
Length-preserving ciphers: SM4 cipher algorithms
- (OSCCA GB/T 32907-2016) with ECB, CBC, CFB, and CTR modes
+ (OSCCA GB/T 32907-2016) with ECB, CBC, and CTR modes
Architecture: x86_64 using:
- AES-NI (AES New Instructions)
@@ -210,7 +210,7 @@ config CRYPTO_SM4_AESNI_AVX_X86_64
If unsure, say N.
config CRYPTO_SM4_AESNI_AVX2_X86_64
- tristate "Ciphers: SM4 with modes: ECB, CBC, CFB, CTR (AES-NI/AVX2)"
+ tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX2)"
depends on X86 && 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_SIMD
@@ -219,7 +219,7 @@ config CRYPTO_SM4_AESNI_AVX2_X86_64
select CRYPTO_SM4_AESNI_AVX_X86_64
help
Length-preserving ciphers: SM4 cipher algorithms
- (OSCCA GB/T 32907-2016) with ECB, CBC, CFB, and CTR modes
+ (OSCCA GB/T 32907-2016) with ECB, CBC, and CTR modes
Architecture: x86_64 using:
- AES-NI (AES New Instructions)
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 3ac7487ec..411d8c83e 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -666,13 +666,13 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.ifc \operation, dec
movdqa %xmm1, %xmm3
- pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
+ pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn)
mov \PLAIN_CYPH_LEN, %r10
add %r13, %r10
# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
sub $16, %r10
- # Determine if if partial block is not being filled and
+ # Determine if partial block is not being filled and
# shift mask accordingly
jge .L_no_extra_mask_1_\@
sub %r10, %r12
@@ -708,7 +708,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
add %r13, %r10
# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
sub $16, %r10
- # Determine if if partial block is not being filled and
+ # Determine if partial block is not being filled and
# shift mask accordingly
jge .L_no_extra_mask_2_\@
sub %r10, %r12
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 46cddd788..8c9749ed0 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -747,13 +747,13 @@ VARIABLE_OFFSET = 16*8
.if \ENC_DEC == DEC
vmovdqa %xmm1, %xmm3
- pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
+ pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn)
mov \PLAIN_CYPH_LEN, %r10
add %r13, %r10
# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
sub $16, %r10
- # Determine if if partial block is not being filled and
+ # Determine if partial block is not being filled and
# shift mask accordingly
jge .L_no_extra_mask_1_\@
sub %r10, %r12
@@ -789,7 +789,7 @@ VARIABLE_OFFSET = 16*8
add %r13, %r10
# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
sub $16, %r10
- # Determine if if partial block is not being filled and
+ # Determine if partial block is not being filled and
# shift mask accordingly
jge .L_no_extra_mask_2_\@
sub %r10, %r12
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 39d6a62ac..b1d90c259 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -61,8 +61,8 @@ struct generic_gcmaes_ctx {
};
struct aesni_xts_ctx {
- u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR;
- u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR;
+ struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR;
+ struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR;
};
#define GCM_BLOCK_LEN 16
@@ -80,6 +80,13 @@ struct gcm_context_data {
u8 hash_keys[GCM_BLOCK_LEN * 16];
};
+static inline void *aes_align_addr(void *addr)
+{
+ if (crypto_tfm_ctx_alignment() >= AESNI_ALIGN)
+ return addr;
+ return PTR_ALIGN(addr, AESNI_ALIGN);
+}
+
asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
unsigned int key_len);
asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in);
@@ -201,32 +208,24 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2);
static inline struct
aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
{
- unsigned long align = AESNI_ALIGN;
-
- if (align <= crypto_tfm_ctx_alignment())
- align = 1;
- return PTR_ALIGN(crypto_aead_ctx(tfm), align);
+ return aes_align_addr(crypto_aead_ctx(tfm));
}
static inline struct
generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm)
{
- unsigned long align = AESNI_ALIGN;
-
- if (align <= crypto_tfm_ctx_alignment())
- align = 1;
- return PTR_ALIGN(crypto_aead_ctx(tfm), align);
+ return aes_align_addr(crypto_aead_ctx(tfm));
}
#endif
static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
{
- unsigned long addr = (unsigned long)raw_ctx;
- unsigned long align = AESNI_ALIGN;
+ return aes_align_addr(raw_ctx);
+}
- if (align <= crypto_tfm_ctx_alignment())
- align = 1;
- return (struct crypto_aes_ctx *)ALIGN(addr, align);
+static inline struct aesni_xts_ctx *aes_xts_ctx(struct crypto_skcipher *tfm)
+{
+ return aes_align_addr(crypto_skcipher_ctx(tfm));
}
static int aes_set_key_common(struct crypto_aes_ctx *ctx,
@@ -881,7 +880,7 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
- struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
int err;
err = xts_verify_key(tfm, key, keylen);
@@ -891,19 +890,18 @@ static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
keylen /= 2;
/* first half of xts-key is for crypt */
- err = aes_set_key_common(aes_ctx(ctx->raw_crypt_ctx), key, keylen);
+ err = aes_set_key_common(&ctx->crypt_ctx, key, keylen);
if (err)
return err;
/* second half of xts-key is for tweak */
- return aes_set_key_common(aes_ctx(ctx->raw_tweak_ctx), key + keylen,
- keylen);
+ return aes_set_key_common(&ctx->tweak_ctx, key + keylen, keylen);
}
static int xts_crypt(struct skcipher_request *req, bool encrypt)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
int tail = req->cryptlen % AES_BLOCK_SIZE;
struct skcipher_request subreq;
struct skcipher_walk walk;
@@ -939,7 +937,7 @@ static int xts_crypt(struct skcipher_request *req, bool encrypt)
kernel_fpu_begin();
/* calculate first value of T */
- aesni_enc(aes_ctx(ctx->raw_tweak_ctx), walk.iv, walk.iv);
+ aesni_enc(&ctx->tweak_ctx, walk.iv, walk.iv);
while (walk.nbytes > 0) {
int nbytes = walk.nbytes;
@@ -948,11 +946,11 @@ static int xts_crypt(struct skcipher_request *req, bool encrypt)
nbytes &= ~(AES_BLOCK_SIZE - 1);
if (encrypt)
- aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx),
+ aesni_xts_encrypt(&ctx->crypt_ctx,
walk.dst.virt.addr, walk.src.virt.addr,
nbytes, walk.iv);
else
- aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx),
+ aesni_xts_decrypt(&ctx->crypt_ctx,
walk.dst.virt.addr, walk.src.virt.addr,
nbytes, walk.iv);
kernel_fpu_end();
@@ -980,11 +978,11 @@ static int xts_crypt(struct skcipher_request *req, bool encrypt)
kernel_fpu_begin();
if (encrypt)
- aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx),
+ aesni_xts_encrypt(&ctx->crypt_ctx,
walk.dst.virt.addr, walk.src.virt.addr,
walk.nbytes, walk.iv);
else
- aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx),
+ aesni_xts_decrypt(&ctx->crypt_ctx,
walk.dst.virt.addr, walk.src.virt.addr,
walk.nbytes, walk.iv);
kernel_fpu_end();
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 81ce0f4db..bbcff1fb7 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -184,7 +184,7 @@ SYM_FUNC_START(crc_pcl)
xor crc1,crc1
xor crc2,crc2
- # Fall thruogh into top of crc array (crc_128)
+ # Fall through into top of crc array (crc_128)
################################################################
## 3) CRC Array:
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
index ef73a3ab8..791386d9a 100644
--- a/arch/x86/crypto/nh-avx2-x86_64.S
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -154,5 +154,6 @@ SYM_TYPED_FUNC_START(nh_avx2)
vpaddq T1, T0, T0
vpaddq T4, T0, T0
vmovdqu T0, (HASH)
+ vzeroupper
RET
SYM_FUNC_END(nh_avx2)
diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
index 46b036204..c3a872f4d 100644
--- a/arch/x86/crypto/nhpoly1305-avx2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -34,6 +34,14 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc,
return 0;
}
+static int nhpoly1305_avx2_digest(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen, u8 *out)
+{
+ return crypto_nhpoly1305_init(desc) ?:
+ nhpoly1305_avx2_update(desc, src, srclen) ?:
+ crypto_nhpoly1305_final(desc, out);
+}
+
static struct shash_alg nhpoly1305_alg = {
.base.cra_name = "nhpoly1305",
.base.cra_driver_name = "nhpoly1305-avx2",
@@ -44,6 +52,7 @@ static struct shash_alg nhpoly1305_alg = {
.init = crypto_nhpoly1305_init,
.update = nhpoly1305_avx2_update,
.final = crypto_nhpoly1305_final,
+ .digest = nhpoly1305_avx2_digest,
.setkey = crypto_nhpoly1305_setkey,
.descsize = sizeof(struct nhpoly1305_state),
};
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
index 4a4970d75..a268a8439 100644
--- a/arch/x86/crypto/nhpoly1305-sse2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -34,6 +34,14 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc,
return 0;
}
+static int nhpoly1305_sse2_digest(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen, u8 *out)
+{
+ return crypto_nhpoly1305_init(desc) ?:
+ nhpoly1305_sse2_update(desc, src, srclen) ?:
+ crypto_nhpoly1305_final(desc, out);
+}
+
static struct shash_alg nhpoly1305_alg = {
.base.cra_name = "nhpoly1305",
.base.cra_driver_name = "nhpoly1305-sse2",
@@ -44,6 +52,7 @@ static struct shash_alg nhpoly1305_alg = {
.init = crypto_nhpoly1305_init,
.update = nhpoly1305_sse2_update,
.final = crypto_nhpoly1305_final,
+ .digest = nhpoly1305_sse2_digest,
.setkey = crypto_nhpoly1305_setkey,
.descsize = sizeof(struct nhpoly1305_state),
};
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 959afa705..ab8bc54f2 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -2,8 +2,8 @@
/*
* Cryptographic API.
*
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
- * Supplemental SSE3 instructions.
+ * Glue code for the SHA1 Secure Hash Algorithm assembler implementations
+ * using SSSE3, AVX, AVX2, and SHA-NI instructions.
*
* This file is based on sha1_generic.c
*
@@ -28,6 +28,9 @@
#include <asm/simd.h>
static const struct x86_cpu_id module_cpu_ids[] = {
+#ifdef CONFIG_AS_SHA1_NI
+ X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
+#endif
X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 9918212fa..0ffb072be 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -716,6 +716,7 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx)
popq %r13
popq %r12
popq %rbx
+ vzeroupper
RET
SYM_FUNC_END(sha256_transform_rorx)
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index d25235f0c..e04a43d9f 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -1,8 +1,8 @@
/*
* Cryptographic API.
*
- * Glue code for the SHA256 Secure Hash Algorithm assembler
- * implementation using supplemental SSE3 / AVX / AVX2 instructions.
+ * Glue code for the SHA256 Secure Hash Algorithm assembler implementations
+ * using SSSE3, AVX, AVX2, and SHA-NI instructions.
*
* This file is based on sha256_generic.c
*
@@ -45,6 +45,9 @@ asmlinkage void sha256_transform_ssse3(struct sha256_state *state,
const u8 *data, int blocks);
static const struct x86_cpu_id module_cpu_ids[] = {
+#ifdef CONFIG_AS_SHA256_NI
+ X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
+#endif
X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
@@ -107,12 +110,20 @@ static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
return sha256_ssse3_finup(desc, NULL, 0, out);
}
+static int sha256_ssse3_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_base_init(desc) ?:
+ sha256_ssse3_finup(desc, data, len, out);
+}
+
static struct shash_alg sha256_ssse3_algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
.update = sha256_ssse3_update,
.final = sha256_ssse3_final,
.finup = sha256_ssse3_finup,
+ .digest = sha256_ssse3_digest,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha256",
@@ -172,12 +183,20 @@ static int sha256_avx_final(struct shash_desc *desc, u8 *out)
return sha256_avx_finup(desc, NULL, 0, out);
}
+static int sha256_avx_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_base_init(desc) ?:
+ sha256_avx_finup(desc, data, len, out);
+}
+
static struct shash_alg sha256_avx_algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
.update = sha256_avx_update,
.final = sha256_avx_final,
.finup = sha256_avx_finup,
+ .digest = sha256_avx_digest,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha256",
@@ -248,12 +267,20 @@ static int sha256_avx2_final(struct shash_desc *desc, u8 *out)
return sha256_avx2_finup(desc, NULL, 0, out);
}
+static int sha256_avx2_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_base_init(desc) ?:
+ sha256_avx2_finup(desc, data, len, out);
+}
+
static struct shash_alg sha256_avx2_algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
.update = sha256_avx2_update,
.final = sha256_avx2_final,
.finup = sha256_avx2_finup,
+ .digest = sha256_avx2_digest,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha256",
@@ -323,12 +350,20 @@ static int sha256_ni_final(struct shash_desc *desc, u8 *out)
return sha256_ni_finup(desc, NULL, 0, out);
}
+static int sha256_ni_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_base_init(desc) ?:
+ sha256_ni_finup(desc, data, len, out);
+}
+
static struct shash_alg sha256_ni_algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
.update = sha256_ni_update,
.final = sha256_ni_final,
.finup = sha256_ni_finup,
+ .digest = sha256_ni_digest,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha256",
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index d902b8ea0..5bfce4b04 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -84,7 +84,7 @@ frame_size = frame_WK + WK_SIZE
# Useful QWORD "arrays" for simpler memory references
# MSG, DIGEST, K_t, W_t are arrays
-# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
# Input message (arg1)
#define MSG(i) 8*i(msg)
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index f08496cd6..24973f42c 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -680,6 +680,7 @@ SYM_TYPED_FUNC_START(sha512_transform_rorx)
pop %r12
pop %rbx
+ vzeroupper
RET
SYM_FUNC_END(sha512_transform_rorx)
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index 65be30156..30a2c4777 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -82,7 +82,7 @@ frame_size = frame_WK + WK_SIZE
# Useful QWORD "arrays" for simpler memory references
# MSG, DIGEST, K_t, W_t are arrays
-# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
# Input message (arg1)
#define MSG(i) 8*i(msg)
diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
index e2668d2fe..2bf611eaa 100644
--- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
@@ -534,55 +534,3 @@ SYM_TYPED_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
FRAME_END
RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
-
-/*
- * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
- * const u8 *src, u8 *iv)
- */
-SYM_TYPED_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
- /* input:
- * %rdi: round key array, CTX
- * %rsi: dst (8 blocks)
- * %rdx: src (8 blocks)
- * %rcx: iv
- */
- FRAME_BEGIN
-
- /* Load input */
- vmovdqu (%rcx), RA0;
- vmovdqu 0 * 16(%rdx), RA1;
- vmovdqu 1 * 16(%rdx), RA2;
- vmovdqu 2 * 16(%rdx), RA3;
- vmovdqu 3 * 16(%rdx), RB0;
- vmovdqu 4 * 16(%rdx), RB1;
- vmovdqu 5 * 16(%rdx), RB2;
- vmovdqu 6 * 16(%rdx), RB3;
-
- /* Update IV */
- vmovdqu 7 * 16(%rdx), RNOT;
- vmovdqu RNOT, (%rcx);
-
- call __sm4_crypt_blk8;
-
- vpxor (0 * 16)(%rdx), RA0, RA0;
- vpxor (1 * 16)(%rdx), RA1, RA1;
- vpxor (2 * 16)(%rdx), RA2, RA2;
- vpxor (3 * 16)(%rdx), RA3, RA3;
- vpxor (4 * 16)(%rdx), RB0, RB0;
- vpxor (5 * 16)(%rdx), RB1, RB1;
- vpxor (6 * 16)(%rdx), RB2, RB2;
- vpxor (7 * 16)(%rdx), RB3, RB3;
-
- vmovdqu RA0, (0 * 16)(%rsi);
- vmovdqu RA1, (1 * 16)(%rsi);
- vmovdqu RA2, (2 * 16)(%rsi);
- vmovdqu RA3, (3 * 16)(%rsi);
- vmovdqu RB0, (4 * 16)(%rsi);
- vmovdqu RB1, (5 * 16)(%rsi);
- vmovdqu RB2, (6 * 16)(%rsi);
- vmovdqu RB3, (7 * 16)(%rsi);
-
- vzeroall;
- FRAME_END
- RET;
-SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)
diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
index 98ede9459..9ff5ba075 100644
--- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
@@ -439,58 +439,3 @@ SYM_TYPED_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
FRAME_END
RET;
SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)
-
-/*
- * void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
- * const u8 *src, u8 *iv)
- */
-SYM_TYPED_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16)
- /* input:
- * %rdi: round key array, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv
- */
- FRAME_BEGIN
-
- vzeroupper;
-
- /* Load input */
- vmovdqu (%rcx), RNOTx;
- vinserti128 $1, (%rdx), RNOT, RA0;
- vmovdqu (0 * 32 + 16)(%rdx), RA1;
- vmovdqu (1 * 32 + 16)(%rdx), RA2;
- vmovdqu (2 * 32 + 16)(%rdx), RA3;
- vmovdqu (3 * 32 + 16)(%rdx), RB0;
- vmovdqu (4 * 32 + 16)(%rdx), RB1;
- vmovdqu (5 * 32 + 16)(%rdx), RB2;
- vmovdqu (6 * 32 + 16)(%rdx), RB3;
-
- /* Update IV */
- vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
- vmovdqu RNOTx, (%rcx);
-
- call __sm4_crypt_blk16;
-
- vpxor (0 * 32)(%rdx), RA0, RA0;
- vpxor (1 * 32)(%rdx), RA1, RA1;
- vpxor (2 * 32)(%rdx), RA2, RA2;
- vpxor (3 * 32)(%rdx), RA3, RA3;
- vpxor (4 * 32)(%rdx), RB0, RB0;
- vpxor (5 * 32)(%rdx), RB1, RB1;
- vpxor (6 * 32)(%rdx), RB2, RB2;
- vpxor (7 * 32)(%rdx), RB3, RB3;
-
- vmovdqu RA0, (0 * 32)(%rsi);
- vmovdqu RA1, (1 * 32)(%rsi);
- vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
- vmovdqu RB1, (5 * 32)(%rsi);
- vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
-
- vzeroall;
- FRAME_END
- RET;
-SYM_FUNC_END(sm4_aesni_avx2_cfb_dec_blk16)
diff --git a/arch/x86/crypto/sm4-avx.h b/arch/x86/crypto/sm4-avx.h
index 1bceab751..b5b5e67e4 100644
--- a/arch/x86/crypto/sm4-avx.h
+++ b/arch/x86/crypto/sm4-avx.h
@@ -14,10 +14,6 @@ int sm4_cbc_encrypt(struct skcipher_request *req);
int sm4_avx_cbc_decrypt(struct skcipher_request *req,
unsigned int bsize, sm4_crypt_func func);
-int sm4_cfb_encrypt(struct skcipher_request *req);
-int sm4_avx_cfb_decrypt(struct skcipher_request *req,
- unsigned int bsize, sm4_crypt_func func);
-
int sm4_avx_ctr_crypt(struct skcipher_request *req,
unsigned int bsize, sm4_crypt_func func);
diff --git a/arch/x86/crypto/sm4_aesni_avx2_glue.c b/arch/x86/crypto/sm4_aesni_avx2_glue.c
index 84bc718f4..1148fd4cd 100644
--- a/arch/x86/crypto/sm4_aesni_avx2_glue.c
+++ b/arch/x86/crypto/sm4_aesni_avx2_glue.c
@@ -23,8 +23,6 @@ asmlinkage void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
const u8 *src, u8 *iv);
asmlinkage void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
const u8 *src, u8 *iv);
-asmlinkage void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
- const u8 *src, u8 *iv);
static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int key_len)
@@ -41,12 +39,6 @@ static int cbc_decrypt(struct skcipher_request *req)
}
-static int cfb_decrypt(struct skcipher_request *req)
-{
- return sm4_avx_cfb_decrypt(req, SM4_CRYPT16_BLOCK_SIZE,
- sm4_aesni_avx2_cfb_dec_blk16);
-}
-
static int ctr_crypt(struct skcipher_request *req)
{
return sm4_avx_ctr_crypt(req, SM4_CRYPT16_BLOCK_SIZE,
@@ -89,24 +81,6 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
.decrypt = cbc_decrypt,
}, {
.base = {
- .cra_name = "__cfb(sm4)",
- .cra_driver_name = "__cfb-sm4-aesni-avx2",
- .cra_priority = 500,
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct sm4_ctx),
- .cra_module = THIS_MODULE,
- },
- .min_keysize = SM4_KEY_SIZE,
- .max_keysize = SM4_KEY_SIZE,
- .ivsize = SM4_BLOCK_SIZE,
- .chunksize = SM4_BLOCK_SIZE,
- .walksize = 16 * SM4_BLOCK_SIZE,
- .setkey = sm4_skcipher_setkey,
- .encrypt = sm4_cfb_encrypt,
- .decrypt = cfb_decrypt,
- }, {
- .base = {
.cra_name = "__ctr(sm4)",
.cra_driver_name = "__ctr-sm4-aesni-avx2",
.cra_priority = 500,
diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c
index 7800f77d6..85b4ca78b 100644
--- a/arch/x86/crypto/sm4_aesni_avx_glue.c
+++ b/arch/x86/crypto/sm4_aesni_avx_glue.c
@@ -27,8 +27,6 @@ asmlinkage void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
const u8 *src, u8 *iv);
asmlinkage void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
const u8 *src, u8 *iv);
-asmlinkage void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
- const u8 *src, u8 *iv);
static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int key_len)
@@ -188,116 +186,6 @@ static int cbc_decrypt(struct skcipher_request *req)
sm4_aesni_avx_cbc_dec_blk8);
}
-int sm4_cfb_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes) > 0) {
- u8 keystream[SM4_BLOCK_SIZE];
- const u8 *iv = walk.iv;
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
-
- while (nbytes >= SM4_BLOCK_SIZE) {
- sm4_crypt_block(ctx->rkey_enc, keystream, iv);
- crypto_xor_cpy(dst, src, keystream, SM4_BLOCK_SIZE);
- iv = dst;
- src += SM4_BLOCK_SIZE;
- dst += SM4_BLOCK_SIZE;
- nbytes -= SM4_BLOCK_SIZE;
- }
- if (iv != walk.iv)
- memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
-
- /* tail */
- if (walk.nbytes == walk.total && nbytes > 0) {
- sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
- crypto_xor_cpy(dst, src, keystream, nbytes);
- nbytes = 0;
- }
-
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(sm4_cfb_encrypt);
-
-int sm4_avx_cfb_decrypt(struct skcipher_request *req,
- unsigned int bsize, sm4_crypt_func func)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes) > 0) {
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
-
- kernel_fpu_begin();
-
- while (nbytes >= bsize) {
- func(ctx->rkey_enc, dst, src, walk.iv);
- dst += bsize;
- src += bsize;
- nbytes -= bsize;
- }
-
- while (nbytes >= SM4_BLOCK_SIZE) {
- u8 keystream[SM4_BLOCK_SIZE * 8];
- unsigned int nblocks = min(nbytes >> 4, 8u);
-
- memcpy(keystream, walk.iv, SM4_BLOCK_SIZE);
- if (nblocks > 1)
- memcpy(&keystream[SM4_BLOCK_SIZE], src,
- (nblocks - 1) * SM4_BLOCK_SIZE);
- memcpy(walk.iv, src + (nblocks - 1) * SM4_BLOCK_SIZE,
- SM4_BLOCK_SIZE);
-
- sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream,
- keystream, nblocks);
-
- crypto_xor_cpy(dst, src, keystream,
- nblocks * SM4_BLOCK_SIZE);
- dst += nblocks * SM4_BLOCK_SIZE;
- src += nblocks * SM4_BLOCK_SIZE;
- nbytes -= nblocks * SM4_BLOCK_SIZE;
- }
-
- kernel_fpu_end();
-
- /* tail */
- if (walk.nbytes == walk.total && nbytes > 0) {
- u8 keystream[SM4_BLOCK_SIZE];
-
- sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
- crypto_xor_cpy(dst, src, keystream, nbytes);
- nbytes = 0;
- }
-
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(sm4_avx_cfb_decrypt);
-
-static int cfb_decrypt(struct skcipher_request *req)
-{
- return sm4_avx_cfb_decrypt(req, SM4_CRYPT8_BLOCK_SIZE,
- sm4_aesni_avx_cfb_dec_blk8);
-}
-
int sm4_avx_ctr_crypt(struct skcipher_request *req,
unsigned int bsize, sm4_crypt_func func)
{
@@ -408,24 +296,6 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
.decrypt = cbc_decrypt,
}, {
.base = {
- .cra_name = "__cfb(sm4)",
- .cra_driver_name = "__cfb-sm4-aesni-avx",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct sm4_ctx),
- .cra_module = THIS_MODULE,
- },
- .min_keysize = SM4_KEY_SIZE,
- .max_keysize = SM4_KEY_SIZE,
- .ivsize = SM4_BLOCK_SIZE,
- .chunksize = SM4_BLOCK_SIZE,
- .walksize = 8 * SM4_BLOCK_SIZE,
- .setkey = sm4_skcipher_setkey,
- .encrypt = sm4_cfb_encrypt,
- .decrypt = cfb_decrypt,
- }, {
- .base = {
.cra_name = "__ctr(sm4)",
.cra_driver_name = "__ctr-sm4-aesni-avx",
.cra_priority = 400,
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index f69076271..9f1d94790 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -175,8 +175,7 @@ For 32-bit we have the following conventions - kernel is built with
#define THIS_CPU_user_pcid_flush_mask \
PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
-.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
- ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+.macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req
mov %cr3, \scratch_reg
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
@@ -206,13 +205,20 @@ For 32-bit we have the following conventions - kernel is built with
/* Flip the PGD to the user version */
orq $(PTI_USER_PGTABLE_MASK), \scratch_reg
mov \scratch_reg, %cr3
+.endm
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ SWITCH_TO_USER_CR3 \scratch_reg \scratch_reg2
.Lend_\@:
.endm
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
pushq %rax
- SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+ SWITCH_TO_USER_CR3 scratch_reg=\scratch_reg scratch_reg2=%rax
popq %rax
+.Lend_\@:
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 9c0b26ae5..6de50b807 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -19,6 +19,7 @@
#include <linux/nospec.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
+#include <linux/init.h>
#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
@@ -48,7 +49,7 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
if (likely(unr < NR_syscalls)) {
unr = array_index_nospec(unr, NR_syscalls);
- regs->ax = sys_call_table[unr](regs);
+ regs->ax = x64_sys_call(regs, unr);
return true;
}
return false;
@@ -65,13 +66,14 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
xnr = array_index_nospec(xnr, X32_NR_syscalls);
- regs->ax = x32_sys_call_table[xnr](regs);
+ regs->ax = x32_sys_call(regs, xnr);
return true;
}
return false;
}
-__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
+/* Returns true to return using SYSRET, or false to use IRET */
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
@@ -85,6 +87,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
instrumentation_end();
syscall_exit_to_user_mode(regs);
+
+ /*
+ * Check that the register state is valid for using SYSRET to exit
+ * to userspace. Otherwise use the slower but fully capable IRET
+ * exit path.
+ */
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* SYSRET requires RCX == RIP and R11 == EFLAGS */
+ if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+ return false;
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+ * TASK_SIZE_MAX covers all user-accessible addresses other than
+ * the deprecated vsyscall page.
+ */
+ if (unlikely(regs->ip >= TASK_SIZE_MAX))
+ return false;
+
+ /*
+ * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET.
+ */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
+ return false;
+
+ /* Use SYSRET to exit to userspace */
+ return true;
}
#endif
@@ -98,7 +140,13 @@ static __always_inline int syscall_32_enter(struct pt_regs *regs)
}
#ifdef CONFIG_IA32_EMULATION
-bool __ia32_enabled __ro_after_init = true;
+bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
+
+static int ia32_emulation_override_cmdline(char *arg)
+{
+ return kstrtobool(arg, &__ia32_enabled);
+}
+early_param("ia32_emulation", ia32_emulation_override_cmdline);
#endif
/*
@@ -114,7 +162,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
if (likely(unr < IA32_NR_syscalls)) {
unr = array_index_nospec(unr, IA32_NR_syscalls);
- regs->ax = ia32_sys_call_table[unr](regs);
+ regs->ax = ia32_sys_call(regs, unr);
} else if (nr != -1) {
regs->ax = __ia32_sys_ni_syscall(regs);
}
@@ -141,7 +189,7 @@ static __always_inline bool int80_is_external(void)
}
/**
- * int80_emulation - 32-bit legacy syscall entry
+ * do_int80_emulation - 32-bit legacy syscall C entry from asm
*
* This entry point can be used by 32-bit and 64-bit programs to perform
* 32-bit system calls. Instances of INT $0x80 can be found inline in
@@ -159,7 +207,7 @@ static __always_inline bool int80_is_external(void)
* eax: system call number
* ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
*/
-DEFINE_IDTENTRY_RAW(int80_emulation)
+__visible noinstr void do_int80_emulation(struct pt_regs *regs)
{
int nr;
@@ -277,8 +325,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
return true;
}
-/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
{
/*
* Called using the internal vDSO SYSENTER/SYSCALL32 calling
@@ -296,41 +344,36 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
/* Invoke the syscall. If it failed, keep it simple: use IRET. */
if (!__do_fast_syscall_32(regs))
- return 0;
+ return false;
-#ifdef CONFIG_X86_64
/*
- * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
- * SYSRETL is available on all 64-bit CPUs, so we don't need to
- * bother with SYSEXIT.
- *
- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
- * because the ECX fixup above will ensure that this is essentially
- * never the case.
+ * Check that the register state is valid for using SYSRETL/SYSEXIT
+ * to exit to userspace. Otherwise use the slower but fully capable
+ * IRET exit path.
*/
- return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
- regs->ip == landing_pad &&
- (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
-#else
- /*
- * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
- *
- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
- * because the ECX fixup above will ensure that this is essentially
- * never the case.
- *
- * We don't allow syscalls at all from VM86 mode, but we still
- * need to check VM, because we might be returning from sys_vm86.
- */
- return static_cpu_has(X86_FEATURE_SEP) &&
- regs->cs == __USER_CS && regs->ss == __USER_DS &&
- regs->ip == landing_pad &&
- (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
-#endif
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* EIP must point to the VDSO landing pad */
+ if (unlikely(regs->ip != landing_pad))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
+ return false;
+
+ /* If the TF, RF, or VM flags are set, use IRET */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
+ return false;
+
+ /* Use SYSRETL/SYSEXIT to exit to userspace */
+ return true;
}
-/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
{
/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
regs->sp = regs->bp;
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index bfb7bcb36..003379049 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -3,9 +3,12 @@
* Common place for both 32- and 64-bit entry routines.
*/
+#include <linux/export.h>
#include <linux/linkage.h>
-#include <asm/export.h>
#include <asm/msr-index.h>
+#include <asm/unwind_hints.h>
+#include <asm/segment.h>
+#include <asm/cache.h>
.pushsection .noinstr.text, "ax"
@@ -20,3 +23,23 @@ SYM_FUNC_END(entry_ibpb)
EXPORT_SYMBOL_GPL(entry_ibpb);
.popsection
+
+/*
+ * Define the VERW operand that is disguised as entry code so that
+ * it can be referenced with KPTI enabled. This ensure VERW can be
+ * used late in exit-to-user path after page tables are switched.
+ */
+.pushsection .entry.text, "ax"
+
+.align L1_CACHE_BYTES, 0xcc
+SYM_CODE_START_NOALIGN(mds_verw_sel)
+ UNWIND_HINT_UNDEFINED
+ ANNOTATE_NOENDBR
+ .word __KERNEL_DS
+.align L1_CACHE_BYTES, 0xcc
+SYM_CODE_END(mds_verw_sel);
+/* For KVM */
+EXPORT_SYMBOL_GPL(mds_verw_sel);
+
+.popsection
+
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 6e6af42e0..fba427646 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -837,7 +837,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
movl %esp, %eax
call do_SYSENTER_32
- testl %eax, %eax
+ testb %al, %al
jz .Lsyscall_32_done
STACKLEAK_ERASE
@@ -885,6 +885,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
BUG_IF_WRONG_CR3 no_user_check=1
popfl
popl %eax
+ CLEAR_CPU_BUFFERS
/*
* Return back to the vDSO, which will pop ecx and edx.
@@ -954,6 +955,7 @@ restore_all_switch_stack:
/* Restore user state */
RESTORE_REGS pop=4 # skip orig_eax/error_code
+ CLEAR_CPU_BUFFERS
.Lirq_return:
/*
* ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
@@ -1146,6 +1148,7 @@ SYM_CODE_START(asm_exc_nmi)
/* Not on SYSENTER stack. */
call exc_nmi
+ CLEAR_CPU_BUFFERS
jmp .Lnmi_return
.Lnmi_from_sysenter_stack:
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 43606de22..e986331b1 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -18,6 +18,7 @@
* - SYM_FUNC_START/END:Define functions in the symbol table.
* - idtentry: Define exception entry points.
*/
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
@@ -34,7 +35,6 @@
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
-#include <asm/export.h>
#include <asm/frame.h>
#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
@@ -116,6 +116,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
/* clobbers %rax, make sure it is after saving the syscall nr */
IBRS_ENTER
UNTRAIN_RET
+ CLEAR_BRANCH_HISTORY
call do_syscall_64 /* returns with IRQs disabled */
@@ -126,70 +127,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* In the Xen PV case we must use iret anyway.
*/
- ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
- X86_FEATURE_XENPV
-
- movq RCX(%rsp), %rcx
- movq RIP(%rsp), %r11
-
- cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP.
- *
- * If width of "canonical tail" ever becomes variable, this will need
- * to be updated to remain correct on both old and new CPUs.
- *
- * Change top bits to match most significant bit (47th or 56th bit
- * depending on paging mode) in the address.
- */
-#ifdef CONFIG_X86_5LEVEL
- ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
- "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
-#else
- shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
- sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-#endif
-
- /* If this changed %rcx, it was not canonical */
- cmpq %rcx, %r11
- jne swapgs_restore_regs_and_return_to_usermode
-
- cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
-
- movq R11(%rsp), %r11
- cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
- * restore RF properly. If the slowpath sets it for whatever reason, we
- * need to restore it correctly.
- *
- * SYSRET can restore TF, but unlike IRET, restoring TF results in a
- * trap from userspace immediately after SYSRET. This would cause an
- * infinite loop whenever #DB happens with register state that satisfies
- * the opportunistic SYSRET conditions. For example, single-stepping
- * this user code:
- *
- * movq $stuck_here, %rcx
- * pushfq
- * popq %r11
- * stuck_here:
- *
- * would never get past 'stuck_here'.
- */
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz swapgs_restore_regs_and_return_to_usermode
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
+ ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
/*
* We win! This label is here just for ease of understanding
@@ -223,6 +162,7 @@ syscall_return_via_sysret:
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
swapgs
+ CLEAR_CPU_BUFFERS
sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
@@ -621,17 +561,28 @@ __irqentry_text_end:
SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
IBRS_EXIT
-#ifdef CONFIG_DEBUG_ENTRY
- /* Assert that pt_regs indicates user mode. */
- testb $3, CS(%rsp)
- jnz 1f
- ud2
-1:
-#endif
#ifdef CONFIG_XEN_PV
ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI
+#endif
+ STACKLEAK_ERASE
+ POP_REGS
+ add $8, %rsp /* orig_ax */
+ UNWIND_HINT_IRET_REGS
+
+.Lswapgs_and_iret:
+ swapgs
+ CLEAR_CPU_BUFFERS
+ /* Assert that the IRET frame indicates user mode. */
+ testb $3, 8(%rsp)
+ jnz .Lnative_iret
+ ud2
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+.Lpti_restore_regs_and_return_to_usermode:
POP_REGS pop_rdi=0
/*
@@ -658,13 +609,14 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
*/
STACKLEAK_ERASE_NOCLOBBER
- SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+ push %rax
+ SWITCH_TO_USER_CR3 scratch_reg=%rdi scratch_reg2=%rax
+ pop %rax
/* Restore RDI. */
popq %rdi
- swapgs
- jmp .Lnative_iret
-
+ jmp .Lswapgs_and_iret
+#endif
SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
@@ -774,6 +726,8 @@ native_irq_return_ldt:
*/
popq %rax /* Restore user RAX */
+ CLEAR_CPU_BUFFERS
+
/*
* RSP now points to an ordinary IRET frame, except that the page
* is read-only and RSP[31:16] are preloaded with the userspace
@@ -1163,8 +1117,8 @@ SYM_CODE_START(asm_exc_nmi)
* anyway.
*
* To handle this case we do the following:
- * Check the a special location on the stack that contains
- * a variable that is set when NMIs are executing.
+ * Check a special location on the stack that contains a
+ * variable that is set when NMIs are executing.
* The interrupted task's stack is also checked to see if it
* is an NMI stack.
* If the variable is not set and the stack is not the NMI
@@ -1237,7 +1191,6 @@ SYM_CODE_START(asm_exc_nmi)
*/
movq %rsp, %rdi
- movq $-1, %rsi
call exc_nmi
/*
@@ -1295,8 +1248,8 @@ SYM_CODE_START(asm_exc_nmi)
* end_repeat_nmi, then we are a nested NMI. We must not
* modify the "iret" frame because it's being written by
* the outer NMI. That's okay; the outer NMI handler is
- * about to about to call exc_nmi() anyway, so we can just
- * resume the outer NMI.
+ * about to call exc_nmi() anyway, so we can just resume
+ * the outer NMI.
*/
movq $repeat_nmi, %rdx
@@ -1451,7 +1404,6 @@ end_repeat_nmi:
UNWIND_HINT_REGS
movq %rsp, %rdi
- movq $-1, %rsi
call exc_nmi
/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
@@ -1503,6 +1455,12 @@ nmi_restore:
movq $0, 5*8(%rsp) /* clear "NMI executing" */
/*
+ * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like
+ * NMI in kernel after user state is restored. For an unprivileged user
+ * these conditions are hard to meet.
+ */
+
+ /*
* iretq reads the "iret" frame and exits the NMI stack in a
* single instruction. We are returning to kernel mode, so this
* cannot result in a fault. Similarly, we don't need to worry
@@ -1511,18 +1469,17 @@ nmi_restore:
iretq
SYM_CODE_END(asm_exc_nmi)
-#ifndef CONFIG_IA32_EMULATION
/*
* This handles SYSCALL from 32-bit code. There is no way to program
* MSRs to fully disable 32-bit SYSCALL.
*/
-SYM_CODE_START(ignore_sysret)
+SYM_CODE_START(entry_SYSCALL32_ignore)
UNWIND_HINT_END_OF_STACK
ENDBR
mov $-ENOSYS, %eax
+ CLEAR_CPU_BUFFERS
sysretl
-SYM_CODE_END(ignore_sysret)
-#endif
+SYM_CODE_END(entry_SYSCALL32_ignore)
.pushsection .text, "ax"
__FUNC_ALIGN
@@ -1538,3 +1495,63 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead)
call make_task_dead
SYM_CODE_END(rewind_stack_and_make_dead)
.popsection
+
+/*
+ * This sequence executes branches in order to remove user branch information
+ * from the branch history tracker in the Branch Predictor, therefore removing
+ * user influence on subsequent BTB lookups.
+ *
+ * It should be used on parts prior to Alder Lake. Newer parts should use the
+ * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being
+ * virtualized on newer hardware the VMM should protect against BHI attacks by
+ * setting BHI_DIS_S for the guests.
+ *
+ * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging
+ * and not clearing the branch history. The call tree looks like:
+ *
+ * call 1
+ * call 2
+ * call 2
+ * call 2
+ * call 2
+ * call 2
+ * ret
+ * ret
+ * ret
+ * ret
+ * ret
+ * ret
+ *
+ * This means that the stack is non-constant and ORC can't unwind it with %rsp
+ * alone. Therefore we unconditionally set up the frame pointer, which allows
+ * ORC to unwind properly.
+ *
+ * The alignment is for performance and not for safety, and may be safely
+ * refactored in the future if needed.
+ */
+SYM_FUNC_START(clear_bhb_loop)
+ push %rbp
+ mov %rsp, %rbp
+ movl $5, %ecx
+ ANNOTATE_INTRA_FUNCTION_CALL
+ call 1f
+ jmp 5f
+ .align 64, 0xcc
+ ANNOTATE_INTRA_FUNCTION_CALL
+1: call 2f
+ RET
+ .align 64, 0xcc
+2: movl $5, %eax
+3: jmp 4f
+ nop
+4: sub $1, %eax
+ jnz 3b
+ sub $1, %ecx
+ jnz 1b
+ RET
+5: lfence
+ pop %rbp
+ RET
+SYM_FUNC_END(clear_bhb_loop)
+EXPORT_SYMBOL_GPL(clear_bhb_loop)
+STACK_FRAME_NON_STANDARD(clear_bhb_loop)
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 4e88f8438..c779046cc 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -92,6 +92,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
IBRS_ENTER
UNTRAIN_RET
+ CLEAR_BRANCH_HISTORY
/*
* SYSENTER doesn't filter flags, so we need to clear NT and AC
@@ -118,9 +119,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
movq %rsp, %rdi
call do_SYSENTER_32
- /* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
- "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
jmp sysret32_from_system_call
.Lsysenter_fix_flags:
@@ -209,16 +207,19 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
IBRS_ENTER
UNTRAIN_RET
+ CLEAR_BRANCH_HISTORY
movq %rsp, %rdi
call do_fast_syscall_32
+
+sysret32_from_system_call:
/* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
+ ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
- /* Opportunistic SYSRET */
-sysret32_from_system_call:
/*
+ * Opportunistic SYSRET
+ *
* We are not going to return to userspace from the trampoline
* stack. So let's erase the thread stack right now.
*/
@@ -271,8 +272,23 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL)
xorl %r9d, %r9d
xorl %r10d, %r10d
swapgs
+ CLEAR_CPU_BUFFERS
sysretl
SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
int3
SYM_CODE_END(entry_SYSCALL_compat)
+
+/*
+ * int 0x80 is used by 32 bit mode as a system call entry. Normally idt entries
+ * point to C routines, however since this is a system call interface the branch
+ * history needs to be scrubbed to protect against BHI attacks, and that
+ * scrubbing needs to take place in assembly code prior to entering any C
+ * routines.
+ */
+SYM_CODE_START(int80_emulation)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ CLEAR_BRANCH_HISTORY
+ jmp do_int80_emulation
+SYM_CODE_END(int80_emulation)
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 8cfc9bc73..c2235bae1 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -18,8 +18,25 @@
#include <asm/syscalls_32.h>
#undef __SYSCALL
+/*
+ * The sys_call_table[] is no longer used for system calls, but
+ * kernel/trace/trace_syscalls.c still wants to know the system
+ * call address.
+ */
+#ifdef CONFIG_X86_32
#define __SYSCALL(nr, sym) __ia32_##sym,
-
-__visible const sys_call_ptr_t ia32_sys_call_table[] = {
+const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_32.h>
};
+#undef __SYSCALL
+#endif
+
+#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs);
+
+long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_32.h>
+ default: return __ia32_sys_ni_syscall(regs);
+ }
+};
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index be120eec1..33b3f09e6 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -11,8 +11,23 @@
#include <asm/syscalls_64.h>
#undef __SYSCALL
+/*
+ * The sys_call_table[] is no longer used for system calls, but
+ * kernel/trace/trace_syscalls.c still wants to know the system
+ * call address.
+ */
#define __SYSCALL(nr, sym) __x64_##sym,
-
-asmlinkage const sys_call_ptr_t sys_call_table[] = {
+const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
+#undef __SYSCALL
+
+#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
+
+long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_64.h>
+ default: return __x64_sys_ni_syscall(regs);
+ }
+};
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index bdd0e03a1..03de4a932 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -11,8 +11,12 @@
#include <asm/syscalls_x32.h>
#undef __SYSCALL
-#define __SYSCALL(nr, sym) __x64_##sym,
+#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
-asmlinkage const sys_call_ptr_t x32_sys_call_table[] = {
-#include <asm/syscalls_x32.h>
+long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_x32.h>
+ default: return __x64_sys_ni_syscall(regs);
+ }
};
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 2d0b1bd86..5f8591ce7 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -264,7 +264,7 @@
250 i386 fadvise64 sys_ia32_fadvise64
# 251 is available for reuse (was briefly sys_set_zone_reclaim)
252 i386 exit_group sys_exit_group
-253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie
+253 i386 lookup_dcookie
254 i386 epoll_create sys_epoll_create
255 i386 epoll_ctl sys_epoll_ctl
256 i386 epoll_wait sys_epoll_wait
@@ -457,3 +457,12 @@
450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
451 i386 cachestat sys_cachestat
452 i386 fchmodat2 sys_fchmodat2
+453 i386 map_shadow_stack sys_map_shadow_stack
+454 i386 futex_wake sys_futex_wake
+455 i386 futex_wait sys_futex_wait
+456 i386 futex_requeue sys_futex_requeue
+457 i386 statmount sys_statmount
+458 i386 listmount sys_listmount
+459 i386 lsm_get_self_attr sys_lsm_get_self_attr
+460 i386 lsm_set_self_attr sys_lsm_set_self_attr
+461 i386 lsm_list_modules sys_lsm_list_modules
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1d6eee30e..7e8d46f41 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -220,7 +220,7 @@
209 64 io_submit sys_io_submit
210 common io_cancel sys_io_cancel
211 64 get_thread_area
-212 common lookup_dcookie sys_lookup_dcookie
+212 common lookup_dcookie
213 common epoll_create sys_epoll_create
214 64 epoll_ctl_old
215 64 epoll_wait_old
@@ -375,6 +375,14 @@
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
453 64 map_shadow_stack sys_map_shadow_stack
+454 common futex_wake sys_futex_wake
+455 common futex_wait sys_futex_wait
+456 common futex_requeue sys_futex_requeue
+457 common statmount sys_statmount
+458 common listmount sys_listmount
+459 common lsm_get_self_attr sys_lsm_get_self_attr
+460 common lsm_set_self_attr sys_lsm_set_self_attr
+461 common lsm_list_modules sys_lsm_list_modules
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
index ff6e7003d..0103e103a 100644
--- a/arch/x86/entry/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -4,9 +4,9 @@
* Copyright 2008 by Steven Rostedt, Red Hat, Inc
* (inspired by Andi Kleen's thunk_64.S)
*/
+ #include <linux/export.h>
#include <linux/linkage.h>
#include <asm/asm.h>
- #include <asm/export.h>
/* put return address in eax (arg1) */
.macro THUNK name, func, put_ret_addr_in_eax=0
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index 27b5da211..416b400f3 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -4,10 +4,10 @@
* disturbance of register allocation in some inline assembly constructs.
* Copyright 2001,2002 by Andi Kleen, SuSE Labs.
*/
+#include <linux/export.h>
#include <linux/linkage.h>
#include "calling.h"
#include <asm/asm.h>
-#include <asm/export.h>
/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
.macro THUNK name, func
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 6a1821bd7..73ac5adb0 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -34,15 +34,20 @@ obj-y += vma.o extable.o
KASAN_SANITIZE_vma.o := y
UBSAN_SANITIZE_vma.o := y
KCSAN_SANITIZE_vma.o := y
-OBJECT_FILES_NON_STANDARD_vma.o := n
-OBJECT_FILES_NON_STANDARD_extable.o := n
+
+OBJECT_FILES_NON_STANDARD_extable.o := n
+OBJECT_FILES_NON_STANDARD_vdso-image-32.o := n
+OBJECT_FILES_NON_STANDARD_vdso-image-x32.o := n
+OBJECT_FILES_NON_STANDARD_vdso-image-64.o := n
+OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n
+OBJECT_FILES_NON_STANDARD_vma.o := n
# vDSO images to build
vdso_img-$(VDSO64-y) += 64
vdso_img-$(VDSOX32-y) += x32
vdso_img-$(VDSO32-y) += 32
-obj-$(VDSO32-y) += vdso32-setup.o
+obj-$(VDSO32-y) += vdso32-setup.o
vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F)
@@ -190,31 +195,4 @@ GCOV_PROFILE := n
quiet_cmd_vdso_and_check = VDSO $@
cmd_vdso_and_check = $(cmd_vdso); $(cmd_vdso_check)
-#
-# Install the unstripped copies of vdso*.so. If our toolchain supports
-# build-id, install .build-id links as well.
-#
-quiet_cmd_vdso_install = INSTALL $(@:install_%=%)
-define cmd_vdso_install
- cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \
- if readelf -n $< |grep -q 'Build ID'; then \
- buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \
- first=`echo $$buildid | cut -b-2`; \
- last=`echo $$buildid | cut -b3-`; \
- mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \
- ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \
- fi
-endef
-
-vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
-
-$(MODLIB)/vdso: FORCE
- @mkdir -p $(MODLIB)/vdso
-
-$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso
- $(call cmd,vdso_install)
-
-PHONY += vdso_install $(vdso_img_insttargets)
-vdso_install: $(vdso_img_insttargets)
-
clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so*
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 7d70935b6..0debc194b 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -11,12 +11,10 @@
#include <linux/time.h>
#include <linux/kernel.h>
#include <linux/types.h>
+#include <vdso/gettime.h>
#include "../../../../lib/vdso/gettimeofday.c"
-extern int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
-extern __kernel_old_time_t __vdso_time(__kernel_old_time_t *t);
-
int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
{
return __cvdso_gettimeofday(tv, tz);
@@ -35,9 +33,6 @@ __kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__v
#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
/* both 64-bit and x32 use these */
-extern int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts);
-extern int __vdso_clock_getres(clockid_t clock, struct __kernel_timespec *res);
-
int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts)
{
return __cvdso_clock_gettime(clock, ts);
@@ -56,9 +51,6 @@ int clock_getres(clockid_t, struct __kernel_timespec *)
#else
/* i386 only */
-extern int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts);
-extern int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res);
-
int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts)
{
return __cvdso_clock_gettime32(clock, ts);
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index f3b3cacbc..76e4e74f3 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -67,7 +67,6 @@ static struct ctl_table abi_table2[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- {}
};
static __init int ia32_binfmt_init(void)
diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
index d77d278ee..37a3d4c02 100644
--- a/arch/x86/entry/vdso/vsgx.S
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -1,7 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
-#include <asm/export.h>
#include <asm/errno.h>
#include <asm/enclu.h>
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index e0ca8120a..1245000a8 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
- /*
- * XXX: if access_ok, get_user, and put_user handled
- * sig_on_uaccess_err, this could go away.
- */
-
if (!access_ok((void __user *)ptr, size)) {
struct thread_struct *thread = &current->thread;
@@ -120,10 +115,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
bool emulate_vsyscall(unsigned long error_code,
struct pt_regs *regs, unsigned long address)
{
- struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr, syscall_nr, tmp;
- int prev_sig_on_uaccess_err;
long ret;
unsigned long orig_dx;
@@ -172,8 +165,6 @@ bool emulate_vsyscall(unsigned long error_code,
goto sigsegv;
}
- tsk = current;
-
/*
* Check for access_ok violations and find the syscall nr.
*
@@ -234,12 +225,8 @@ bool emulate_vsyscall(unsigned long error_code,
goto do_ret; /* skip requested */
/*
- * With a real vsyscall, page faults cause SIGSEGV. We want to
- * preserve that behavior to make writing exploits harder.
+ * With a real vsyscall, page faults cause SIGSEGV.
*/
- prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
- current->thread.sig_on_uaccess_err = 1;
-
ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
@@ -262,23 +249,12 @@ bool emulate_vsyscall(unsigned long error_code,
break;
}
- current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
-
check_fault:
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall fault (exploit attempt?)");
-
- /*
- * If we failed to generate a signal for any reason,
- * generate one here. (This should be impossible.)
- */
- if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
- !sigismember(&tsk->pending.signal, SIGSEGV)))
- goto sigsegv;
-
- return true; /* Don't emulate the ret. */
+ goto sigsegv;
}
regs->ax = ret;
diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c
index ed3087192..780acd3df 100644
--- a/arch/x86/events/amd/brs.c
+++ b/arch/x86/events/amd/brs.c
@@ -125,7 +125,7 @@ int amd_brs_hw_config(struct perf_event *event)
* Where X is the number of taken branches due to interrupt
* skid. Skid is large.
*
- * Where Y is the occurences of the event while BRS is
+ * Where Y is the occurrences of the event while BRS is
* capturing the lbr_nr entries.
*
* By using retired taken branches, we limit the impact on the
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index e24976593..eb5c8539d 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -250,7 +250,7 @@ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] =
/*
* AMD Performance Monitor Family 17h and later:
*/
-static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] =
+static const u64 amd_zen1_perfmon_event_map[PERF_COUNT_HW_MAX] =
{
[PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
@@ -262,10 +262,24 @@ static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] =
[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187,
};
+static const u64 amd_zen2_perfmon_event_map[PERF_COUNT_HW_MAX] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0964,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a9,
+};
+
static u64 amd_pmu_event_map(int hw_event)
{
- if (boot_cpu_data.x86 >= 0x17)
- return amd_f17h_perfmon_event_map[hw_event];
+ if (cpu_feature_enabled(X86_FEATURE_ZEN2) || boot_cpu_data.x86 >= 0x19)
+ return amd_zen2_perfmon_event_map[hw_event];
+
+ if (cpu_feature_enabled(X86_FEATURE_ZEN1))
+ return amd_zen1_perfmon_event_map[hw_event];
return amd_perfmon_event_map[hw_event];
}
@@ -604,7 +618,6 @@ static void amd_pmu_cpu_dead(int cpu)
kfree(cpuhw->lbr_sel);
cpuhw->lbr_sel = NULL;
- amd_pmu_cpu_reset(cpu);
if (!x86_pmu.amd_nb_constraints)
return;
@@ -940,7 +953,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
continue;
if (has_branch_stack(event))
- perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
+ perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
@@ -1184,7 +1197,7 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc,
* period of each one and given that the BRS saturates, it would not be possible
* to guarantee correlated content for all events. Therefore, in situations
* where multiple events want to use BRS, the kernel enforces mutual exclusion.
- * Exclusion is enforced by chosing only one counter for events using BRS.
+ * Exclusion is enforced by choosing only one counter for events using BRS.
* The event scheduling logic will then automatically multiplex the
* events and ensure that at most one event is actively using BRS.
*
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 6911c5399..e91970b01 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -287,6 +287,9 @@ static int perf_ibs_init(struct perf_event *event)
if (config & ~perf_ibs->config_mask)
return -EINVAL;
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
ret = validate_group(event);
if (ret)
return ret;
diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
index eb31f8508..4a1e60031 100644
--- a/arch/x86/events/amd/lbr.c
+++ b/arch/x86/events/amd/lbr.c
@@ -173,9 +173,11 @@ void amd_pmu_lbr_read(void)
/*
* Check if a branch has been logged; if valid = 0, spec = 0
- * then no branch was recorded
+ * then no branch was recorded; if reserved = 1 then an
+ * erroneous branch was recorded (see Erratum 1452)
*/
- if (!entry.to.split.valid && !entry.to.split.spec)
+ if ((!entry.to.split.valid && !entry.to.split.spec) ||
+ entry.to.split.reserved)
continue;
perf_clear_branch_entry_bitfields(br + out);
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 83f15fe41..5bf03c575 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -26,57 +26,66 @@
#define RDPMC_BASE_LLC 10
#define COUNTER_SHIFT 16
+#define UNCORE_NAME_LEN 16
+#define UNCORE_GROUP_MAX 256
#undef pr_fmt
#define pr_fmt(fmt) "amd_uncore: " fmt
static int pmu_version;
-static int num_counters_llc;
-static int num_counters_nb;
-static bool l3_mask;
-static HLIST_HEAD(uncore_unused_list);
-
-struct amd_uncore {
- int id;
+struct amd_uncore_ctx {
int refcnt;
int cpu;
+ struct perf_event **events;
+ struct hlist_node node;
+};
+
+struct amd_uncore_pmu {
+ char name[UNCORE_NAME_LEN];
int num_counters;
int rdpmc_base;
u32 msr_base;
- cpumask_t *active_mask;
- struct pmu *pmu;
- struct perf_event **events;
- struct hlist_node node;
+ int group;
+ cpumask_t active_mask;
+ struct pmu pmu;
+ struct amd_uncore_ctx * __percpu *ctx;
};
-static struct amd_uncore * __percpu *amd_uncore_nb;
-static struct amd_uncore * __percpu *amd_uncore_llc;
+enum {
+ UNCORE_TYPE_DF,
+ UNCORE_TYPE_L3,
+ UNCORE_TYPE_UMC,
-static struct pmu amd_nb_pmu;
-static struct pmu amd_llc_pmu;
+ UNCORE_TYPE_MAX
+};
-static cpumask_t amd_nb_active_mask;
-static cpumask_t amd_llc_active_mask;
+union amd_uncore_info {
+ struct {
+ u64 aux_data:32; /* auxiliary data */
+ u64 num_pmcs:8; /* number of counters */
+ u64 gid:8; /* group id */
+ u64 cid:8; /* context id */
+ } split;
+ u64 full;
+};
-static bool is_nb_event(struct perf_event *event)
-{
- return event->pmu->type == amd_nb_pmu.type;
-}
+struct amd_uncore {
+ union amd_uncore_info * __percpu info;
+ struct amd_uncore_pmu *pmus;
+ unsigned int num_pmus;
+ bool init_done;
+ void (*scan)(struct amd_uncore *uncore, unsigned int cpu);
+ int (*init)(struct amd_uncore *uncore, unsigned int cpu);
+ void (*move)(struct amd_uncore *uncore, unsigned int cpu);
+ void (*free)(struct amd_uncore *uncore, unsigned int cpu);
+};
-static bool is_llc_event(struct perf_event *event)
-{
- return event->pmu->type == amd_llc_pmu.type;
-}
+static struct amd_uncore uncores[UNCORE_TYPE_MAX];
-static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
+static struct amd_uncore_pmu *event_to_amd_uncore_pmu(struct perf_event *event)
{
- if (is_nb_event(event) && amd_uncore_nb)
- return *per_cpu_ptr(amd_uncore_nb, event->cpu);
- else if (is_llc_event(event) && amd_uncore_llc)
- return *per_cpu_ptr(amd_uncore_llc, event->cpu);
-
- return NULL;
+ return container_of(event->pmu, struct amd_uncore_pmu, pmu);
}
static void amd_uncore_read(struct perf_event *event)
@@ -91,7 +100,16 @@ static void amd_uncore_read(struct perf_event *event)
*/
prev = local64_read(&hwc->prev_count);
- rdpmcl(hwc->event_base_rdpmc, new);
+
+ /*
+ * Some uncore PMUs do not have RDPMC assignments. In such cases,
+ * read counts directly from the corresponding PERF_CTR.
+ */
+ if (hwc->event_base_rdpmc < 0)
+ rdmsrl(hwc->event_base, new);
+ else
+ rdpmcl(hwc->event_base_rdpmc, new);
+
local64_set(&hwc->prev_count, new);
delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
delta >>= COUNTER_SHIFT;
@@ -118,7 +136,7 @@ static void amd_uncore_stop(struct perf_event *event, int flags)
hwc->state |= PERF_HES_STOPPED;
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
- amd_uncore_read(event);
+ event->pmu->read(event);
hwc->state |= PERF_HES_UPTODATE;
}
}
@@ -126,15 +144,16 @@ static void amd_uncore_stop(struct perf_event *event, int flags)
static int amd_uncore_add(struct perf_event *event, int flags)
{
int i;
- struct amd_uncore *uncore = event_to_amd_uncore(event);
+ struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event);
+ struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu);
struct hw_perf_event *hwc = &event->hw;
/* are we already assigned? */
- if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
+ if (hwc->idx != -1 && ctx->events[hwc->idx] == event)
goto out;
- for (i = 0; i < uncore->num_counters; i++) {
- if (uncore->events[i] == event) {
+ for (i = 0; i < pmu->num_counters; i++) {
+ if (ctx->events[i] == event) {
hwc->idx = i;
goto out;
}
@@ -142,8 +161,8 @@ static int amd_uncore_add(struct perf_event *event, int flags)
/* if not, take the first available counter */
hwc->idx = -1;
- for (i = 0; i < uncore->num_counters; i++) {
- if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
+ for (i = 0; i < pmu->num_counters; i++) {
+ if (cmpxchg(&ctx->events[i], NULL, event) == NULL) {
hwc->idx = i;
break;
}
@@ -153,23 +172,16 @@ out:
if (hwc->idx == -1)
return -EBUSY;
- hwc->config_base = uncore->msr_base + (2 * hwc->idx);
- hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
- hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
+ hwc->config_base = pmu->msr_base + (2 * hwc->idx);
+ hwc->event_base = pmu->msr_base + 1 + (2 * hwc->idx);
+ hwc->event_base_rdpmc = pmu->rdpmc_base + hwc->idx;
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
- /*
- * The first four DF counters are accessible via RDPMC index 6 to 9
- * followed by the L3 counters from index 10 to 15. For processors
- * with more than four DF counters, the DF RDPMC assignments become
- * discontiguous as the additional counters are accessible starting
- * from index 16.
- */
- if (is_nb_event(event) && hwc->idx >= NUM_COUNTERS_NB)
- hwc->event_base_rdpmc += NUM_COUNTERS_L3;
+ if (pmu->rdpmc_base < 0)
+ hwc->event_base_rdpmc = -1;
if (flags & PERF_EF_START)
- amd_uncore_start(event, PERF_EF_RELOAD);
+ event->pmu->start(event, PERF_EF_RELOAD);
return 0;
}
@@ -177,55 +189,36 @@ out:
static void amd_uncore_del(struct perf_event *event, int flags)
{
int i;
- struct amd_uncore *uncore = event_to_amd_uncore(event);
+ struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event);
+ struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu);
struct hw_perf_event *hwc = &event->hw;
- amd_uncore_stop(event, PERF_EF_UPDATE);
+ event->pmu->stop(event, PERF_EF_UPDATE);
- for (i = 0; i < uncore->num_counters; i++) {
- if (cmpxchg(&uncore->events[i], event, NULL) == event)
+ for (i = 0; i < pmu->num_counters; i++) {
+ if (cmpxchg(&ctx->events[i], event, NULL) == event)
break;
}
hwc->idx = -1;
}
-/*
- * Return a full thread and slice mask unless user
- * has provided them
- */
-static u64 l3_thread_slice_mask(u64 config)
-{
- if (boot_cpu_data.x86 <= 0x18)
- return ((config & AMD64_L3_SLICE_MASK) ? : AMD64_L3_SLICE_MASK) |
- ((config & AMD64_L3_THREAD_MASK) ? : AMD64_L3_THREAD_MASK);
-
- /*
- * If the user doesn't specify a threadmask, they're not trying to
- * count core 0, so we enable all cores & threads.
- * We'll also assume that they want to count slice 0 if they specify
- * a threadmask and leave sliceid and enallslices unpopulated.
- */
- if (!(config & AMD64_L3_F19H_THREAD_MASK))
- return AMD64_L3_F19H_THREAD_MASK | AMD64_L3_EN_ALL_SLICES |
- AMD64_L3_EN_ALL_CORES;
-
- return config & (AMD64_L3_F19H_THREAD_MASK | AMD64_L3_SLICEID_MASK |
- AMD64_L3_EN_ALL_CORES | AMD64_L3_EN_ALL_SLICES |
- AMD64_L3_COREID_MASK);
-}
-
static int amd_uncore_event_init(struct perf_event *event)
{
- struct amd_uncore *uncore;
+ struct amd_uncore_pmu *pmu;
+ struct amd_uncore_ctx *ctx;
struct hw_perf_event *hwc = &event->hw;
- u64 event_mask = AMD64_RAW_EVENT_MASK_NB;
if (event->attr.type != event->pmu->type)
return -ENOENT;
- if (pmu_version >= 2 && is_nb_event(event))
- event_mask = AMD64_PERFMON_V2_RAW_EVENT_MASK_NB;
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ pmu = event_to_amd_uncore_pmu(event);
+ ctx = *per_cpu_ptr(pmu->ctx, event->cpu);
+ if (!ctx)
+ return -ENODEV;
/*
* NB and Last level cache counters (MSRs) are shared across all cores
@@ -235,28 +228,14 @@ static int amd_uncore_event_init(struct perf_event *event)
* out. So we do not support sampling and per-thread events via
* CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts:
*/
- hwc->config = event->attr.config & event_mask;
+ hwc->config = event->attr.config;
hwc->idx = -1;
- if (event->cpu < 0)
- return -EINVAL;
-
- /*
- * SliceMask and ThreadMask need to be set for certain L3 events.
- * For other events, the two fields do not affect the count.
- */
- if (l3_mask && is_llc_event(event))
- hwc->config |= l3_thread_slice_mask(event->attr.config);
-
- uncore = event_to_amd_uncore(event);
- if (!uncore)
- return -ENODEV;
-
/*
* since request can come in to any of the shared cores, we will remap
* to a single common cpu.
*/
- event->cpu = uncore->cpu;
+ event->cpu = ctx->cpu;
return 0;
}
@@ -278,17 +257,10 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- cpumask_t *active_mask;
- struct pmu *pmu = dev_get_drvdata(dev);
+ struct pmu *ptr = dev_get_drvdata(dev);
+ struct amd_uncore_pmu *pmu = container_of(ptr, struct amd_uncore_pmu, pmu);
- if (pmu->type == amd_nb_pmu.type)
- active_mask = &amd_nb_active_mask;
- else if (pmu->type == amd_llc_pmu.type)
- active_mask = &amd_llc_active_mask;
- else
- return 0;
-
- return cpumap_print_to_pagebuf(true, buf, active_mask);
+ return cpumap_print_to_pagebuf(true, buf, &pmu->active_mask);
}
static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
@@ -315,7 +287,7 @@ static struct device_attribute format_attr_##_var = \
DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35");
DEFINE_UNCORE_FORMAT_ATTR(event14, event, "config:0-7,32-35,59-60"); /* F17h+ DF */
DEFINE_UNCORE_FORMAT_ATTR(event14v2, event, "config:0-7,32-37"); /* PerfMonV2 DF */
-DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3 */
+DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3, PerfMonV2 UMC */
DEFINE_UNCORE_FORMAT_ATTR(umask8, umask, "config:8-15");
DEFINE_UNCORE_FORMAT_ATTR(umask12, umask, "config:8-15,24-27"); /* PerfMonV2 DF */
DEFINE_UNCORE_FORMAT_ATTR(coreid, coreid, "config:42-44"); /* F19h L3 */
@@ -325,6 +297,7 @@ DEFINE_UNCORE_FORMAT_ATTR(threadmask2, threadmask, "config:56-57"); /* F19h L
DEFINE_UNCORE_FORMAT_ATTR(enallslices, enallslices, "config:46"); /* F19h L3 */
DEFINE_UNCORE_FORMAT_ATTR(enallcores, enallcores, "config:47"); /* F19h L3 */
DEFINE_UNCORE_FORMAT_ATTR(sliceid, sliceid, "config:48-50"); /* F19h L3 */
+DEFINE_UNCORE_FORMAT_ATTR(rdwrmask, rdwrmask, "config:8-9"); /* PerfMonV2 UMC */
/* Common DF and NB attributes */
static struct attribute *amd_uncore_df_format_attr[] = {
@@ -341,6 +314,13 @@ static struct attribute *amd_uncore_l3_format_attr[] = {
NULL,
};
+/* Common UMC attributes */
+static struct attribute *amd_uncore_umc_format_attr[] = {
+ &format_attr_event8.attr, /* event */
+ &format_attr_rdwrmask.attr, /* rdwrmask */
+ NULL,
+};
+
/* F17h unique L3 attributes */
static struct attribute *amd_f17h_uncore_l3_format_attr[] = {
&format_attr_slicemask.attr, /* slicemask */
@@ -378,6 +358,11 @@ static struct attribute_group amd_f19h_uncore_l3_format_group = {
.is_visible = amd_f19h_uncore_is_visible,
};
+static struct attribute_group amd_uncore_umc_format_group = {
+ .name = "format",
+ .attrs = amd_uncore_umc_format_attr,
+};
+
static const struct attribute_group *amd_uncore_df_attr_groups[] = {
&amd_uncore_attr_group,
&amd_uncore_df_format_group,
@@ -396,259 +381,636 @@ static const struct attribute_group *amd_uncore_l3_attr_update[] = {
NULL,
};
-static struct pmu amd_nb_pmu = {
- .task_ctx_nr = perf_invalid_context,
- .attr_groups = amd_uncore_df_attr_groups,
- .name = "amd_nb",
- .event_init = amd_uncore_event_init,
- .add = amd_uncore_add,
- .del = amd_uncore_del,
- .start = amd_uncore_start,
- .stop = amd_uncore_stop,
- .read = amd_uncore_read,
- .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
- .module = THIS_MODULE,
+static const struct attribute_group *amd_uncore_umc_attr_groups[] = {
+ &amd_uncore_attr_group,
+ &amd_uncore_umc_format_group,
+ NULL,
};
-static struct pmu amd_llc_pmu = {
- .task_ctx_nr = perf_invalid_context,
- .attr_groups = amd_uncore_l3_attr_groups,
- .attr_update = amd_uncore_l3_attr_update,
- .name = "amd_l2",
- .event_init = amd_uncore_event_init,
- .add = amd_uncore_add,
- .del = amd_uncore_del,
- .start = amd_uncore_start,
- .stop = amd_uncore_stop,
- .read = amd_uncore_read,
- .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
- .module = THIS_MODULE,
-};
+static __always_inline
+int amd_uncore_ctx_cid(struct amd_uncore *uncore, unsigned int cpu)
+{
+ union amd_uncore_info *info = per_cpu_ptr(uncore->info, cpu);
+ return info->split.cid;
+}
-static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
+static __always_inline
+int amd_uncore_ctx_gid(struct amd_uncore *uncore, unsigned int cpu)
{
- return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
- cpu_to_node(cpu));
+ union amd_uncore_info *info = per_cpu_ptr(uncore->info, cpu);
+ return info->split.gid;
}
-static inline struct perf_event **
-amd_uncore_events_alloc(unsigned int num, unsigned int cpu)
+static __always_inline
+int amd_uncore_ctx_num_pmcs(struct amd_uncore *uncore, unsigned int cpu)
{
- return kzalloc_node(sizeof(struct perf_event *) * num, GFP_KERNEL,
- cpu_to_node(cpu));
+ union amd_uncore_info *info = per_cpu_ptr(uncore->info, cpu);
+ return info->split.num_pmcs;
}
-static int amd_uncore_cpu_up_prepare(unsigned int cpu)
+static void amd_uncore_ctx_free(struct amd_uncore *uncore, unsigned int cpu)
{
- struct amd_uncore *uncore_nb = NULL, *uncore_llc = NULL;
+ struct amd_uncore_pmu *pmu;
+ struct amd_uncore_ctx *ctx;
+ int i;
- if (amd_uncore_nb) {
- *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
- uncore_nb = amd_uncore_alloc(cpu);
- if (!uncore_nb)
- goto fail;
- uncore_nb->cpu = cpu;
- uncore_nb->num_counters = num_counters_nb;
- uncore_nb->rdpmc_base = RDPMC_BASE_NB;
- uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
- uncore_nb->active_mask = &amd_nb_active_mask;
- uncore_nb->pmu = &amd_nb_pmu;
- uncore_nb->events = amd_uncore_events_alloc(num_counters_nb, cpu);
- if (!uncore_nb->events)
- goto fail;
- uncore_nb->id = -1;
- *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
+ if (!uncore->init_done)
+ return;
+
+ for (i = 0; i < uncore->num_pmus; i++) {
+ pmu = &uncore->pmus[i];
+ ctx = *per_cpu_ptr(pmu->ctx, cpu);
+ if (!ctx)
+ continue;
+
+ if (cpu == ctx->cpu)
+ cpumask_clear_cpu(cpu, &pmu->active_mask);
+
+ if (!--ctx->refcnt) {
+ kfree(ctx->events);
+ kfree(ctx);
+ }
+
+ *per_cpu_ptr(pmu->ctx, cpu) = NULL;
}
+}
- if (amd_uncore_llc) {
- *per_cpu_ptr(amd_uncore_llc, cpu) = NULL;
- uncore_llc = amd_uncore_alloc(cpu);
- if (!uncore_llc)
- goto fail;
- uncore_llc->cpu = cpu;
- uncore_llc->num_counters = num_counters_llc;
- uncore_llc->rdpmc_base = RDPMC_BASE_LLC;
- uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL;
- uncore_llc->active_mask = &amd_llc_active_mask;
- uncore_llc->pmu = &amd_llc_pmu;
- uncore_llc->events = amd_uncore_events_alloc(num_counters_llc, cpu);
- if (!uncore_llc->events)
- goto fail;
- uncore_llc->id = -1;
- *per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc;
+static int amd_uncore_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
+{
+ struct amd_uncore_ctx *curr, *prev;
+ struct amd_uncore_pmu *pmu;
+ int node, cid, gid, i, j;
+
+ if (!uncore->init_done || !uncore->num_pmus)
+ return 0;
+
+ cid = amd_uncore_ctx_cid(uncore, cpu);
+ gid = amd_uncore_ctx_gid(uncore, cpu);
+
+ for (i = 0; i < uncore->num_pmus; i++) {
+ pmu = &uncore->pmus[i];
+ *per_cpu_ptr(pmu->ctx, cpu) = NULL;
+ curr = NULL;
+
+ /* Check for group exclusivity */
+ if (gid != pmu->group)
+ continue;
+
+ /* Find a sibling context */
+ for_each_online_cpu(j) {
+ if (cpu == j)
+ continue;
+
+ prev = *per_cpu_ptr(pmu->ctx, j);
+ if (!prev)
+ continue;
+
+ if (cid == amd_uncore_ctx_cid(uncore, j)) {
+ curr = prev;
+ break;
+ }
+ }
+
+ /* Allocate context if sibling does not exist */
+ if (!curr) {
+ node = cpu_to_node(cpu);
+ curr = kzalloc_node(sizeof(*curr), GFP_KERNEL, node);
+ if (!curr)
+ goto fail;
+
+ curr->cpu = cpu;
+ curr->events = kzalloc_node(sizeof(*curr->events) *
+ pmu->num_counters,
+ GFP_KERNEL, node);
+ if (!curr->events) {
+ kfree(curr);
+ goto fail;
+ }
+
+ cpumask_set_cpu(cpu, &pmu->active_mask);
+ }
+
+ curr->refcnt++;
+ *per_cpu_ptr(pmu->ctx, cpu) = curr;
}
return 0;
fail:
- if (uncore_nb) {
- kfree(uncore_nb->events);
- kfree(uncore_nb);
- }
-
- if (uncore_llc) {
- kfree(uncore_llc->events);
- kfree(uncore_llc);
- }
+ amd_uncore_ctx_free(uncore, cpu);
return -ENOMEM;
}
-static struct amd_uncore *
-amd_uncore_find_online_sibling(struct amd_uncore *this,
- struct amd_uncore * __percpu *uncores)
+static void amd_uncore_ctx_move(struct amd_uncore *uncore, unsigned int cpu)
{
- unsigned int cpu;
- struct amd_uncore *that;
-
- for_each_online_cpu(cpu) {
- that = *per_cpu_ptr(uncores, cpu);
+ struct amd_uncore_ctx *curr, *next;
+ struct amd_uncore_pmu *pmu;
+ int i, j;
- if (!that)
- continue;
+ if (!uncore->init_done)
+ return;
- if (this == that)
+ for (i = 0; i < uncore->num_pmus; i++) {
+ pmu = &uncore->pmus[i];
+ curr = *per_cpu_ptr(pmu->ctx, cpu);
+ if (!curr)
continue;
- if (this->id == that->id) {
- hlist_add_head(&this->node, &uncore_unused_list);
- this = that;
- break;
+ /* Migrate to a shared sibling if possible */
+ for_each_online_cpu(j) {
+ next = *per_cpu_ptr(pmu->ctx, j);
+ if (!next || cpu == j)
+ continue;
+
+ if (curr == next) {
+ perf_pmu_migrate_context(&pmu->pmu, cpu, j);
+ cpumask_clear_cpu(cpu, &pmu->active_mask);
+ cpumask_set_cpu(j, &pmu->active_mask);
+ next->cpu = j;
+ break;
+ }
}
}
-
- this->refcnt++;
- return this;
}
static int amd_uncore_cpu_starting(unsigned int cpu)
{
- unsigned int eax, ebx, ecx, edx;
struct amd_uncore *uncore;
+ int i;
+
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
+ uncore->scan(uncore, cpu);
+ }
+
+ return 0;
+}
- if (amd_uncore_nb) {
- uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
- cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- uncore->id = ecx & 0xff;
+static int amd_uncore_cpu_online(unsigned int cpu)
+{
+ struct amd_uncore *uncore;
+ int i;
- uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
- *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
+ if (uncore->init(uncore, cpu))
+ break;
}
- if (amd_uncore_llc) {
- uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
- uncore->id = get_llc_id(cpu);
+ return 0;
+}
- uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
- *per_cpu_ptr(amd_uncore_llc, cpu) = uncore;
+static int amd_uncore_cpu_down_prepare(unsigned int cpu)
+{
+ struct amd_uncore *uncore;
+ int i;
+
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
+ uncore->move(uncore, cpu);
}
return 0;
}
-static void uncore_clean_online(void)
+static int amd_uncore_cpu_dead(unsigned int cpu)
{
struct amd_uncore *uncore;
- struct hlist_node *n;
+ int i;
- hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) {
- hlist_del(&uncore->node);
- kfree(uncore->events);
- kfree(uncore);
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
+ uncore->free(uncore, cpu);
}
+
+ return 0;
}
-static void uncore_online(unsigned int cpu,
- struct amd_uncore * __percpu *uncores)
+static int amd_uncore_df_event_init(struct perf_event *event)
{
- struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = amd_uncore_event_init(event);
- uncore_clean_online();
+ if (ret || pmu_version < 2)
+ return ret;
- if (cpu == uncore->cpu)
- cpumask_set_cpu(cpu, uncore->active_mask);
+ hwc->config = event->attr.config &
+ (pmu_version >= 2 ? AMD64_PERFMON_V2_RAW_EVENT_MASK_NB :
+ AMD64_RAW_EVENT_MASK_NB);
+
+ return 0;
}
-static int amd_uncore_cpu_online(unsigned int cpu)
+static int amd_uncore_df_add(struct perf_event *event, int flags)
{
- if (amd_uncore_nb)
- uncore_online(cpu, amd_uncore_nb);
+ int ret = amd_uncore_add(event, flags & ~PERF_EF_START);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (ret)
+ return ret;
+
+ /*
+ * The first four DF counters are accessible via RDPMC index 6 to 9
+ * followed by the L3 counters from index 10 to 15. For processors
+ * with more than four DF counters, the DF RDPMC assignments become
+ * discontiguous as the additional counters are accessible starting
+ * from index 16.
+ */
+ if (hwc->idx >= NUM_COUNTERS_NB)
+ hwc->event_base_rdpmc += NUM_COUNTERS_L3;
- if (amd_uncore_llc)
- uncore_online(cpu, amd_uncore_llc);
+ /* Delayed start after rdpmc base update */
+ if (flags & PERF_EF_START)
+ amd_uncore_start(event, PERF_EF_RELOAD);
return 0;
}
-static void uncore_down_prepare(unsigned int cpu,
- struct amd_uncore * __percpu *uncores)
+static
+void amd_uncore_df_ctx_scan(struct amd_uncore *uncore, unsigned int cpu)
{
- unsigned int i;
- struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
+ union cpuid_0x80000022_ebx ebx;
+ union amd_uncore_info info;
- if (this->cpu != cpu)
+ if (!boot_cpu_has(X86_FEATURE_PERFCTR_NB))
return;
- /* this cpu is going down, migrate to a shared sibling if possible */
- for_each_online_cpu(i) {
- struct amd_uncore *that = *per_cpu_ptr(uncores, i);
+ info.split.aux_data = 0;
+ info.split.num_pmcs = NUM_COUNTERS_NB;
+ info.split.gid = 0;
+ info.split.cid = topology_die_id(cpu);
- if (cpu == i)
- continue;
+ if (pmu_version >= 2) {
+ ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);
+ info.split.num_pmcs = ebx.split.num_df_pmc;
+ }
- if (this == that) {
- perf_pmu_migrate_context(this->pmu, cpu, i);
- cpumask_clear_cpu(cpu, that->active_mask);
- cpumask_set_cpu(i, that->active_mask);
- that->cpu = i;
- break;
- }
+ *per_cpu_ptr(uncore->info, cpu) = info;
+}
+
+static
+int amd_uncore_df_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
+{
+ struct attribute **df_attr = amd_uncore_df_format_attr;
+ struct amd_uncore_pmu *pmu;
+
+ /* Run just once */
+ if (uncore->init_done)
+ return amd_uncore_ctx_init(uncore, cpu);
+
+ /* No grouping, single instance for a system */
+ uncore->pmus = kzalloc(sizeof(*uncore->pmus), GFP_KERNEL);
+ if (!uncore->pmus) {
+ uncore->num_pmus = 0;
+ goto done;
}
+
+ /*
+ * For Family 17h and above, the Northbridge counters are repurposed
+ * as Data Fabric counters. The PMUs are exported based on family as
+ * either NB or DF.
+ */
+ pmu = &uncore->pmus[0];
+ strscpy(pmu->name, boot_cpu_data.x86 >= 0x17 ? "amd_df" : "amd_nb",
+ sizeof(pmu->name));
+ pmu->num_counters = amd_uncore_ctx_num_pmcs(uncore, cpu);
+ pmu->msr_base = MSR_F15H_NB_PERF_CTL;
+ pmu->rdpmc_base = RDPMC_BASE_NB;
+ pmu->group = amd_uncore_ctx_gid(uncore, cpu);
+
+ if (pmu_version >= 2) {
+ *df_attr++ = &format_attr_event14v2.attr;
+ *df_attr++ = &format_attr_umask12.attr;
+ } else if (boot_cpu_data.x86 >= 0x17) {
+ *df_attr = &format_attr_event14.attr;
+ }
+
+ pmu->ctx = alloc_percpu(struct amd_uncore_ctx *);
+ if (!pmu->ctx)
+ goto done;
+
+ pmu->pmu = (struct pmu) {
+ .task_ctx_nr = perf_invalid_context,
+ .attr_groups = amd_uncore_df_attr_groups,
+ .name = pmu->name,
+ .event_init = amd_uncore_df_event_init,
+ .add = amd_uncore_df_add,
+ .del = amd_uncore_del,
+ .start = amd_uncore_start,
+ .stop = amd_uncore_stop,
+ .read = amd_uncore_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+ .module = THIS_MODULE,
+ };
+
+ if (perf_pmu_register(&pmu->pmu, pmu->pmu.name, -1)) {
+ free_percpu(pmu->ctx);
+ pmu->ctx = NULL;
+ goto done;
+ }
+
+ pr_info("%d %s%s counters detected\n", pmu->num_counters,
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON " : "",
+ pmu->pmu.name);
+
+ uncore->num_pmus = 1;
+
+done:
+ uncore->init_done = true;
+
+ return amd_uncore_ctx_init(uncore, cpu);
}
-static int amd_uncore_cpu_down_prepare(unsigned int cpu)
+static int amd_uncore_l3_event_init(struct perf_event *event)
{
- if (amd_uncore_nb)
- uncore_down_prepare(cpu, amd_uncore_nb);
+ int ret = amd_uncore_event_init(event);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 config = event->attr.config;
+ u64 mask;
- if (amd_uncore_llc)
- uncore_down_prepare(cpu, amd_uncore_llc);
+ hwc->config = config & AMD64_RAW_EVENT_MASK_NB;
+
+ /*
+ * SliceMask and ThreadMask need to be set for certain L3 events.
+ * For other events, the two fields do not affect the count.
+ */
+ if (ret || boot_cpu_data.x86 < 0x17)
+ return ret;
+
+ mask = config & (AMD64_L3_F19H_THREAD_MASK | AMD64_L3_SLICEID_MASK |
+ AMD64_L3_EN_ALL_CORES | AMD64_L3_EN_ALL_SLICES |
+ AMD64_L3_COREID_MASK);
+
+ if (boot_cpu_data.x86 <= 0x18)
+ mask = ((config & AMD64_L3_SLICE_MASK) ? : AMD64_L3_SLICE_MASK) |
+ ((config & AMD64_L3_THREAD_MASK) ? : AMD64_L3_THREAD_MASK);
+
+ /*
+ * If the user doesn't specify a ThreadMask, they're not trying to
+ * count core 0, so we enable all cores & threads.
+ * We'll also assume that they want to count slice 0 if they specify
+ * a ThreadMask and leave SliceId and EnAllSlices unpopulated.
+ */
+ else if (!(config & AMD64_L3_F19H_THREAD_MASK))
+ mask = AMD64_L3_F19H_THREAD_MASK | AMD64_L3_EN_ALL_SLICES |
+ AMD64_L3_EN_ALL_CORES;
+
+ hwc->config |= mask;
return 0;
}
-static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
+static
+void amd_uncore_l3_ctx_scan(struct amd_uncore *uncore, unsigned int cpu)
{
- struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+ union amd_uncore_info info;
- if (cpu == uncore->cpu)
- cpumask_clear_cpu(cpu, uncore->active_mask);
+ if (!boot_cpu_has(X86_FEATURE_PERFCTR_LLC))
+ return;
+
+ info.split.aux_data = 0;
+ info.split.num_pmcs = NUM_COUNTERS_L2;
+ info.split.gid = 0;
+ info.split.cid = per_cpu_llc_id(cpu);
- if (!--uncore->refcnt) {
- kfree(uncore->events);
- kfree(uncore);
+ if (boot_cpu_data.x86 >= 0x17)
+ info.split.num_pmcs = NUM_COUNTERS_L3;
+
+ *per_cpu_ptr(uncore->info, cpu) = info;
+}
+
+static
+int amd_uncore_l3_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
+{
+ struct attribute **l3_attr = amd_uncore_l3_format_attr;
+ struct amd_uncore_pmu *pmu;
+
+ /* Run just once */
+ if (uncore->init_done)
+ return amd_uncore_ctx_init(uncore, cpu);
+
+ /* No grouping, single instance for a system */
+ uncore->pmus = kzalloc(sizeof(*uncore->pmus), GFP_KERNEL);
+ if (!uncore->pmus) {
+ uncore->num_pmus = 0;
+ goto done;
}
- *per_cpu_ptr(uncores, cpu) = NULL;
+ /*
+ * For Family 17h and above, L3 cache counters are available instead
+ * of L2 cache counters. The PMUs are exported based on family as
+ * either L2 or L3.
+ */
+ pmu = &uncore->pmus[0];
+ strscpy(pmu->name, boot_cpu_data.x86 >= 0x17 ? "amd_l3" : "amd_l2",
+ sizeof(pmu->name));
+ pmu->num_counters = amd_uncore_ctx_num_pmcs(uncore, cpu);
+ pmu->msr_base = MSR_F16H_L2I_PERF_CTL;
+ pmu->rdpmc_base = RDPMC_BASE_LLC;
+ pmu->group = amd_uncore_ctx_gid(uncore, cpu);
+
+ if (boot_cpu_data.x86 >= 0x17) {
+ *l3_attr++ = &format_attr_event8.attr;
+ *l3_attr++ = &format_attr_umask8.attr;
+ *l3_attr++ = boot_cpu_data.x86 >= 0x19 ?
+ &format_attr_threadmask2.attr :
+ &format_attr_threadmask8.attr;
+ }
+
+ pmu->ctx = alloc_percpu(struct amd_uncore_ctx *);
+ if (!pmu->ctx)
+ goto done;
+
+ pmu->pmu = (struct pmu) {
+ .task_ctx_nr = perf_invalid_context,
+ .attr_groups = amd_uncore_l3_attr_groups,
+ .attr_update = amd_uncore_l3_attr_update,
+ .name = pmu->name,
+ .event_init = amd_uncore_l3_event_init,
+ .add = amd_uncore_add,
+ .del = amd_uncore_del,
+ .start = amd_uncore_start,
+ .stop = amd_uncore_stop,
+ .read = amd_uncore_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+ .module = THIS_MODULE,
+ };
+
+ if (perf_pmu_register(&pmu->pmu, pmu->pmu.name, -1)) {
+ free_percpu(pmu->ctx);
+ pmu->ctx = NULL;
+ goto done;
+ }
+
+ pr_info("%d %s%s counters detected\n", pmu->num_counters,
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON " : "",
+ pmu->pmu.name);
+
+ uncore->num_pmus = 1;
+
+done:
+ uncore->init_done = true;
+
+ return amd_uncore_ctx_init(uncore, cpu);
}
-static int amd_uncore_cpu_dead(unsigned int cpu)
+static int amd_uncore_umc_event_init(struct perf_event *event)
{
- if (amd_uncore_nb)
- uncore_dead(cpu, amd_uncore_nb);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = amd_uncore_event_init(event);
+
+ if (ret)
+ return ret;
- if (amd_uncore_llc)
- uncore_dead(cpu, amd_uncore_llc);
+ hwc->config = event->attr.config & AMD64_PERFMON_V2_RAW_EVENT_MASK_UMC;
return 0;
}
-static int __init amd_uncore_init(void)
+static void amd_uncore_umc_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (flags & PERF_EF_RELOAD)
+ wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
+
+ hwc->state = 0;
+ wrmsrl(hwc->config_base, (hwc->config | AMD64_PERFMON_V2_ENABLE_UMC));
+ perf_event_update_userpage(event);
+}
+
+static
+void amd_uncore_umc_ctx_scan(struct amd_uncore *uncore, unsigned int cpu)
{
- struct attribute **df_attr = amd_uncore_df_format_attr;
- struct attribute **l3_attr = amd_uncore_l3_format_attr;
union cpuid_0x80000022_ebx ebx;
+ union amd_uncore_info info;
+ unsigned int eax, ecx, edx;
+
+ if (pmu_version < 2)
+ return;
+
+ cpuid(EXT_PERFMON_DEBUG_FEATURES, &eax, &ebx.full, &ecx, &edx);
+ info.split.aux_data = ecx; /* stash active mask */
+ info.split.num_pmcs = ebx.split.num_umc_pmc;
+ info.split.gid = topology_die_id(cpu);
+ info.split.cid = topology_die_id(cpu);
+ *per_cpu_ptr(uncore->info, cpu) = info;
+}
+
+static
+int amd_uncore_umc_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
+{
+ DECLARE_BITMAP(gmask, UNCORE_GROUP_MAX) = { 0 };
+ u8 group_num_pmus[UNCORE_GROUP_MAX] = { 0 };
+ u8 group_num_pmcs[UNCORE_GROUP_MAX] = { 0 };
+ union amd_uncore_info info;
+ struct amd_uncore_pmu *pmu;
+ int index = 0, gid, i;
+
+ if (pmu_version < 2)
+ return 0;
+
+ /* Run just once */
+ if (uncore->init_done)
+ return amd_uncore_ctx_init(uncore, cpu);
+
+ /* Find unique groups */
+ for_each_online_cpu(i) {
+ info = *per_cpu_ptr(uncore->info, i);
+ gid = info.split.gid;
+ if (test_bit(gid, gmask))
+ continue;
+
+ __set_bit(gid, gmask);
+ group_num_pmus[gid] = hweight32(info.split.aux_data);
+ group_num_pmcs[gid] = info.split.num_pmcs;
+ uncore->num_pmus += group_num_pmus[gid];
+ }
+
+ uncore->pmus = kzalloc(sizeof(*uncore->pmus) * uncore->num_pmus,
+ GFP_KERNEL);
+ if (!uncore->pmus) {
+ uncore->num_pmus = 0;
+ goto done;
+ }
+
+ for_each_set_bit(gid, gmask, UNCORE_GROUP_MAX) {
+ for (i = 0; i < group_num_pmus[gid]; i++) {
+ pmu = &uncore->pmus[index];
+ snprintf(pmu->name, sizeof(pmu->name), "amd_umc_%d", index);
+ pmu->num_counters = group_num_pmcs[gid] / group_num_pmus[gid];
+ pmu->msr_base = MSR_F19H_UMC_PERF_CTL + i * pmu->num_counters * 2;
+ pmu->rdpmc_base = -1;
+ pmu->group = gid;
+
+ pmu->ctx = alloc_percpu(struct amd_uncore_ctx *);
+ if (!pmu->ctx)
+ goto done;
+
+ pmu->pmu = (struct pmu) {
+ .task_ctx_nr = perf_invalid_context,
+ .attr_groups = amd_uncore_umc_attr_groups,
+ .name = pmu->name,
+ .event_init = amd_uncore_umc_event_init,
+ .add = amd_uncore_add,
+ .del = amd_uncore_del,
+ .start = amd_uncore_umc_start,
+ .stop = amd_uncore_stop,
+ .read = amd_uncore_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+ .module = THIS_MODULE,
+ };
+
+ if (perf_pmu_register(&pmu->pmu, pmu->pmu.name, -1)) {
+ free_percpu(pmu->ctx);
+ pmu->ctx = NULL;
+ goto done;
+ }
+
+ pr_info("%d %s counters detected\n", pmu->num_counters,
+ pmu->pmu.name);
+
+ index++;
+ }
+ }
+
+done:
+ uncore->num_pmus = index;
+ uncore->init_done = true;
+
+ return amd_uncore_ctx_init(uncore, cpu);
+}
+
+static struct amd_uncore uncores[UNCORE_TYPE_MAX] = {
+ /* UNCORE_TYPE_DF */
+ {
+ .scan = amd_uncore_df_ctx_scan,
+ .init = amd_uncore_df_ctx_init,
+ .move = amd_uncore_ctx_move,
+ .free = amd_uncore_ctx_free,
+ },
+ /* UNCORE_TYPE_L3 */
+ {
+ .scan = amd_uncore_l3_ctx_scan,
+ .init = amd_uncore_l3_ctx_init,
+ .move = amd_uncore_ctx_move,
+ .free = amd_uncore_ctx_free,
+ },
+ /* UNCORE_TYPE_UMC */
+ {
+ .scan = amd_uncore_umc_ctx_scan,
+ .init = amd_uncore_umc_ctx_init,
+ .move = amd_uncore_ctx_move,
+ .free = amd_uncore_ctx_free,
+ },
+};
+
+static int __init amd_uncore_init(void)
+{
+ struct amd_uncore *uncore;
int ret = -ENODEV;
+ int i;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
@@ -660,125 +1022,91 @@ static int __init amd_uncore_init(void)
if (boot_cpu_has(X86_FEATURE_PERFMON_V2))
pmu_version = 2;
- num_counters_nb = NUM_COUNTERS_NB;
- num_counters_llc = NUM_COUNTERS_L2;
- if (boot_cpu_data.x86 >= 0x17) {
- /*
- * For F17h and above, the Northbridge counters are
- * repurposed as Data Fabric counters. Also, L3
- * counters are supported too. The PMUs are exported
- * based on family as either L2 or L3 and NB or DF.
- */
- num_counters_llc = NUM_COUNTERS_L3;
- amd_nb_pmu.name = "amd_df";
- amd_llc_pmu.name = "amd_l3";
- l3_mask = true;
- }
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
- if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
- if (pmu_version >= 2) {
- *df_attr++ = &format_attr_event14v2.attr;
- *df_attr++ = &format_attr_umask12.attr;
- } else if (boot_cpu_data.x86 >= 0x17) {
- *df_attr = &format_attr_event14.attr;
- }
+ BUG_ON(!uncore->scan);
+ BUG_ON(!uncore->init);
+ BUG_ON(!uncore->move);
+ BUG_ON(!uncore->free);
- amd_uncore_nb = alloc_percpu(struct amd_uncore *);
- if (!amd_uncore_nb) {
+ uncore->info = alloc_percpu(union amd_uncore_info);
+ if (!uncore->info) {
ret = -ENOMEM;
- goto fail_nb;
- }
- ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
- if (ret)
- goto fail_nb;
-
- if (pmu_version >= 2) {
- ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);
- num_counters_nb = ebx.split.num_df_pmc;
- }
-
- pr_info("%d %s %s counters detected\n", num_counters_nb,
- boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "",
- amd_nb_pmu.name);
-
- ret = 0;
- }
-
- if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) {
- if (boot_cpu_data.x86 >= 0x19) {
- *l3_attr++ = &format_attr_event8.attr;
- *l3_attr++ = &format_attr_umask8.attr;
- *l3_attr++ = &format_attr_threadmask2.attr;
- } else if (boot_cpu_data.x86 >= 0x17) {
- *l3_attr++ = &format_attr_event8.attr;
- *l3_attr++ = &format_attr_umask8.attr;
- *l3_attr++ = &format_attr_threadmask8.attr;
- }
-
- amd_uncore_llc = alloc_percpu(struct amd_uncore *);
- if (!amd_uncore_llc) {
- ret = -ENOMEM;
- goto fail_llc;
+ goto fail;
}
- ret = perf_pmu_register(&amd_llc_pmu, amd_llc_pmu.name, -1);
- if (ret)
- goto fail_llc;
-
- pr_info("%d %s %s counters detected\n", num_counters_llc,
- boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "",
- amd_llc_pmu.name);
- ret = 0;
- }
+ };
/*
* Install callbacks. Core will call them for each online cpu.
*/
- if (cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP,
- "perf/x86/amd/uncore:prepare",
- amd_uncore_cpu_up_prepare, amd_uncore_cpu_dead))
- goto fail_llc;
-
- if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
- "perf/x86/amd/uncore:starting",
- amd_uncore_cpu_starting, NULL))
+ ret = cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP,
+ "perf/x86/amd/uncore:prepare",
+ NULL, amd_uncore_cpu_dead);
+ if (ret)
+ goto fail;
+
+ ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
+ "perf/x86/amd/uncore:starting",
+ amd_uncore_cpu_starting, NULL);
+ if (ret)
goto fail_prep;
- if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
- "perf/x86/amd/uncore:online",
- amd_uncore_cpu_online,
- amd_uncore_cpu_down_prepare))
+
+ ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
+ "perf/x86/amd/uncore:online",
+ amd_uncore_cpu_online,
+ amd_uncore_cpu_down_prepare);
+ if (ret)
goto fail_start;
+
return 0;
fail_start:
cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING);
fail_prep:
cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP);
-fail_llc:
- if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
- perf_pmu_unregister(&amd_nb_pmu);
- free_percpu(amd_uncore_llc);
-fail_nb:
- free_percpu(amd_uncore_nb);
+fail:
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
+ if (uncore->info) {
+ free_percpu(uncore->info);
+ uncore->info = NULL;
+ }
+ }
return ret;
}
static void __exit amd_uncore_exit(void)
{
+ struct amd_uncore *uncore;
+ struct amd_uncore_pmu *pmu;
+ int i, j;
+
cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE);
cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING);
cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP);
- if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) {
- perf_pmu_unregister(&amd_llc_pmu);
- free_percpu(amd_uncore_llc);
- amd_uncore_llc = NULL;
- }
+ for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+ uncore = &uncores[i];
+ if (!uncore->info)
+ continue;
+
+ free_percpu(uncore->info);
+ uncore->info = NULL;
+
+ for (j = 0; j < uncore->num_pmus; j++) {
+ pmu = &uncore->pmus[j];
+ if (!pmu->ctx)
+ continue;
+
+ perf_pmu_unregister(&pmu->pmu);
+ free_percpu(pmu->ctx);
+ pmu->ctx = NULL;
+ }
- if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
- perf_pmu_unregister(&amd_nb_pmu);
- free_percpu(amd_uncore_nb);
- amd_uncore_nb = NULL;
+ kfree(uncore->pmus);
+ uncore->pmus = NULL;
}
}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 185f902e5..5b0dd07b1 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -601,7 +601,7 @@ int x86_pmu_hw_config(struct perf_event *event)
}
}
- if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+ if (branch_sample_call_stack(event))
event->attach_state |= PERF_ATTACH_TASK_DATA;
/*
@@ -1644,6 +1644,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
while (++i < cpuc->n_events) {
cpuc->event_list[i-1] = cpuc->event_list[i];
cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
+ cpuc->assign[i-1] = cpuc->assign[i];
}
cpuc->event_constraint[i-1] = NULL;
--cpuc->n_events;
@@ -1702,7 +1703,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
perf_sample_data_init(&data, 0, event->hw.last_period);
if (has_branch_stack(event))
- perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
+ perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
@@ -1887,9 +1888,9 @@ ssize_t events_hybrid_sysfs_show(struct device *dev,
str = pmu_attr->event_str;
for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
- if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type))
+ if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type))
continue;
- if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) {
+ if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) {
next_str = strchr(str, ';');
if (next_str)
return snprintf(page, next_str - str + 1, "%s", str);
@@ -2169,7 +2170,7 @@ static int __init init_hw_perf_events(void)
hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE;
err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
- (hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
+ (hybrid_pmu->pmu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
if (err)
break;
}
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index bc4fcf0d9..3804f21ab 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -211,6 +211,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =
EVENT_CONSTRAINT_END
};
+static struct event_constraint intel_grt_event_constraints[] __read_mostly = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */
+ EVENT_CONSTRAINT_END
+};
+
static struct event_constraint intel_skl_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -299,7 +307,7 @@ static struct extra_reg intel_icl_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
-static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
+static struct extra_reg intel_glc_extra_regs[] __read_mostly = {
INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
@@ -309,11 +317,12 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
-static struct event_constraint intel_spr_event_constraints[] = {
+static struct event_constraint intel_glc_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */
FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0),
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1),
@@ -349,7 +358,7 @@ static struct event_constraint intel_spr_event_constraints[] = {
EVENT_CONSTRAINT_END
};
-static struct extra_reg intel_gnr_extra_regs[] __read_mostly = {
+static struct extra_reg intel_rwc_extra_regs[] __read_mostly = {
INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
@@ -473,7 +482,7 @@ static u64 intel_pmu_event_map(int hw_event)
return intel_perfmon_event_map[hw_event];
}
-static __initconst const u64 spr_hw_cache_event_ids
+static __initconst const u64 glc_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -552,7 +561,7 @@ static __initconst const u64 spr_hw_cache_event_ids
},
};
-static __initconst const u64 spr_hw_cache_extra_regs
+static __initconst const u64 glc_hw_cache_extra_regs
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -2518,9 +2527,14 @@ static void intel_pmu_assign_event(struct perf_event *event, int idx)
perf_report_aux_output_id(event, idx);
}
+static __always_inline bool intel_pmu_needs_branch_stack(struct perf_event *event)
+{
+ return event->hw.flags & PERF_X86_EVENT_NEEDS_BRANCH_STACK;
+}
+
static void intel_pmu_del_event(struct perf_event *event)
{
- if (needs_branch_stack(event))
+ if (intel_pmu_needs_branch_stack(event))
intel_pmu_lbr_del(event);
if (event->attr.precise_ip)
intel_pmu_pebs_del(event);
@@ -2556,16 +2570,6 @@ static int icl_set_topdown_event_period(struct perf_event *event)
return 0;
}
-static int adl_set_topdown_event_period(struct perf_event *event)
-{
- struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
-
- if (pmu->cpu_type != hybrid_big)
- return 0;
-
- return icl_set_topdown_event_period(event);
-}
-
DEFINE_STATIC_CALL(intel_pmu_set_topdown_event_period, x86_perf_event_set_period);
static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx)
@@ -2708,16 +2712,6 @@ static u64 icl_update_topdown_event(struct perf_event *event)
x86_pmu.num_topdown_events - 1);
}
-static u64 adl_update_topdown_event(struct perf_event *event)
-{
- struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
-
- if (pmu->cpu_type != hybrid_big)
- return 0;
-
- return icl_update_topdown_event(event);
-}
-
DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update);
static void intel_pmu_read_topdown_event(struct perf_event *event)
@@ -2798,6 +2792,7 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
static void intel_pmu_enable_event(struct perf_event *event)
{
+ u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE;
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
@@ -2806,8 +2801,10 @@ static void intel_pmu_enable_event(struct perf_event *event)
switch (idx) {
case 0 ... INTEL_PMC_IDX_FIXED - 1:
+ if (branch_sample_counters(event))
+ enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR;
intel_set_masks(event, idx);
- __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+ __x86_pmu_enable_event(hwc, enable_mask);
break;
case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
@@ -2831,7 +2828,7 @@ static void intel_pmu_add_event(struct perf_event *event)
{
if (event->attr.precise_ip)
intel_pmu_pebs_add(event);
- if (needs_branch_stack(event))
+ if (intel_pmu_needs_branch_stack(event))
intel_pmu_lbr_add(event);
}
@@ -3058,7 +3055,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
perf_sample_data_init(&data, 0, event->hw.last_period);
if (has_branch_stack(event))
- perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
+ intel_pmu_lbr_save_brstack(&data, cpuc, event);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
@@ -3623,6 +3620,13 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
if (cpuc->excl_cntrs)
return intel_get_excl_constraints(cpuc, event, idx, c2);
+ /* Not all counters support the branch counter feature. */
+ if (branch_sample_counters(event)) {
+ c2 = dyn_constraint(cpuc, c2, idx);
+ c2->idxmsk64 &= x86_pmu.lbr_counters;
+ c2->weight = hweight64(c2->idxmsk64);
+ }
+
return c2;
}
@@ -3869,7 +3873,7 @@ static inline bool require_mem_loads_aux_event(struct perf_event *event)
return false;
if (is_hybrid())
- return hybrid_pmu(event->pmu)->cpu_type == hybrid_big;
+ return hybrid_pmu(event->pmu)->pmu_type == hybrid_big;
return true;
}
@@ -3908,7 +3912,62 @@ static int intel_pmu_hw_config(struct perf_event *event)
x86_pmu.pebs_aliases(event);
}
- if (needs_branch_stack(event)) {
+ if (needs_branch_stack(event) && is_sampling_event(event))
+ event->hw.flags |= PERF_X86_EVENT_NEEDS_BRANCH_STACK;
+
+ if (branch_sample_counters(event)) {
+ struct perf_event *leader, *sibling;
+ int num = 0;
+
+ if (!(x86_pmu.flags & PMU_FL_BR_CNTR) ||
+ (event->attr.config & ~INTEL_ARCH_EVENT_MASK))
+ return -EINVAL;
+
+ /*
+ * The branch counter logging is not supported in the call stack
+ * mode yet, since we cannot simply flush the LBR during e.g.,
+ * multiplexing. Also, there is no obvious usage with the call
+ * stack mode. Simply forbids it for now.
+ *
+ * If any events in the group enable the branch counter logging
+ * feature, the group is treated as a branch counter logging
+ * group, which requires the extra space to store the counters.
+ */
+ leader = event->group_leader;
+ if (branch_sample_call_stack(leader))
+ return -EINVAL;
+ if (branch_sample_counters(leader))
+ num++;
+ leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS;
+
+ for_each_sibling_event(sibling, leader) {
+ if (branch_sample_call_stack(sibling))
+ return -EINVAL;
+ if (branch_sample_counters(sibling))
+ num++;
+ }
+
+ if (num > fls(x86_pmu.lbr_counters))
+ return -EINVAL;
+ /*
+ * Only applying the PERF_SAMPLE_BRANCH_COUNTERS doesn't
+ * require any branch stack setup.
+ * Clear the bit to avoid unnecessary branch stack setup.
+ */
+ if (0 == (event->attr.branch_sample_type &
+ ~(PERF_SAMPLE_BRANCH_PLM_ALL |
+ PERF_SAMPLE_BRANCH_COUNTERS)))
+ event->hw.flags &= ~PERF_X86_EVENT_NEEDS_BRANCH_STACK;
+
+ /*
+ * Force the leader to be a LBR event. So LBRs can be reset
+ * with the leader event. See intel_pmu_lbr_del() for details.
+ */
+ if (!intel_pmu_needs_branch_stack(leader))
+ return -EINVAL;
+ }
+
+ if (intel_pmu_needs_branch_stack(event)) {
ret = intel_pmu_setup_lbr_filter(event);
if (ret)
return ret;
@@ -4038,7 +4097,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
/*
* Currently, the only caller of this function is the atomic_switch_perf_msrs().
- * The host perf conext helps to prepare the values of the real hardware for
+ * The host perf context helps to prepare the values of the real hardware for
* a set of msrs that need to be switched atomically in a vmx transaction.
*
* For example, the pseudocode needed to add a new msr should look like:
@@ -4278,7 +4337,7 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
}
static struct event_constraint *
-spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+glc_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
{
struct event_constraint *c;
@@ -4366,9 +4425,9 @@ adl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
{
struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
- if (pmu->cpu_type == hybrid_big)
- return spr_get_event_constraints(cpuc, idx, event);
- else if (pmu->cpu_type == hybrid_small)
+ if (pmu->pmu_type == hybrid_big)
+ return glc_get_event_constraints(cpuc, idx, event);
+ else if (pmu->pmu_type == hybrid_small)
return tnt_get_event_constraints(cpuc, idx, event);
WARN_ON(1);
@@ -4391,8 +4450,13 @@ cmt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
*/
if (event->attr.precise_ip == 3) {
/* Force instruction:ppp on PMC0, 1 and Fixed counter 0 */
- if (constraint_match(&fixed0_constraint, event->hw.config))
- return &fixed0_counter0_1_constraint;
+ if (constraint_match(&fixed0_constraint, event->hw.config)) {
+ /* The fixed counter 0 doesn't support LBR event logging. */
+ if (branch_sample_counters(event))
+ return &counter0_1_constraint;
+ else
+ return &fixed0_counter0_1_constraint;
+ }
switch (c->idxmsk64 & 0x3ull) {
case 0x1:
@@ -4414,7 +4478,7 @@ rwc_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
{
struct event_constraint *c;
- c = spr_get_event_constraints(cpuc, idx, event);
+ c = glc_get_event_constraints(cpuc, idx, event);
/* The Retire Latency is not supported by the fixed counter 0. */
if (event->attr.precise_ip &&
@@ -4438,9 +4502,9 @@ mtl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
{
struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
- if (pmu->cpu_type == hybrid_big)
+ if (pmu->pmu_type == hybrid_big)
return rwc_get_event_constraints(cpuc, idx, event);
- if (pmu->cpu_type == hybrid_small)
+ if (pmu->pmu_type == hybrid_small)
return cmt_get_event_constraints(cpuc, idx, event);
WARN_ON(1);
@@ -4451,18 +4515,18 @@ static int adl_hw_config(struct perf_event *event)
{
struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
- if (pmu->cpu_type == hybrid_big)
+ if (pmu->pmu_type == hybrid_big)
return hsw_hw_config(event);
- else if (pmu->cpu_type == hybrid_small)
+ else if (pmu->pmu_type == hybrid_small)
return intel_pmu_hw_config(event);
WARN_ON(1);
return -EOPNOTSUPP;
}
-static u8 adl_get_hybrid_cpu_type(void)
+static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
{
- return hybrid_big;
+ return HYBRID_INTEL_CORE;
}
/*
@@ -4495,7 +4559,7 @@ static void nhm_limit_period(struct perf_event *event, s64 *left)
*left = max(*left, 32LL);
}
-static void spr_limit_period(struct perf_event *event, s64 *left)
+static void glc_limit_period(struct perf_event *event, s64 *left)
{
if (event->attr.precise_ip == 3)
*left = max(*left, 128LL);
@@ -4571,7 +4635,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
goto err;
}
- if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA)) {
+ if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_BR_CNTR)) {
size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
@@ -4623,6 +4687,23 @@ static void intel_pmu_check_num_counters(int *num_counters,
int *num_counters_fixed,
u64 *intel_ctrl, u64 fixed_mask);
+static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints,
+ int num_counters,
+ int num_counters_fixed,
+ u64 intel_ctrl);
+
+static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs);
+
+static inline bool intel_pmu_broken_perf_cap(void)
+{
+ /* The Perf Metric (Bit 15) is always cleared */
+ if ((boot_cpu_data.x86_model == INTEL_FAM6_METEORLAKE) ||
+ (boot_cpu_data.x86_model == INTEL_FAM6_METEORLAKE_L))
+ return true;
+
+ return false;
+}
+
static void update_pmu_cap(struct x86_hybrid_pmu *pmu)
{
unsigned int sub_bitmaps = cpuid_eax(ARCH_PERFMON_EXT_LEAF);
@@ -4633,27 +4714,83 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu)
&eax, &ebx, &ecx, &edx);
pmu->num_counters = fls(eax);
pmu->num_counters_fixed = fls(ebx);
- intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed,
- &pmu->intel_ctrl, ebx);
+ }
+
+
+ if (!intel_pmu_broken_perf_cap()) {
+ /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities);
}
}
-static bool init_hybrid_pmu(int cpu)
+static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
+{
+ intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed,
+ &pmu->intel_ctrl, (1ULL << pmu->num_counters_fixed) - 1);
+ pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters);
+ pmu->unconstrained = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
+ 0, pmu->num_counters, 0, 0);
+
+ if (pmu->intel_cap.perf_metrics)
+ pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+ else
+ pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS);
+
+ if (pmu->intel_cap.pebs_output_pt_available)
+ pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
+ else
+ pmu->pmu.capabilities &= ~PERF_PMU_CAP_AUX_OUTPUT;
+
+ intel_pmu_check_event_constraints(pmu->event_constraints,
+ pmu->num_counters,
+ pmu->num_counters_fixed,
+ pmu->intel_ctrl);
+
+ intel_pmu_check_extra_regs(pmu->extra_regs);
+}
+
+static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void)
{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
u8 cpu_type = get_this_hybrid_cpu_type();
- struct x86_hybrid_pmu *pmu = NULL;
int i;
- if (!cpu_type && x86_pmu.get_hybrid_cpu_type)
- cpu_type = x86_pmu.get_hybrid_cpu_type();
+ /*
+ * This is running on a CPU model that is known to have hybrid
+ * configurations. But the CPU told us it is not hybrid, shame
+ * on it. There should be a fixup function provided for these
+ * troublesome CPUs (->get_hybrid_cpu_type).
+ */
+ if (cpu_type == HYBRID_INTEL_NONE) {
+ if (x86_pmu.get_hybrid_cpu_type)
+ cpu_type = x86_pmu.get_hybrid_cpu_type();
+ else
+ return NULL;
+ }
+ /*
+ * This essentially just maps between the 'hybrid_cpu_type'
+ * and 'hybrid_pmu_type' enums:
+ */
for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
- if (x86_pmu.hybrid_pmu[i].cpu_type == cpu_type) {
- pmu = &x86_pmu.hybrid_pmu[i];
- break;
- }
+ enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type;
+
+ if (cpu_type == HYBRID_INTEL_CORE &&
+ pmu_type == hybrid_big)
+ return &x86_pmu.hybrid_pmu[i];
+ if (cpu_type == HYBRID_INTEL_ATOM &&
+ pmu_type == hybrid_small)
+ return &x86_pmu.hybrid_pmu[i];
}
+
+ return NULL;
+}
+
+static bool init_hybrid_pmu(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ struct x86_hybrid_pmu *pmu = find_hybrid_pmu_for_cpu();
+
if (WARN_ON_ONCE(!pmu || (pmu->pmu.type == -1))) {
cpuc->pmu = NULL;
return false;
@@ -4666,6 +4803,8 @@ static bool init_hybrid_pmu(int cpu)
if (this_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT))
update_pmu_cap(pmu);
+ intel_pmu_check_hybrid_pmus(pmu);
+
if (!check_hw_exists(&pmu->pmu, pmu->num_counters, pmu->num_counters_fixed))
return false;
@@ -5342,14 +5481,14 @@ static struct attribute *icl_tsx_events_attrs[] = {
EVENT_ATTR_STR(mem-stores, mem_st_spr, "event=0xcd,umask=0x2");
EVENT_ATTR_STR(mem-loads-aux, mem_ld_aux, "event=0x03,umask=0x82");
-static struct attribute *spr_events_attrs[] = {
+static struct attribute *glc_events_attrs[] = {
EVENT_PTR(mem_ld_hsw),
EVENT_PTR(mem_st_spr),
EVENT_PTR(mem_ld_aux),
NULL,
};
-static struct attribute *spr_td_events_attrs[] = {
+static struct attribute *glc_td_events_attrs[] = {
EVENT_PTR(slots),
EVENT_PTR(td_retiring),
EVENT_PTR(td_bad_spec),
@@ -5362,7 +5501,7 @@ static struct attribute *spr_td_events_attrs[] = {
NULL,
};
-static struct attribute *spr_tsx_events_attrs[] = {
+static struct attribute *glc_tsx_events_attrs[] = {
EVENT_PTR(tx_start),
EVENT_PTR(tx_abort),
EVENT_PTR(tx_commit),
@@ -5468,11 +5607,41 @@ static ssize_t branches_show(struct device *cdev,
static DEVICE_ATTR_RO(branches);
+static ssize_t branch_counter_nr_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", fls(x86_pmu.lbr_counters));
+}
+
+static DEVICE_ATTR_RO(branch_counter_nr);
+
+static ssize_t branch_counter_width_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", LBR_INFO_BR_CNTR_BITS);
+}
+
+static DEVICE_ATTR_RO(branch_counter_width);
+
static struct attribute *lbr_attrs[] = {
&dev_attr_branches.attr,
+ &dev_attr_branch_counter_nr.attr,
+ &dev_attr_branch_counter_width.attr,
NULL
};
+static umode_t
+lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ /* branches */
+ if (i == 0)
+ return x86_pmu.lbr_nr ? attr->mode : 0;
+
+ return (x86_pmu.flags & PMU_FL_BR_CNTR) ? attr->mode : 0;
+}
+
static char pmu_name_str[30];
static ssize_t pmu_name_show(struct device *cdev,
@@ -5500,6 +5669,15 @@ static struct attribute *intel_pmu_attrs[] = {
};
static umode_t
+default_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ if (attr == &dev_attr_allow_tsx_force_abort.attr)
+ return x86_pmu.flags & PMU_FL_TFA ? attr->mode : 0;
+
+ return attr->mode;
+}
+
+static umode_t
tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
return boot_cpu_has(X86_FEATURE_RTM) ? attr->mode : 0;
@@ -5521,26 +5699,11 @@ mem_is_visible(struct kobject *kobj, struct attribute *attr, int i)
}
static umode_t
-lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i)
-{
- return x86_pmu.lbr_nr ? attr->mode : 0;
-}
-
-static umode_t
exra_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
return x86_pmu.version >= 2 ? attr->mode : 0;
}
-static umode_t
-default_is_visible(struct kobject *kobj, struct attribute *attr, int i)
-{
- if (attr == &dev_attr_allow_tsx_force_abort.attr)
- return x86_pmu.flags & PMU_FL_TFA ? attr->mode : 0;
-
- return attr->mode;
-}
-
static struct attribute_group group_events_td = {
.name = "events",
};
@@ -5704,7 +5867,7 @@ static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr)
struct perf_pmu_events_hybrid_attr *pmu_attr =
container_of(attr, struct perf_pmu_events_hybrid_attr, attr.attr);
- return pmu->cpu_type & pmu_attr->pmu_type;
+ return pmu->pmu_type & pmu_attr->pmu_type;
}
static umode_t hybrid_events_is_visible(struct kobject *kobj,
@@ -5741,7 +5904,7 @@ static umode_t hybrid_format_is_visible(struct kobject *kobj,
container_of(attr, struct perf_pmu_format_hybrid_attr, attr.attr);
int cpu = hybrid_find_supported_cpu(pmu);
- return (cpu >= 0) && (pmu->cpu_type & pmu_attr->pmu_type) ? attr->mode : 0;
+ return (cpu >= 0) && (pmu->pmu_type & pmu_attr->pmu_type) ? attr->mode : 0;
}
static struct attribute_group hybrid_group_events_td = {
@@ -5885,40 +6048,105 @@ static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs)
}
}
-static void intel_pmu_check_hybrid_pmus(u64 fixed_mask)
+static const struct { enum hybrid_pmu_type id; char *name; } intel_hybrid_pmu_type_map[] __initconst = {
+ { hybrid_small, "cpu_atom" },
+ { hybrid_big, "cpu_core" },
+};
+
+static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus)
{
+ unsigned long pmus_mask = pmus;
struct x86_hybrid_pmu *pmu;
- int i;
+ int idx = 0, bit;
- for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
- pmu = &x86_pmu.hybrid_pmu[i];
+ x86_pmu.num_hybrid_pmus = hweight_long(pmus_mask);
+ x86_pmu.hybrid_pmu = kcalloc(x86_pmu.num_hybrid_pmus,
+ sizeof(struct x86_hybrid_pmu),
+ GFP_KERNEL);
+ if (!x86_pmu.hybrid_pmu)
+ return -ENOMEM;
- intel_pmu_check_num_counters(&pmu->num_counters,
- &pmu->num_counters_fixed,
- &pmu->intel_ctrl,
- fixed_mask);
+ static_branch_enable(&perf_is_hybrid);
+ x86_pmu.filter = intel_pmu_filter;
- if (pmu->intel_cap.perf_metrics) {
- pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
- pmu->intel_ctrl |= INTEL_PMC_MSK_FIXED_SLOTS;
+ for_each_set_bit(bit, &pmus_mask, ARRAY_SIZE(intel_hybrid_pmu_type_map)) {
+ pmu = &x86_pmu.hybrid_pmu[idx++];
+ pmu->pmu_type = intel_hybrid_pmu_type_map[bit].id;
+ pmu->name = intel_hybrid_pmu_type_map[bit].name;
+
+ pmu->num_counters = x86_pmu.num_counters;
+ pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
+ pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters);
+ pmu->unconstrained = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
+ 0, pmu->num_counters, 0, 0);
+
+ pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities;
+ if (pmu->pmu_type & hybrid_small) {
+ pmu->intel_cap.perf_metrics = 0;
+ pmu->intel_cap.pebs_output_pt_available = 1;
+ pmu->mid_ack = true;
+ } else if (pmu->pmu_type & hybrid_big) {
+ pmu->intel_cap.perf_metrics = 1;
+ pmu->intel_cap.pebs_output_pt_available = 0;
+ pmu->late_ack = true;
}
+ }
- if (pmu->intel_cap.pebs_output_pt_available)
- pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
+ return 0;
+}
- intel_pmu_check_event_constraints(pmu->event_constraints,
- pmu->num_counters,
- pmu->num_counters_fixed,
- pmu->intel_ctrl);
+static __always_inline void intel_pmu_ref_cycles_ext(void)
+{
+ if (!(x86_pmu.events_maskl & (INTEL_PMC_MSK_FIXED_REF_CYCLES >> INTEL_PMC_IDX_FIXED)))
+ intel_perfmon_event_map[PERF_COUNT_HW_REF_CPU_CYCLES] = 0x013c;
+}
- intel_pmu_check_extra_regs(pmu->extra_regs);
- }
+static __always_inline void intel_pmu_init_glc(struct pmu *pmu)
+{
+ x86_pmu.late_ack = true;
+ x86_pmu.limit_period = glc_limit_period;
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.pebs_block = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+ x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
+ x86_pmu.lbr_pt_coexist = true;
+ x86_pmu.num_topdown_events = 8;
+ static_call_update(intel_pmu_update_topdown_event,
+ &icl_update_topdown_event);
+ static_call_update(intel_pmu_set_topdown_event_period,
+ &icl_set_topdown_event_period);
+
+ memcpy(hybrid_var(pmu, hw_cache_event_ids), glc_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hybrid_var(pmu, hw_cache_extra_regs), glc_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ hybrid(pmu, event_constraints) = intel_glc_event_constraints;
+ hybrid(pmu, pebs_constraints) = intel_glc_pebs_event_constraints;
+
+ intel_pmu_ref_cycles_ext();
}
-static __always_inline bool is_mtl(u8 x86_model)
+static __always_inline void intel_pmu_init_grt(struct pmu *pmu)
{
- return (x86_model == INTEL_FAM6_METEORLAKE) ||
- (x86_model == INTEL_FAM6_METEORLAKE_L);
+ x86_pmu.mid_ack = true;
+ x86_pmu.limit_period = glc_limit_period;
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.pebs_block = true;
+ x86_pmu.lbr_pt_coexist = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+
+ memcpy(hybrid_var(pmu, hw_cache_event_ids), glp_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hybrid_var(pmu, hw_cache_extra_regs), tnt_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ hybrid_var(pmu, hw_cache_event_ids)[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
+ hybrid(pmu, event_constraints) = intel_grt_event_constraints;
+ hybrid(pmu, pebs_constraints) = intel_grt_pebs_event_constraints;
+ hybrid(pmu, extra_regs) = intel_grt_extra_regs;
+
+ intel_pmu_ref_cycles_ext();
}
__init int intel_pmu_init(void)
@@ -6199,28 +6427,10 @@ __init int intel_pmu_init(void)
break;
case INTEL_FAM6_ATOM_GRACEMONT:
- x86_pmu.mid_ack = true;
- memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs,
- sizeof(hw_cache_extra_regs));
- hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
-
- x86_pmu.event_constraints = intel_slm_event_constraints;
- x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints;
- x86_pmu.extra_regs = intel_grt_extra_regs;
-
- x86_pmu.pebs_aliases = NULL;
- x86_pmu.pebs_prec_dist = true;
- x86_pmu.pebs_block = true;
- x86_pmu.lbr_pt_coexist = true;
- x86_pmu.flags |= PMU_FL_HAS_RSP_1;
- x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
-
+ intel_pmu_init_grt(NULL);
intel_pmu_pebs_data_source_grt();
x86_pmu.pebs_latency_data = adl_latency_data_small;
x86_pmu.get_event_constraints = tnt_get_event_constraints;
- x86_pmu.limit_period = spr_limit_period;
td_attr = tnt_events_attrs;
mem_attr = grt_mem_attrs;
extra_attr = nhm_format_attr;
@@ -6230,28 +6440,11 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_ATOM_CRESTMONT:
case INTEL_FAM6_ATOM_CRESTMONT_X:
- x86_pmu.mid_ack = true;
- memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs,
- sizeof(hw_cache_extra_regs));
- hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
-
- x86_pmu.event_constraints = intel_slm_event_constraints;
- x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints;
+ intel_pmu_init_grt(NULL);
x86_pmu.extra_regs = intel_cmt_extra_regs;
-
- x86_pmu.pebs_aliases = NULL;
- x86_pmu.pebs_prec_dist = true;
- x86_pmu.lbr_pt_coexist = true;
- x86_pmu.pebs_block = true;
- x86_pmu.flags |= PMU_FL_HAS_RSP_1;
- x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
-
intel_pmu_pebs_data_source_cmt();
x86_pmu.pebs_latency_data = mtl_latency_data_small;
x86_pmu.get_event_constraints = cmt_get_event_constraints;
- x86_pmu.limit_period = spr_limit_period;
td_attr = cmt_events_attrs;
mem_attr = grt_mem_attrs;
extra_attr = cmt_format_attr;
@@ -6568,44 +6761,23 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_SAPPHIRERAPIDS_X:
case INTEL_FAM6_EMERALDRAPIDS_X:
x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
- x86_pmu.extra_regs = intel_spr_extra_regs;
+ x86_pmu.extra_regs = intel_glc_extra_regs;
fallthrough;
case INTEL_FAM6_GRANITERAPIDS_X:
case INTEL_FAM6_GRANITERAPIDS_D:
- pmem = true;
- x86_pmu.late_ack = true;
- memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-
- x86_pmu.event_constraints = intel_spr_event_constraints;
- x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints;
+ intel_pmu_init_glc(NULL);
if (!x86_pmu.extra_regs)
- x86_pmu.extra_regs = intel_gnr_extra_regs;
- x86_pmu.limit_period = spr_limit_period;
+ x86_pmu.extra_regs = intel_rwc_extra_regs;
x86_pmu.pebs_ept = 1;
- x86_pmu.pebs_aliases = NULL;
- x86_pmu.pebs_prec_dist = true;
- x86_pmu.pebs_block = true;
- x86_pmu.flags |= PMU_FL_HAS_RSP_1;
- x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
- x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
-
x86_pmu.hw_config = hsw_hw_config;
- x86_pmu.get_event_constraints = spr_get_event_constraints;
+ x86_pmu.get_event_constraints = glc_get_event_constraints;
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
extra_skl_attr = skl_format_attr;
- mem_attr = spr_events_attrs;
- td_attr = spr_td_events_attrs;
- tsx_attr = spr_tsx_events_attrs;
- x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
- x86_pmu.lbr_pt_coexist = true;
- intel_pmu_pebs_data_source_skl(pmem);
- x86_pmu.num_topdown_events = 8;
- static_call_update(intel_pmu_update_topdown_event,
- &icl_update_topdown_event);
- static_call_update(intel_pmu_set_topdown_event_period,
- &icl_set_topdown_event_period);
+ mem_attr = glc_events_attrs;
+ td_attr = glc_td_events_attrs;
+ tsx_attr = glc_tsx_events_attrs;
+ intel_pmu_pebs_data_source_skl(true);
pr_cont("Sapphire Rapids events, ");
name = "sapphire_rapids";
break;
@@ -6615,47 +6787,17 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_RAPTORLAKE:
case INTEL_FAM6_RAPTORLAKE_P:
case INTEL_FAM6_RAPTORLAKE_S:
- case INTEL_FAM6_METEORLAKE:
- case INTEL_FAM6_METEORLAKE_L:
/*
* Alder Lake has 2 types of CPU, core and atom.
*
* Initialize the common PerfMon capabilities here.
*/
- x86_pmu.hybrid_pmu = kcalloc(X86_HYBRID_NUM_PMUS,
- sizeof(struct x86_hybrid_pmu),
- GFP_KERNEL);
- if (!x86_pmu.hybrid_pmu)
- return -ENOMEM;
- static_branch_enable(&perf_is_hybrid);
- x86_pmu.num_hybrid_pmus = X86_HYBRID_NUM_PMUS;
+ intel_pmu_init_hybrid(hybrid_big_small);
- x86_pmu.pebs_aliases = NULL;
- x86_pmu.pebs_prec_dist = true;
- x86_pmu.pebs_block = true;
- x86_pmu.flags |= PMU_FL_HAS_RSP_1;
- x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
- x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
- x86_pmu.lbr_pt_coexist = true;
x86_pmu.pebs_latency_data = adl_latency_data_small;
- x86_pmu.num_topdown_events = 8;
- static_call_update(intel_pmu_update_topdown_event,
- &adl_update_topdown_event);
- static_call_update(intel_pmu_set_topdown_event_period,
- &adl_set_topdown_event_period);
-
- x86_pmu.filter = intel_pmu_filter;
x86_pmu.get_event_constraints = adl_get_event_constraints;
x86_pmu.hw_config = adl_hw_config;
- x86_pmu.limit_period = spr_limit_period;
x86_pmu.get_hybrid_cpu_type = adl_get_hybrid_cpu_type;
- /*
- * The rtm_abort_event is used to check whether to enable GPRs
- * for the RTM abort event. Atom doesn't have the RTM abort
- * event. There is no harmful to set it in the common
- * x86_pmu.rtm_abort_event.
- */
- x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
td_attr = adl_hybrid_events_attrs;
mem_attr = adl_hybrid_mem_attrs;
@@ -6665,9 +6807,7 @@ __init int intel_pmu_init(void)
/* Initialize big core specific PerfMon capabilities.*/
pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
- pmu->name = "cpu_core";
- pmu->cpu_type = hybrid_big;
- pmu->late_ack = true;
+ intel_pmu_init_glc(&pmu->pmu);
if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
pmu->num_counters = x86_pmu.num_counters + 2;
pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1;
@@ -6692,54 +6832,45 @@ __init int intel_pmu_init(void)
pmu->unconstrained = (struct event_constraint)
__EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
0, pmu->num_counters, 0, 0);
- pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities;
- pmu->intel_cap.perf_metrics = 1;
- pmu->intel_cap.pebs_output_pt_available = 0;
+ pmu->extra_regs = intel_glc_extra_regs;
- memcpy(pmu->hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids));
- memcpy(pmu->hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs));
- pmu->event_constraints = intel_spr_event_constraints;
- pmu->pebs_constraints = intel_spr_pebs_event_constraints;
- pmu->extra_regs = intel_spr_extra_regs;
+ /* Initialize Atom core specific PerfMon capabilities.*/
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
+ intel_pmu_init_grt(&pmu->pmu);
+
+ x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+ intel_pmu_pebs_data_source_adl();
+ pr_cont("Alderlake Hybrid events, ");
+ name = "alderlake_hybrid";
+ break;
+
+ case INTEL_FAM6_METEORLAKE:
+ case INTEL_FAM6_METEORLAKE_L:
+ intel_pmu_init_hybrid(hybrid_big_small);
+
+ x86_pmu.pebs_latency_data = mtl_latency_data_small;
+ x86_pmu.get_event_constraints = mtl_get_event_constraints;
+ x86_pmu.hw_config = adl_hw_config;
+
+ td_attr = adl_hybrid_events_attrs;
+ mem_attr = mtl_hybrid_mem_attrs;
+ tsx_attr = adl_hybrid_tsx_attrs;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr;
+
+ /* Initialize big core specific PerfMon capabilities.*/
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
+ intel_pmu_init_glc(&pmu->pmu);
+ pmu->extra_regs = intel_rwc_extra_regs;
/* Initialize Atom core specific PerfMon capabilities.*/
pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
- pmu->name = "cpu_atom";
- pmu->cpu_type = hybrid_small;
- pmu->mid_ack = true;
- pmu->num_counters = x86_pmu.num_counters;
- pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
- pmu->max_pebs_events = x86_pmu.max_pebs_events;
- pmu->unconstrained = (struct event_constraint)
- __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
- 0, pmu->num_counters, 0, 0);
- pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities;
- pmu->intel_cap.perf_metrics = 0;
- pmu->intel_cap.pebs_output_pt_available = 1;
-
- memcpy(pmu->hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids));
- memcpy(pmu->hw_cache_extra_regs, tnt_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs));
- pmu->hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
- pmu->event_constraints = intel_slm_event_constraints;
- pmu->pebs_constraints = intel_grt_pebs_event_constraints;
- pmu->extra_regs = intel_grt_extra_regs;
- if (is_mtl(boot_cpu_data.x86_model)) {
- x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].extra_regs = intel_gnr_extra_regs;
- x86_pmu.pebs_latency_data = mtl_latency_data_small;
- extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
- mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr;
- mem_attr = mtl_hybrid_mem_attrs;
- intel_pmu_pebs_data_source_mtl();
- x86_pmu.get_event_constraints = mtl_get_event_constraints;
- pmu->extra_regs = intel_cmt_extra_regs;
- pr_cont("Meteorlake Hybrid events, ");
- name = "meteorlake_hybrid";
- } else {
- x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
- intel_pmu_pebs_data_source_adl();
- pr_cont("Alderlake Hybrid events, ");
- name = "alderlake_hybrid";
- }
+ intel_pmu_init_grt(&pmu->pmu);
+ pmu->extra_regs = intel_cmt_extra_regs;
+
+ intel_pmu_pebs_data_source_mtl();
+ pr_cont("Meteorlake Hybrid events, ");
+ name = "meteorlake_hybrid";
break;
default:
@@ -6851,9 +6982,6 @@ __init int intel_pmu_init(void)
if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics)
x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
- if (is_hybrid())
- intel_pmu_check_hybrid_pmus((u64)fixed_mask);
-
if (x86_pmu.intel_cap.pebs_timing_info)
x86_pmu.flags |= PMU_FL_RETIRE_LATENCY;
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 96fffb2d5..4b50a3a98 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -41,7 +41,7 @@
* MSR_CORE_C1_RES: CORE C1 Residency Counter
* perf code: 0x00
* Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL
- * MTL
+ * MTL,SRF,GRR
* Scope: Core (each processor core has a MSR)
* MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
* perf code: 0x01
@@ -52,7 +52,8 @@
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
- * TGL,TNT,RKL,ADL,RPL,SPR,MTL
+ * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF,
+ * GRR
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
@@ -75,7 +76,7 @@
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
- * TGL,TNT,RKL,ADL,RPL,SPR,MTL
+ * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
@@ -97,6 +98,10 @@
* Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
* TNT,RKL,ADL,RPL,MTL
* Scope: Package (physical package)
+ * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter.
+ * perf code: 0x00
+ * Available model: SRF,GRR
+ * Scope: A cluster of cores shared L2 cache
*
*/
@@ -130,6 +135,7 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev,
struct cstate_model {
unsigned long core_events;
unsigned long pkg_events;
+ unsigned long module_events;
unsigned long quirks;
};
@@ -189,20 +195,20 @@ static struct attribute *attrs_empty[] = {
* "events" group (with empty attrs) before updating
* it with detected events.
*/
-static struct attribute_group core_events_attr_group = {
+static struct attribute_group cstate_events_attr_group = {
.name = "events",
.attrs = attrs_empty,
};
-DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63");
-static struct attribute *core_format_attrs[] = {
- &format_attr_core_event.attr,
+DEFINE_CSTATE_FORMAT_ATTR(cstate_event, event, "config:0-63");
+static struct attribute *cstate_format_attrs[] = {
+ &format_attr_cstate_event.attr,
NULL,
};
-static struct attribute_group core_format_attr_group = {
+static struct attribute_group cstate_format_attr_group = {
.name = "format",
- .attrs = core_format_attrs,
+ .attrs = cstate_format_attrs,
};
static cpumask_t cstate_core_cpu_mask;
@@ -217,9 +223,9 @@ static struct attribute_group cpumask_attr_group = {
.attrs = cstate_cpumask_attrs,
};
-static const struct attribute_group *core_attr_groups[] = {
- &core_events_attr_group,
- &core_format_attr_group,
+static const struct attribute_group *cstate_attr_groups[] = {
+ &cstate_events_attr_group,
+ &cstate_format_attr_group,
&cpumask_attr_group,
NULL,
};
@@ -268,30 +274,30 @@ static struct perf_msr pkg_msr[] = {
[PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr },
};
-static struct attribute_group pkg_events_attr_group = {
- .name = "events",
- .attrs = attrs_empty,
-};
+static cpumask_t cstate_pkg_cpu_mask;
-DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63");
-static struct attribute *pkg_format_attrs[] = {
- &format_attr_pkg_event.attr,
- NULL,
-};
-static struct attribute_group pkg_format_attr_group = {
- .name = "format",
- .attrs = pkg_format_attrs,
+/* cstate_module PMU */
+static struct pmu cstate_module_pmu;
+static bool has_cstate_module;
+
+enum perf_cstate_module_events {
+ PERF_CSTATE_MODULE_C6_RES = 0,
+
+ PERF_CSTATE_MODULE_EVENT_MAX,
};
-static cpumask_t cstate_pkg_cpu_mask;
+PMU_EVENT_ATTR_STRING(c6-residency, attr_cstate_module_c6, "event=0x00");
-static const struct attribute_group *pkg_attr_groups[] = {
- &pkg_events_attr_group,
- &pkg_format_attr_group,
- &cpumask_attr_group,
- NULL,
+static unsigned long module_msr_mask;
+
+PMU_EVENT_GROUP(events, cstate_module_c6);
+
+static struct perf_msr module_msr[] = {
+ [PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr },
};
+static cpumask_t cstate_module_cpu_mask;
+
static ssize_t cstate_get_attr_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf)
@@ -302,6 +308,8 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev,
return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
else if (pmu == &cstate_pkg_pmu)
return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
+ else if (pmu == &cstate_module_pmu)
+ return cpumap_print_to_pagebuf(true, buf, &cstate_module_cpu_mask);
else
return 0;
}
@@ -336,9 +344,21 @@ static int cstate_pmu_event_init(struct perf_event *event)
cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX);
if (!(pkg_msr_mask & (1 << cfg)))
return -EINVAL;
+
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
event->hw.event_base = pkg_msr[cfg].msr;
cpu = cpumask_any_and(&cstate_pkg_cpu_mask,
topology_die_cpumask(event->cpu));
+ } else if (event->pmu == &cstate_module_pmu) {
+ if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX)
+ return -EINVAL;
+ cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_MODULE_EVENT_MAX);
+ if (!(module_msr_mask & (1 << cfg)))
+ return -EINVAL;
+ event->hw.event_base = module_msr[cfg].msr;
+ cpu = cpumask_any_and(&cstate_module_cpu_mask,
+ topology_cluster_cpumask(event->cpu));
} else {
return -ENOENT;
}
@@ -426,6 +446,17 @@ static int cstate_cpu_exit(unsigned int cpu)
perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
}
}
+
+ if (has_cstate_module &&
+ cpumask_test_and_clear_cpu(cpu, &cstate_module_cpu_mask)) {
+
+ target = cpumask_any_but(topology_cluster_cpumask(cpu), cpu);
+ /* Migrate events if there is a valid target */
+ if (target < nr_cpu_ids) {
+ cpumask_set_cpu(target, &cstate_module_cpu_mask);
+ perf_pmu_migrate_context(&cstate_module_pmu, cpu, target);
+ }
+ }
return 0;
}
@@ -452,6 +483,15 @@ static int cstate_cpu_init(unsigned int cpu)
if (has_cstate_pkg && target >= nr_cpu_ids)
cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
+ /*
+ * If this is the first online thread of that cluster, set it
+ * in the cluster cpu mask as the designated reader.
+ */
+ target = cpumask_any_and(&cstate_module_cpu_mask,
+ topology_cluster_cpumask(cpu));
+ if (has_cstate_module && target >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, &cstate_module_cpu_mask);
+
return 0;
}
@@ -474,8 +514,13 @@ static const struct attribute_group *pkg_attr_update[] = {
NULL,
};
+static const struct attribute_group *module_attr_update[] = {
+ &group_cstate_module_c6,
+ NULL
+};
+
static struct pmu cstate_core_pmu = {
- .attr_groups = core_attr_groups,
+ .attr_groups = cstate_attr_groups,
.attr_update = core_attr_update,
.name = "cstate_core",
.task_ctx_nr = perf_invalid_context,
@@ -490,7 +535,7 @@ static struct pmu cstate_core_pmu = {
};
static struct pmu cstate_pkg_pmu = {
- .attr_groups = pkg_attr_groups,
+ .attr_groups = cstate_attr_groups,
.attr_update = pkg_attr_update,
.name = "cstate_pkg",
.task_ctx_nr = perf_invalid_context,
@@ -504,6 +549,21 @@ static struct pmu cstate_pkg_pmu = {
.module = THIS_MODULE,
};
+static struct pmu cstate_module_pmu = {
+ .attr_groups = cstate_attr_groups,
+ .attr_update = module_attr_update,
+ .name = "cstate_module",
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = cstate_pmu_event_init,
+ .add = cstate_pmu_event_add,
+ .del = cstate_pmu_event_del,
+ .start = cstate_pmu_event_start,
+ .stop = cstate_pmu_event_stop,
+ .read = cstate_pmu_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .module = THIS_MODULE,
+};
+
static const struct cstate_model nhm_cstates __initconst = {
.core_events = BIT(PERF_CSTATE_CORE_C3_RES) |
BIT(PERF_CSTATE_CORE_C6_RES),
@@ -618,6 +678,22 @@ static const struct cstate_model glm_cstates __initconst = {
BIT(PERF_CSTATE_PKG_C10_RES),
};
+static const struct cstate_model grr_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .module_events = BIT(PERF_CSTATE_MODULE_C6_RES),
+};
+
+static const struct cstate_model srf_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES),
+
+ .module_events = BIT(PERF_CSTATE_MODULE_C6_RES),
+};
+
static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_cstates),
@@ -670,6 +746,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &srf_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &grr_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates),
@@ -711,10 +789,14 @@ static int __init cstate_probe(const struct cstate_model *cm)
pkg_msr_mask = perf_msr_probe(pkg_msr, PERF_CSTATE_PKG_EVENT_MAX,
true, (void *) &cm->pkg_events);
+ module_msr_mask = perf_msr_probe(module_msr, PERF_CSTATE_MODULE_EVENT_MAX,
+ true, (void *) &cm->module_events);
+
has_cstate_core = !!core_msr_mask;
has_cstate_pkg = !!pkg_msr_mask;
+ has_cstate_module = !!module_msr_mask;
- return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
+ return (has_cstate_core || has_cstate_pkg || has_cstate_module) ? 0 : -ENODEV;
}
static inline void cstate_cleanup(void)
@@ -727,6 +809,9 @@ static inline void cstate_cleanup(void)
if (has_cstate_pkg)
perf_pmu_unregister(&cstate_pkg_pmu);
+
+ if (has_cstate_module)
+ perf_pmu_unregister(&cstate_module_pmu);
}
static int __init cstate_init(void)
@@ -763,6 +848,16 @@ static int __init cstate_init(void)
return err;
}
}
+
+ if (has_cstate_module) {
+ err = perf_pmu_register(&cstate_module_pmu, cstate_module_pmu.name, -1);
+ if (err) {
+ has_cstate_module = false;
+ pr_info("Failed to register cstate cluster pmu\n");
+ cstate_cleanup();
+ return err;
+ }
+ }
return 0;
}
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index eb8dd8b8a..807de7b59 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -261,7 +261,7 @@ static u64 __adl_latency_data_small(struct perf_event *event, u64 status,
{
u64 val;
- WARN_ON_ONCE(hybrid_pmu(event->pmu)->cpu_type == hybrid_big);
+ WARN_ON_ONCE(hybrid_pmu(event->pmu)->pmu_type == hybrid_big);
dse &= PERF_PEBS_DATA_SOURCE_MASK;
val = hybrid_var(event->pmu, pebs_data_source)[dse];
@@ -1058,7 +1058,7 @@ struct event_constraint intel_icl_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
-struct event_constraint intel_spr_pebs_event_constraints[] = {
+struct event_constraint intel_glc_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
@@ -1236,11 +1236,11 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
struct pmu *pmu = event->pmu;
/*
- * Make sure we get updated with the first PEBS
- * event. It will trigger also during removal, but
- * that does not hurt:
+ * Make sure we get updated with the first PEBS event.
+ * During removal, ->pebs_data_cfg is still valid for
+ * the last PEBS event. Don't clear it.
*/
- if (cpuc->n_pebs == 1)
+ if ((cpuc->n_pebs == 1) && add)
cpuc->pebs_data_cfg = PEBS_UPDATE_DS_SW;
if (needed_cb != pebs_needs_sched_cb(cpuc)) {
@@ -1755,7 +1755,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
setup_pebs_time(event, data, pebs->tsc);
if (has_branch_stack(event))
- perf_sample_save_brstack(data, event, &cpuc->lbr_stack);
+ perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
}
static void adaptive_pebs_save_regs(struct pt_regs *regs,
@@ -1912,7 +1912,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
if (has_branch_stack(event)) {
intel_pmu_store_pebs_lbrs(lbr);
- perf_sample_save_brstack(data, event, &cpuc->lbr_stack);
+ intel_pmu_lbr_save_brstack(data, cpuc, event);
}
}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index c3b0d15a9..78cd50841 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -676,6 +676,25 @@ void intel_pmu_lbr_del(struct perf_event *event)
WARN_ON_ONCE(cpuc->lbr_users < 0);
WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
perf_sched_cb_dec(event->pmu);
+
+ /*
+ * The logged occurrences information is only valid for the
+ * current LBR group. If another LBR group is scheduled in
+ * later, the information from the stale LBRs will be wrongly
+ * interpreted. Reset the LBRs here.
+ *
+ * Only clear once for a branch counter group with the leader
+ * event. Because
+ * - Cannot simply reset the LBRs with the !cpuc->lbr_users.
+ * Because it's possible that the last LBR user is not in a
+ * branch counter group, e.g., a branch_counters group +
+ * several normal LBR events.
+ * - The LBR reset can be done with any one of the events in a
+ * branch counter group, since they are always scheduled together.
+ * It's easy to force the leader event an LBR event.
+ */
+ if (is_branch_counters_group(event) && event == event->group_leader)
+ intel_pmu_lbr_reset();
}
static inline bool vlbr_exclude_host(void)
@@ -866,6 +885,8 @@ static __always_inline u16 get_lbr_cycles(u64 info)
return cycles;
}
+static_assert((64 - PERF_BRANCH_ENTRY_INFO_BITS_MAX) > LBR_INFO_BR_CNTR_NUM * LBR_INFO_BR_CNTR_BITS);
+
static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
struct lbr_entry *entries)
{
@@ -898,11 +919,67 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
e->abort = !!(info & LBR_INFO_ABORT);
e->cycles = get_lbr_cycles(info);
e->type = get_lbr_br_type(info);
+
+ /*
+ * Leverage the reserved field of cpuc->lbr_entries[i] to
+ * temporarily store the branch counters information.
+ * The later code will decide what content can be disclosed
+ * to the perf tool. Pleae see intel_pmu_lbr_counters_reorder().
+ */
+ e->reserved = (info >> LBR_INFO_BR_CNTR_OFFSET) & LBR_INFO_BR_CNTR_FULL_MASK;
}
cpuc->lbr_stack.nr = i;
}
+/*
+ * The enabled order may be different from the counter order.
+ * Update the lbr_counters with the enabled order.
+ */
+static void intel_pmu_lbr_counters_reorder(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ int i, j, pos = 0, order[X86_PMC_IDX_MAX];
+ struct perf_event *leader, *sibling;
+ u64 src, dst, cnt;
+
+ leader = event->group_leader;
+ if (branch_sample_counters(leader))
+ order[pos++] = leader->hw.idx;
+
+ for_each_sibling_event(sibling, leader) {
+ if (!branch_sample_counters(sibling))
+ continue;
+ order[pos++] = sibling->hw.idx;
+ }
+
+ WARN_ON_ONCE(!pos);
+
+ for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+ src = cpuc->lbr_entries[i].reserved;
+ dst = 0;
+ for (j = 0; j < pos; j++) {
+ cnt = (src >> (order[j] * LBR_INFO_BR_CNTR_BITS)) & LBR_INFO_BR_CNTR_MASK;
+ dst |= cnt << j * LBR_INFO_BR_CNTR_BITS;
+ }
+ cpuc->lbr_counters[i] = dst;
+ cpuc->lbr_entries[i].reserved = 0;
+ }
+}
+
+void intel_pmu_lbr_save_brstack(struct perf_sample_data *data,
+ struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ if (is_branch_counters_group(event)) {
+ intel_pmu_lbr_counters_reorder(cpuc, event);
+ perf_sample_save_brstack(data, event, &cpuc->lbr_stack, cpuc->lbr_counters);
+ return;
+ }
+
+ perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
+}
+
static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc)
{
intel_pmu_store_lbr(cpuc, NULL);
@@ -1173,8 +1250,10 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
for (i = 0; i < cpuc->lbr_stack.nr; ) {
if (!cpuc->lbr_entries[i].from) {
j = i;
- while (++j < cpuc->lbr_stack.nr)
+ while (++j < cpuc->lbr_stack.nr) {
cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
+ cpuc->lbr_counters[j-1] = cpuc->lbr_counters[j];
+ }
cpuc->lbr_stack.nr--;
if (!cpuc->lbr_entries[i].from)
continue;
@@ -1525,8 +1604,12 @@ void __init intel_pmu_arch_lbr_init(void)
x86_pmu.lbr_mispred = ecx.split.lbr_mispred;
x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr;
x86_pmu.lbr_br_type = ecx.split.lbr_br_type;
+ x86_pmu.lbr_counters = ecx.split.lbr_counters;
x86_pmu.lbr_nr = lbr_nr;
+ if (!!x86_pmu.lbr_counters)
+ x86_pmu.flags |= PMU_FL_BR_CNTR;
+
if (x86_pmu.lbr_mispred)
static_branch_enable(&x86_lbr_mispred);
if (x86_pmu.lbr_timed_lbr)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 42a557940..8e2a12235 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -736,6 +736,7 @@ static bool topa_table_full(struct topa *topa)
/**
* topa_insert_pages() - create a list of ToPA tables
* @buf: PT buffer being initialized.
+ * @cpu: CPU on which to allocate.
* @gfp: Allocation flags.
*
* This initializes a list of ToPA tables with entries from
@@ -1207,8 +1208,11 @@ static void pt_buffer_fini_topa(struct pt_buffer *buf)
/**
* pt_buffer_init_topa() - initialize ToPA table for pt buffer
* @buf: PT buffer.
- * @size: Total size of all regions within this ToPA.
+ * @cpu: CPU on which to allocate.
+ * @nr_pages: No. of pages to allocate.
* @gfp: Allocation flags.
+ *
+ * Return: 0 on success or error code.
*/
static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu,
unsigned long nr_pages, gfp_t gfp)
@@ -1281,7 +1285,7 @@ out:
/**
* pt_buffer_setup_aux() - set up topa tables for a PT buffer
- * @cpu: Cpu on which to allocate, -1 means current.
+ * @event: Performance event
* @pages: Array of pointers to buffer pages passed from perf core.
* @nr_pages: Number of pages in the buffer.
* @snapshot: If this is a snapshot/overwrite counter.
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 69043e02e..7927c0b83 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -74,7 +74,7 @@ int uncore_device_to_die(struct pci_dev *dev)
struct cpuinfo_x86 *c = &cpu_data(cpu);
if (c->initialized && cpu_to_node(cpu) == node)
- return c->logical_die_id;
+ return c->topo.logical_die_id;
}
return -1;
@@ -1814,6 +1814,14 @@ static const struct intel_uncore_init_fun spr_uncore_init __initconst = {
.uncore_units_ignore = spr_uncore_units_ignore,
};
+static const struct intel_uncore_init_fun gnr_uncore_init __initconst = {
+ .cpu_init = gnr_uncore_cpu_init,
+ .pci_init = gnr_uncore_pci_init,
+ .mmio_init = gnr_uncore_mmio_init,
+ .use_discovery = true,
+ .uncore_units_ignore = gnr_uncore_units_ignore,
+};
+
static const struct intel_uncore_init_fun generic_uncore_init __initconst = {
.cpu_init = intel_uncore_generic_uncore_cpu_init,
.pci_init = intel_uncore_generic_uncore_pci_init,
@@ -1865,8 +1873,12 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &mtl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &spr_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &gnr_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &gnr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &gnr_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &gnr_uncore_init),
{},
};
MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match);
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index c30fb5bb1..4838502d8 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -72,9 +72,9 @@ struct intel_uncore_type {
unsigned single_fixed:1;
unsigned pair_ctr_ctl:1;
union {
- unsigned *msr_offsets;
- unsigned *pci_offsets;
- unsigned *mmio_offsets;
+ u64 *msr_offsets;
+ u64 *pci_offsets;
+ u64 *mmio_offsets;
};
unsigned *box_ids;
struct event_constraint unconstrainted;
@@ -593,6 +593,7 @@ extern struct list_head pci2phy_map_head;
extern struct pci_extra_dev *uncore_extra_pci_dev;
extern struct event_constraint uncore_constraint_empty;
extern int spr_uncore_units_ignore[];
+extern int gnr_uncore_units_ignore[];
/* uncore_snb.c */
int snb_uncore_pci_init(void);
@@ -634,6 +635,9 @@ void icx_uncore_mmio_init(void);
int spr_uncore_pci_init(void);
void spr_uncore_cpu_init(void);
void spr_uncore_mmio_init(void);
+int gnr_uncore_pci_init(void);
+void gnr_uncore_cpu_init(void);
+void gnr_uncore_mmio_init(void);
/* uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
index cb488e418..9a698a929 100644
--- a/arch/x86/events/intel/uncore_discovery.c
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -125,7 +125,8 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit,
int die, bool parsed)
{
struct intel_uncore_discovery_type *type;
- unsigned int *box_offset, *ids;
+ unsigned int *ids;
+ u64 *box_offset;
int i;
if (!unit->ctl || !unit->ctl_offset || !unit->ctr_offset) {
@@ -153,7 +154,7 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit,
if (!type)
return;
- box_offset = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL);
+ box_offset = kcalloc(type->num_boxes + 1, sizeof(u64), GFP_KERNEL);
if (!box_offset)
return;
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
index 6ee80ad34..22e769a81 100644
--- a/arch/x86/events/intel/uncore_discovery.h
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -125,7 +125,7 @@ struct intel_uncore_discovery_type {
u8 ctr_offset; /* Counter 0 offset */
u16 num_boxes; /* number of boxes for the uncore block */
unsigned int *ids; /* Box IDs */
- unsigned int *box_offset; /* Box offset */
+ u64 *box_offset; /* Box offset */
};
bool intel_uncore_has_discovery_tables(int *ignore);
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
index 173e2674b..56eea2c66 100644
--- a/arch/x86/events/intel/uncore_nhmex.c
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -306,7 +306,7 @@ static const struct attribute_group nhmex_uncore_cbox_format_group = {
};
/* msr offset for each instance of cbox */
-static unsigned nhmex_cbox_msr_offsets[] = {
+static u64 nhmex_cbox_msr_offsets[] = {
0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
};
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 49bc27ab2..a96496bef 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1396,6 +1396,29 @@ err:
return ret;
}
+static int topology_gidnid_map(int nodeid, u32 gidnid)
+{
+ int i, die_id = -1;
+
+ /*
+ * every three bits in the Node ID mapping register maps
+ * to a particular node.
+ */
+ for (i = 0; i < 8; i++) {
+ if (nodeid == GIDNIDMAP(gidnid, i)) {
+ if (topology_max_die_per_package() > 1)
+ die_id = i;
+ else
+ die_id = topology_phys_to_logical_pkg(i);
+ if (die_id < 0)
+ die_id = -ENODEV;
+ break;
+ }
+ }
+
+ return die_id;
+}
+
/*
* build pci bus to socket mapping
*/
@@ -1435,22 +1458,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
break;
}
- /*
- * every three bits in the Node ID mapping register maps
- * to a particular node.
- */
- for (i = 0; i < 8; i++) {
- if (nodeid == GIDNIDMAP(config, i)) {
- if (topology_max_die_per_package() > 1)
- die_id = i;
- else
- die_id = topology_phys_to_logical_pkg(i);
- if (die_id < 0)
- die_id = -ENODEV;
- map->pbus_to_dieid[bus] = die_id;
- break;
- }
- }
+ map->pbus_to_dieid[bus] = topology_gidnid_map(nodeid, config);
raw_spin_unlock(&pci2phy_map_lock);
} else {
segment = pci_domain_nr(ubox_dev->bus);
@@ -5278,7 +5286,7 @@ void snr_uncore_mmio_init(void)
/* ICX uncore support */
-static unsigned icx_cha_msr_offsets[] = {
+static u64 icx_cha_msr_offsets[] = {
0x2a0, 0x2ae, 0x2bc, 0x2ca, 0x2d8, 0x2e6, 0x2f4, 0x302, 0x310,
0x31e, 0x32c, 0x33a, 0x348, 0x356, 0x364, 0x372, 0x380, 0x38e,
0x3aa, 0x3b8, 0x3c6, 0x3d4, 0x3e2, 0x3f0, 0x3fe, 0x40c, 0x41a,
@@ -5326,7 +5334,7 @@ static struct intel_uncore_type icx_uncore_chabox = {
.format_group = &snr_uncore_chabox_format_group,
};
-static unsigned icx_msr_offsets[] = {
+static u64 icx_msr_offsets[] = {
0x0, 0x20, 0x40, 0x90, 0xb0, 0xd0,
};
@@ -5596,7 +5604,7 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i
struct pci_dev *ubox = NULL;
struct pci_dev *dev = NULL;
u32 nid, gid;
- int i, idx, lgc_pkg, ret = -EPERM;
+ int idx, lgc_pkg, ret = -EPERM;
struct intel_uncore_topology *upi;
unsigned int devfn;
@@ -5611,27 +5619,22 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i
break;
}
- for (i = 0; i < 8; i++) {
- if (nid != GIDNIDMAP(gid, i))
- continue;
- lgc_pkg = topology_phys_to_logical_pkg(i);
- if (lgc_pkg < 0) {
- ret = -EPERM;
- goto err;
- }
- for (idx = 0; idx < type->num_boxes; idx++) {
- upi = &type->topology[lgc_pkg][idx];
- devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION);
- dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus),
- ubox->bus->number,
- devfn);
- if (dev) {
- ret = upi_fill_topology(dev, upi, idx);
- if (ret)
- goto err;
- }
+ lgc_pkg = topology_gidnid_map(nid, gid);
+ if (lgc_pkg < 0) {
+ ret = -EPERM;
+ goto err;
+ }
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ upi = &type->topology[lgc_pkg][idx];
+ devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION);
+ dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus),
+ ubox->bus->number,
+ devfn);
+ if (dev) {
+ ret = upi_fill_topology(dev, upi, idx);
+ if (ret)
+ goto err;
}
- break;
}
}
err:
@@ -6085,13 +6088,16 @@ static struct uncore_event_desc spr_uncore_imc_events[] = {
{ /* end: all zeroes */ },
};
+#define SPR_UNCORE_MMIO_COMMON_FORMAT() \
+ SPR_UNCORE_COMMON_FORMAT(), \
+ .ops = &spr_uncore_mmio_ops
+
static struct intel_uncore_type spr_uncore_imc = {
- SPR_UNCORE_COMMON_FORMAT(),
+ SPR_UNCORE_MMIO_COMMON_FORMAT(),
.name = "imc",
.fixed_ctr_bits = 48,
.fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
.fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL,
- .ops = &spr_uncore_mmio_ops,
.event_descs = spr_uncore_imc_events,
};
@@ -6187,7 +6193,7 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = {
*/
#define SPR_UNCORE_UPI_NUM_BOXES 4
-static unsigned int spr_upi_pci_offsets[SPR_UNCORE_UPI_NUM_BOXES] = {
+static u64 spr_upi_pci_offsets[SPR_UNCORE_UPI_NUM_BOXES] = {
0, 0x8000, 0x10000, 0x18000
};
@@ -6418,7 +6424,8 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type,
static struct intel_uncore_type **
uncore_get_uncores(enum uncore_access_type type_id, int num_extra,
- struct intel_uncore_type **extra)
+ struct intel_uncore_type **extra, int max_num_types,
+ struct intel_uncore_type **uncores)
{
struct intel_uncore_type **types, **start_types;
int i;
@@ -6427,9 +6434,9 @@ uncore_get_uncores(enum uncore_access_type type_id, int num_extra,
/* Only copy the customized features */
for (; *types; types++) {
- if ((*types)->type_id >= UNCORE_SPR_NUM_UNCORE_TYPES)
+ if ((*types)->type_id >= max_num_types)
continue;
- uncore_type_customized_copy(*types, spr_uncores[(*types)->type_id]);
+ uncore_type_customized_copy(*types, uncores[(*types)->type_id]);
}
for (i = 0; i < num_extra; i++, types++)
@@ -6476,7 +6483,9 @@ void spr_uncore_cpu_init(void)
uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR,
UNCORE_SPR_MSR_EXTRA_UNCORES,
- spr_msr_uncores);
+ spr_msr_uncores,
+ UNCORE_SPR_NUM_UNCORE_TYPES,
+ spr_uncores);
type = uncore_find_type_by_id(uncore_msr_uncores, UNCORE_SPR_CHA);
if (type) {
@@ -6558,7 +6567,9 @@ int spr_uncore_pci_init(void)
spr_update_device_location(UNCORE_SPR_M3UPI);
uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI,
UNCORE_SPR_PCI_EXTRA_UNCORES,
- spr_pci_uncores);
+ spr_pci_uncores,
+ UNCORE_SPR_NUM_UNCORE_TYPES,
+ spr_uncores);
return 0;
}
@@ -6566,15 +6577,116 @@ void spr_uncore_mmio_init(void)
{
int ret = snbep_pci2phy_map_init(0x3250, SKX_CPUNODEID, SKX_GIDNIDMAP, true);
- if (ret)
- uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL);
- else {
+ if (ret) {
+ uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL,
+ UNCORE_SPR_NUM_UNCORE_TYPES,
+ spr_uncores);
+ } else {
uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO,
UNCORE_SPR_MMIO_EXTRA_UNCORES,
- spr_mmio_uncores);
+ spr_mmio_uncores,
+ UNCORE_SPR_NUM_UNCORE_TYPES,
+ spr_uncores);
spr_uncore_imc_free_running.num_boxes = uncore_type_max_boxes(uncore_mmio_uncores, UNCORE_SPR_IMC) / 2;
}
}
/* end of SPR uncore support */
+
+/* GNR uncore support */
+
+#define UNCORE_GNR_NUM_UNCORE_TYPES 23
+#define UNCORE_GNR_TYPE_15 15
+#define UNCORE_GNR_B2UPI 18
+#define UNCORE_GNR_TYPE_21 21
+#define UNCORE_GNR_TYPE_22 22
+
+int gnr_uncore_units_ignore[] = {
+ UNCORE_SPR_UPI,
+ UNCORE_GNR_TYPE_15,
+ UNCORE_GNR_B2UPI,
+ UNCORE_GNR_TYPE_21,
+ UNCORE_GNR_TYPE_22,
+ UNCORE_IGNORE_END
+};
+
+static struct intel_uncore_type gnr_uncore_ubox = {
+ .name = "ubox",
+ .attr_update = uncore_alias_groups,
+};
+
+static struct intel_uncore_type gnr_uncore_b2cmi = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "b2cmi",
+};
+
+static struct intel_uncore_type gnr_uncore_b2cxl = {
+ SPR_UNCORE_MMIO_COMMON_FORMAT(),
+ .name = "b2cxl",
+};
+
+static struct intel_uncore_type gnr_uncore_mdf_sbo = {
+ .name = "mdf_sbo",
+ .attr_update = uncore_alias_groups,
+};
+
+static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = {
+ &spr_uncore_chabox,
+ &spr_uncore_iio,
+ &spr_uncore_irp,
+ NULL,
+ &spr_uncore_pcu,
+ &gnr_uncore_ubox,
+ &spr_uncore_imc,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ &gnr_uncore_b2cmi,
+ &gnr_uncore_b2cxl,
+ NULL,
+ NULL,
+ &gnr_uncore_mdf_sbo,
+ NULL,
+ NULL,
+};
+
+static struct freerunning_counters gnr_iio_freerunning[] = {
+ [SPR_IIO_MSR_IOCLK] = { 0x290e, 0x01, 0x10, 1, 48 },
+ [SPR_IIO_MSR_BW_IN] = { 0x360e, 0x10, 0x80, 8, 48 },
+ [SPR_IIO_MSR_BW_OUT] = { 0x2e0e, 0x10, 0x80, 8, 48 },
+};
+
+void gnr_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR,
+ UNCORE_SPR_MSR_EXTRA_UNCORES,
+ spr_msr_uncores,
+ UNCORE_GNR_NUM_UNCORE_TYPES,
+ gnr_uncores);
+ spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO);
+ spr_uncore_iio_free_running.freerunning = gnr_iio_freerunning;
+}
+
+int gnr_uncore_pci_init(void)
+{
+ uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL,
+ UNCORE_GNR_NUM_UNCORE_TYPES,
+ gnr_uncores);
+ return 0;
+}
+
+void gnr_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL,
+ UNCORE_GNR_NUM_UNCORE_TYPES,
+ gnr_uncores);
+}
+
+/* end of GNR uncore support */
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index c8ba2be75..fb5651835 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -110,6 +110,11 @@ static inline bool is_topdown_event(struct perf_event *event)
return is_metric_event(event) || is_slots_event(event);
}
+static inline bool is_branch_counters_group(struct perf_event *event)
+{
+ return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS;
+}
+
struct amd_nb {
int nb_id; /* NorthBridge id */
int refcnt; /* reference count */
@@ -283,6 +288,7 @@ struct cpu_hw_events {
int lbr_pebs_users;
struct perf_branch_stack lbr_stack;
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
+ u64 lbr_counters[MAX_LBR_ENTRIES]; /* branch stack extra */
union {
struct er_account *lbr_sel;
struct er_account *lbr_ctl;
@@ -652,10 +658,29 @@ enum {
#define PERF_PEBS_DATA_SOURCE_MAX 0x10
#define PERF_PEBS_DATA_SOURCE_MASK (PERF_PEBS_DATA_SOURCE_MAX - 1)
+enum hybrid_cpu_type {
+ HYBRID_INTEL_NONE,
+ HYBRID_INTEL_ATOM = 0x20,
+ HYBRID_INTEL_CORE = 0x40,
+};
+
+enum hybrid_pmu_type {
+ not_hybrid,
+ hybrid_small = BIT(0),
+ hybrid_big = BIT(1),
+
+ hybrid_big_small = hybrid_big | hybrid_small, /* only used for matching */
+};
+
+#define X86_HYBRID_PMU_ATOM_IDX 0
+#define X86_HYBRID_PMU_CORE_IDX 1
+
+#define X86_HYBRID_NUM_PMUS 2
+
struct x86_hybrid_pmu {
struct pmu pmu;
const char *name;
- u8 cpu_type;
+ enum hybrid_pmu_type pmu_type;
cpumask_t supported_cpus;
union perf_capabilities intel_cap;
u64 intel_ctrl;
@@ -721,18 +746,6 @@ extern struct static_key_false perf_is_hybrid;
__Fp; \
})
-enum hybrid_pmu_type {
- hybrid_big = 0x40,
- hybrid_small = 0x20,
-
- hybrid_big_small = hybrid_big | hybrid_small,
-};
-
-#define X86_HYBRID_PMU_ATOM_IDX 0
-#define X86_HYBRID_PMU_CORE_IDX 1
-
-#define X86_HYBRID_NUM_PMUS 2
-
/*
* struct x86_pmu - generic x86 pmu
*/
@@ -881,6 +894,7 @@ struct x86_pmu {
unsigned int lbr_mispred:1;
unsigned int lbr_timed_lbr:1;
unsigned int lbr_br_type:1;
+ unsigned int lbr_counters:4;
void (*lbr_reset)(void);
void (*lbr_read)(struct cpu_hw_events *cpuc);
@@ -940,7 +954,7 @@ struct x86_pmu {
*/
int num_hybrid_pmus;
struct x86_hybrid_pmu *hybrid_pmu;
- u8 (*get_hybrid_cpu_type) (void);
+ enum hybrid_cpu_type (*get_hybrid_cpu_type) (void);
};
struct x86_perf_task_context_opt {
@@ -1005,6 +1019,7 @@ do { \
#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */
#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */
#define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */
+#define PMU_FL_BR_CNTR 0x400 /* Support branch counter logging */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -1521,7 +1536,7 @@ extern struct event_constraint intel_skl_pebs_event_constraints[];
extern struct event_constraint intel_icl_pebs_event_constraints[];
-extern struct event_constraint intel_spr_pebs_event_constraints[];
+extern struct event_constraint intel_glc_pebs_event_constraints[];
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
@@ -1545,6 +1560,10 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
void intel_ds_init(void);
+void intel_pmu_lbr_save_brstack(struct perf_sample_data *data,
+ struct cpu_hw_events *cpuc,
+ struct perf_event *event);
+
void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc);
diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h
index 1dc19b9b4..6c977c19f 100644
--- a/arch/x86/events/perf_event_flags.h
+++ b/arch/x86/events/perf_event_flags.h
@@ -20,3 +20,5 @@ PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */
PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */
PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */
PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */
+PERF_ARCH(NEEDS_BRANCH_STACK, 0x40000) /* require branch stack setup */
+PERF_ARCH(BRANCH_COUNTERS, 0x80000) /* logs the counters in the extra space of each branch */
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 157942984..8d98d468b 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -115,7 +115,7 @@ struct rapl_pmu {
struct rapl_pmus {
struct pmu pmu;
unsigned int maxdie;
- struct rapl_pmu *pmus[];
+ struct rapl_pmu *pmus[] __counted_by(maxdie);
};
enum rapl_unit_quirk {
@@ -179,15 +179,11 @@ static u64 rapl_event_update(struct perf_event *event)
s64 delta, sdelta;
int shift = RAPL_CNTR_WIDTH;
-again:
prev_raw_count = local64_read(&hwc->prev_count);
- rdmsrl(event->hw.event_base, new_raw_count);
-
- if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
- new_raw_count) != prev_raw_count) {
- cpu_relax();
- goto again;
- }
+ do {
+ rdmsrl(event->hw.event_base, new_raw_count);
+ } while (!local64_try_cmpxchg(&hwc->prev_count,
+ &prev_raw_count, new_raw_count));
/*
* Now we have the new raw value and have updated the prev
@@ -537,11 +533,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
* - want to use same event codes across both architectures
*/
static struct perf_msr amd_rapl_msrs[] = {
- [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, 0, false, 0 },
+ [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 },
[PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
- [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, 0, false, 0 },
- [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, 0, false, 0 },
- [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, 0, false, 0 },
+ [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 },
+ [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, NULL, false, 0 },
+ [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
};
static int rapl_cpu_offline(unsigned int cpu)
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 97bfe5f05..5fc45543e 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -209,7 +209,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector,
/*
* This particular version of the IPI hypercall can
- * only target upto 64 CPUs.
+ * only target up to 64 CPUs.
*/
if (vcpu >= 64)
goto do_ex_hypercall;
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 999f5ac82..c8062975a 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -12,10 +12,16 @@
#include <asm/i8259.h>
#include <asm/mshyperv.h>
#include <asm/realmode.h>
+#include <../kernel/smpboot.h>
extern struct boot_params boot_params;
static struct real_mode_header hv_vtl_real_mode_header;
+static bool __init hv_vtl_msi_ext_dest_id(void)
+{
+ return true;
+}
+
void __init hv_vtl_init_platform(void)
{
pr_info("Linux runs in Hyper-V Virtual Trust Level\n");
@@ -38,6 +44,8 @@ void __init hv_vtl_init_platform(void)
x86_platform.legacy.warm_reset = 0;
x86_platform.legacy.reserve_bios_regions = 0;
x86_platform.legacy.devices.pnpbios = 0;
+
+ x86_init.hyper.msi_ext_dest_id = hv_vtl_msi_ext_dest_id;
}
static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc)
@@ -57,7 +65,7 @@ static void hv_vtl_ap_entry(void)
((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params);
}
-static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored)
+static int hv_vtl_bringup_vcpu(u32 target_vp_index, int cpu, u64 eip_ignored)
{
u64 status;
int ret = 0;
@@ -71,7 +79,9 @@ static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored)
struct ldttss_desc *ldt;
struct desc_struct *gdt;
- u64 rsp = current->thread.sp;
+ struct task_struct *idle = idle_thread_get(cpu);
+ u64 rsp = (unsigned long)idle->thread.sp;
+
u64 rip = (u64)&hv_vtl_ap_entry;
native_store_gdt(&gdt_ptr);
@@ -196,9 +206,17 @@ static int hv_vtl_apicid_to_vp_id(u32 apic_id)
return ret;
}
-static int hv_vtl_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
+static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
{
- int vp_id;
+ int vp_id, cpu;
+
+ /* Find the logical CPU for the APIC ID */
+ for_each_present_cpu(cpu) {
+ if (arch_match_cpu_phys_id(cpu, apicid))
+ break;
+ }
+ if (cpu >= nr_cpu_ids)
+ return -EINVAL;
pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid);
vp_id = hv_vtl_apicid_to_vp_id(apicid);
@@ -212,7 +230,7 @@ static int hv_vtl_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
return -EINVAL;
}
- return hv_vtl_bringup_vcpu(vp_id, start_eip);
+ return hv_vtl_bringup_vcpu(vp_id, cpu, start_eip);
}
int __init hv_vtl_early_init(void)
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 42c70d28e..3215a4a07 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -212,7 +212,7 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
* This interrupt is already mapped. Let's unmap first.
*
* We don't use retarget interrupt hypercalls here because
- * Microsoft Hypervisor doens't allow root to change the vector
+ * Microsoft Hypervisor doesn't allow root to change the vector
* or specify VPs outside of the set that is initially used
* during mapping.
*/
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 8c6bf07f7..768d73de0 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -15,6 +15,7 @@
#include <asm/io.h>
#include <asm/coco.h>
#include <asm/mem_encrypt.h>
+#include <asm/set_memory.h>
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/mtrr.h>
@@ -144,7 +145,7 @@ void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason)
/* Tell the hypervisor what went wrong. */
val |= GHCB_SEV_TERM_REASON(set, reason);
- /* Request Guest Termination from Hypvervisor */
+ /* Request Guest Termination from Hypervisor */
wr_ghcb_msr(val);
VMGEXIT();
@@ -288,7 +289,7 @@ static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
free_page((unsigned long)vmsa);
}
-int hv_snp_boot_ap(int cpu, unsigned long start_ip)
+int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
{
struct sev_es_save_area *vmsa = (struct sev_es_save_area *)
__get_free_page(GFP_KERNEL | __GFP_ZERO);
@@ -384,7 +385,7 @@ static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
#ifdef CONFIG_INTEL_TDX_GUEST
static void hv_tdx_msr_write(u64 msr, u64 val)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = EXIT_REASON_MSR_WRITE,
.r12 = msr,
@@ -398,13 +399,13 @@ static void hv_tdx_msr_write(u64 msr, u64 val)
static void hv_tdx_msr_read(u64 msr, u64 *val)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = EXIT_REASON_MSR_READ,
.r12 = msr,
};
- u64 ret = __tdx_hypercall_ret(&args);
+ u64 ret = __tdx_hypercall(&args);
if (WARN_ONCE(ret, "Failed to emulate MSR read: %lld\n", ret))
*val = 0;
@@ -414,13 +415,13 @@ static void hv_tdx_msr_read(u64 msr, u64 *val)
u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
{
- struct tdx_hypercall_args args = { };
+ struct tdx_module_args args = { };
args.r10 = control;
args.rdx = param1;
args.r8 = param2;
- (void)__tdx_hypercall_ret(&args);
+ (void)__tdx_hypercall(&args);
return args.r11;
}
@@ -503,6 +504,31 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
}
/*
+ * When transitioning memory between encrypted and decrypted, the caller
+ * of set_memory_encrypted() or set_memory_decrypted() is responsible for
+ * ensuring that the memory isn't in use and isn't referenced while the
+ * transition is in progress. The transition has multiple steps, and the
+ * memory is in an inconsistent state until all steps are complete. A
+ * reference while the state is inconsistent could result in an exception
+ * that can't be cleanly fixed up.
+ *
+ * But the Linux kernel load_unaligned_zeropad() mechanism could cause a
+ * stray reference that can't be prevented by the caller, so Linux has
+ * specific code to handle this case. But when the #VC and #VE exceptions
+ * routed to a paravisor, the specific code doesn't work. To avoid this
+ * problem, mark the pages as "not present" while the transition is in
+ * progress. If load_unaligned_zeropad() causes a stray reference, a normal
+ * page fault is generated instead of #VC or #VE, and the page-fault-based
+ * handlers for load_unaligned_zeropad() resolve the reference. When the
+ * transition is complete, hv_vtom_set_host_visibility() marks the pages
+ * as "present" again.
+ */
+static bool hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc)
+{
+ return !set_memory_np(kbuffer, pagecount);
+}
+
+/*
* hv_vtom_set_host_visibility - Set specified memory visible to host.
*
* In Isolation VM, all guest memory is encrypted from host and guest
@@ -515,16 +541,28 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo
enum hv_mem_host_visibility visibility = enc ?
VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE;
u64 *pfn_array;
+ phys_addr_t paddr;
+ void *vaddr;
int ret = 0;
bool result = true;
int i, pfn;
pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
- if (!pfn_array)
- return false;
+ if (!pfn_array) {
+ result = false;
+ goto err_set_memory_p;
+ }
for (i = 0, pfn = 0; i < pagecount; i++) {
- pfn_array[pfn] = virt_to_hvpfn((void *)kbuffer + i * HV_HYP_PAGE_SIZE);
+ /*
+ * Use slow_virt_to_phys() because the PRESENT bit has been
+ * temporarily cleared in the PTEs. slow_virt_to_phys() works
+ * without the PRESENT bit while virt_to_hvpfn() or similar
+ * does not.
+ */
+ vaddr = (void *)kbuffer + (i * HV_HYP_PAGE_SIZE);
+ paddr = slow_virt_to_phys(vaddr);
+ pfn_array[pfn] = paddr >> HV_HYP_PAGE_SHIFT;
pfn++;
if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) {
@@ -538,14 +576,30 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo
}
}
- err_free_pfn_array:
+err_free_pfn_array:
kfree(pfn_array);
+
+err_set_memory_p:
+ /*
+ * Set the PTE PRESENT bits again to revert what hv_vtom_clear_present()
+ * did. Do this even if there is an error earlier in this function in
+ * order to avoid leaving the memory range in a "broken" state. Setting
+ * the PRESENT bits shouldn't fail, but return an error if it does.
+ */
+ if (set_memory_p(kbuffer, pagecount))
+ result = false;
+
return result;
}
static bool hv_vtom_tlb_flush_required(bool private)
{
- return true;
+ /*
+ * Since hv_vtom_clear_present() marks the PTEs as "not present"
+ * and flushes the TLB, they can't be in the TLB. That makes the
+ * flush controlled by this function redundant, so return "false".
+ */
+ return false;
}
static bool hv_vtom_cache_flush_required(void)
@@ -608,6 +662,7 @@ void __init hv_vtom_init(void)
x86_platform.hyper.is_private_mmio = hv_is_private_mmio;
x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required;
x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required;
+ x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present;
x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
/* Set WB as the default cache mode. */
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4f1ce5fc4..a192bdea6 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,5 +10,4 @@ generated-y += unistd_64_x32.h
generated-y += xen-hypercalls.h
generic-y += early_ioremap.h
-generic-y += export.h
generic-y += mcs_spinlock.h
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 65f79092c..fcd20c6dc 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -10,6 +10,9 @@
#define ALT_FLAG_NOT (1 << 0)
#define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature))
+#define ALT_FLAG_DIRECT_CALL (1 << 1)
+#define ALT_DIRECT_CALL(feature) ((ALT_FLAG_DIRECT_CALL << ALT_FLAGS_SHIFT) | (feature))
+#define ALT_CALL_ALWAYS ALT_DIRECT_CALL(X86_FEATURE_ALWAYS)
#ifndef __ASSEMBLY__
@@ -86,6 +89,8 @@ struct alt_instr {
u8 replacementlen; /* length of new instruction */
} __packed;
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+
/*
* Debug flag that can be tested to see whether alternative
* instructions were patched in already:
@@ -101,11 +106,10 @@ extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine,
s32 *start_cfi, s32 *end_cfi);
struct module;
-struct paravirt_patch_site;
struct callthunk_sites {
s32 *call_start, *call_end;
- struct paravirt_patch_site *pv_start, *pv_end;
+ struct alt_instr *alt_start, *alt_end;
};
#ifdef CONFIG_CALL_THUNKS
@@ -150,6 +154,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
}
#endif /* CONFIG_SMP */
+#define ALT_CALL_INSTR "call BUG_func"
+
#define b_replacement(num) "664"#num
#define e_replacement(num) "665"#num
@@ -330,6 +336,22 @@ static inline int alternatives_text_reserved(void *start, void *end)
*/
#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr
+/* Macro for creating assembler functions avoiding any C magic. */
+#define DEFINE_ASM_FUNC(func, instr, sec) \
+ asm (".pushsection " #sec ", \"ax\"\n" \
+ ".global " #func "\n\t" \
+ ".type " #func ", @function\n\t" \
+ ASM_FUNC_ALIGN "\n" \
+ #func ":\n\t" \
+ ASM_ENDBR \
+ instr "\n\t" \
+ ASM_RET \
+ ".size " #func ", . - " #func "\n\t" \
+ ".popsection")
+
+void BUG_func(void);
+void nop_func(void);
+
#else /* __ASSEMBLY__ */
#ifdef CONFIG_SMP
@@ -370,6 +392,10 @@ static inline int alternatives_text_reserved(void *start, void *end)
.byte \alt_len
.endm
+.macro ALT_CALL_INSTR
+ call BUG_func
+.endm
+
/*
* Define an alternative between two instructions. If @feature is
* present, early code in apply_alternatives() replaces @oldinstr with
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index ed0eaf65c..5c37944c8 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -104,7 +104,7 @@ static inline bool amd_gart_present(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return false;
- /* GART present only on Fam15h, upto model 0fh */
+ /* GART present only on Fam15h, up to model 0fh */
if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
(boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
return true;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 5af4ec1a0..dddd3fc19 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -13,6 +13,7 @@
#include <asm/mpspec.h>
#include <asm/msr.h>
#include <asm/hardirq.h>
+#include <asm/io.h>
#define ARCH_APICTIMER_STOPS_ON_C3 1
@@ -54,7 +55,7 @@ extern int local_apic_timer_c2_ok;
extern bool apic_is_disabled;
extern unsigned int lapic_timer_period;
-extern int cpuid_to_apicid[];
+extern u32 cpuid_to_apicid[];
extern enum apic_intr_mode_id apic_intr_mode;
enum apic_intr_mode_id {
@@ -96,7 +97,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
static inline u32 native_apic_mem_read(u32 reg)
{
- return *((volatile u32 *)(APIC_BASE + reg));
+ return readl((void __iomem *)(APIC_BASE + reg));
}
static inline void native_apic_mem_eoi(void)
@@ -272,11 +273,10 @@ struct apic {
void (*send_IPI_all)(int vector);
void (*send_IPI_self)(int vector);
- enum apic_delivery_modes delivery_mode;
-
u32 disable_esr : 1,
dest_mode_logical : 1,
- x2apic_set_max_apicid : 1;
+ x2apic_set_max_apicid : 1,
+ nmi_to_offline_cpu : 1;
u32 (*calc_dest_apicid)(unsigned int cpu);
@@ -292,19 +292,19 @@ struct apic {
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
bool (*apic_id_registered)(void);
- bool (*check_apicid_used)(physid_mask_t *map, int apicid);
+ bool (*check_apicid_used)(physid_mask_t *map, u32 apicid);
void (*init_apic_ldr)(void);
void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
- int (*cpu_present_to_apicid)(int mps_cpu);
- int (*phys_pkg_id)(int cpuid_apic, int index_msb);
+ u32 (*cpu_present_to_apicid)(int mps_cpu);
+ u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb);
- u32 (*get_apic_id)(unsigned long x);
- u32 (*set_apic_id)(unsigned int id);
+ u32 (*get_apic_id)(u32 id);
+ u32 (*set_apic_id)(u32 apicid);
/* wakeup_secondary_cpu */
- int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
+ int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip);
/* wakeup secondary CPU using 64-bit wakeup point */
- int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip);
+ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip);
char *name;
};
@@ -322,8 +322,8 @@ struct apic_override {
void (*send_IPI_self)(int vector);
u64 (*icr_read)(void);
void (*icr_write)(u32 low, u32 high);
- int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
- int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip);
+ int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip);
+ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip);
};
/*
@@ -493,16 +493,6 @@ static inline bool lapic_vector_set_in_irr(unsigned int vector)
return !!(irr & (1U << (vector % 32)));
}
-static inline unsigned default_get_apic_id(unsigned long x)
-{
- unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
-
- if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
- return (x >> 24) & 0xFF;
- else
- return (x >> 24) & 0x0F;
-}
-
/*
* Warm reset vector position:
*/
@@ -517,9 +507,9 @@ extern void generic_bigsmp_probe(void);
extern struct apic apic_noop;
-static inline unsigned int read_apic_id(void)
+static inline u32 read_apic_id(void)
{
- unsigned int reg = apic_read(APIC_ID);
+ u32 reg = apic_read(APIC_ID);
return apic->get_apic_id(reg);
}
@@ -538,13 +528,14 @@ extern int default_apic_id_valid(u32 apicid);
extern u32 apic_default_calc_apicid(unsigned int cpu);
extern u32 apic_flat_calc_apicid(unsigned int cpu);
-extern bool default_check_apicid_used(physid_mask_t *map, int apicid);
extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap);
-extern int default_cpu_present_to_apicid(int mps_cpu);
+extern u32 default_cpu_present_to_apicid(int mps_cpu);
+
+void apic_send_nmi_to_offline_cpu(unsigned int cpu);
#else /* CONFIG_X86_LOCAL_APIC */
-static inline unsigned int read_apic_id(void) { return 0; }
+static inline u32 read_apic_id(void) { return 0; }
#endif /* !CONFIG_X86_LOCAL_APIC */
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 4b125e5b3..094106b6a 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -20,6 +20,13 @@
*/
#define IO_APIC_SLOT_SIZE 1024
+#define APIC_DELIVERY_MODE_FIXED 0
+#define APIC_DELIVERY_MODE_LOWESTPRIO 1
+#define APIC_DELIVERY_MODE_SMI 2
+#define APIC_DELIVERY_MODE_NMI 4
+#define APIC_DELIVERY_MODE_INIT 5
+#define APIC_DELIVERY_MODE_EXTINT 7
+
#define APIC_ID 0x20
#define APIC_LVR 0x30
@@ -165,279 +172,10 @@
#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK)
#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT)
-#ifndef __ASSEMBLY__
-/*
- * the local APIC register structure, memory mapped. Not terribly well
- * tested, but we might eventually use this one in the future - the
- * problem why we cannot use it right now is the P5 APIC, it has an
- * errata which cannot take 8-bit reads and writes, only 32-bit ones ...
- */
-#define u32 unsigned int
-
-struct local_apic {
-
-/*000*/ struct { u32 __reserved[4]; } __reserved_01;
-
-/*010*/ struct { u32 __reserved[4]; } __reserved_02;
-
-/*020*/ struct { /* APIC ID Register */
- u32 __reserved_1 : 24,
- phys_apic_id : 4,
- __reserved_2 : 4;
- u32 __reserved[3];
- } id;
-
-/*030*/ const
- struct { /* APIC Version Register */
- u32 version : 8,
- __reserved_1 : 8,
- max_lvt : 8,
- __reserved_2 : 8;
- u32 __reserved[3];
- } version;
-
-/*040*/ struct { u32 __reserved[4]; } __reserved_03;
-
-/*050*/ struct { u32 __reserved[4]; } __reserved_04;
-
-/*060*/ struct { u32 __reserved[4]; } __reserved_05;
-
-/*070*/ struct { u32 __reserved[4]; } __reserved_06;
-
-/*080*/ struct { /* Task Priority Register */
- u32 priority : 8,
- __reserved_1 : 24;
- u32 __reserved_2[3];
- } tpr;
-
-/*090*/ const
- struct { /* Arbitration Priority Register */
- u32 priority : 8,
- __reserved_1 : 24;
- u32 __reserved_2[3];
- } apr;
-
-/*0A0*/ const
- struct { /* Processor Priority Register */
- u32 priority : 8,
- __reserved_1 : 24;
- u32 __reserved_2[3];
- } ppr;
-
-/*0B0*/ struct { /* End Of Interrupt Register */
- u32 eoi;
- u32 __reserved[3];
- } eoi;
-
-/*0C0*/ struct { u32 __reserved[4]; } __reserved_07;
-
-/*0D0*/ struct { /* Logical Destination Register */
- u32 __reserved_1 : 24,
- logical_dest : 8;
- u32 __reserved_2[3];
- } ldr;
-
-/*0E0*/ struct { /* Destination Format Register */
- u32 __reserved_1 : 28,
- model : 4;
- u32 __reserved_2[3];
- } dfr;
-
-/*0F0*/ struct { /* Spurious Interrupt Vector Register */
- u32 spurious_vector : 8,
- apic_enabled : 1,
- focus_cpu : 1,
- __reserved_2 : 22;
- u32 __reserved_3[3];
- } svr;
-
-/*100*/ struct { /* In Service Register */
-/*170*/ u32 bitfield;
- u32 __reserved[3];
- } isr [8];
-
-/*180*/ struct { /* Trigger Mode Register */
-/*1F0*/ u32 bitfield;
- u32 __reserved[3];
- } tmr [8];
-
-/*200*/ struct { /* Interrupt Request Register */
-/*270*/ u32 bitfield;
- u32 __reserved[3];
- } irr [8];
-
-/*280*/ union { /* Error Status Register */
- struct {
- u32 send_cs_error : 1,
- receive_cs_error : 1,
- send_accept_error : 1,
- receive_accept_error : 1,
- __reserved_1 : 1,
- send_illegal_vector : 1,
- receive_illegal_vector : 1,
- illegal_register_address : 1,
- __reserved_2 : 24;
- u32 __reserved_3[3];
- } error_bits;
- struct {
- u32 errors;
- u32 __reserved_3[3];
- } all_errors;
- } esr;
-
-/*290*/ struct { u32 __reserved[4]; } __reserved_08;
-
-/*2A0*/ struct { u32 __reserved[4]; } __reserved_09;
-
-/*2B0*/ struct { u32 __reserved[4]; } __reserved_10;
-
-/*2C0*/ struct { u32 __reserved[4]; } __reserved_11;
-
-/*2D0*/ struct { u32 __reserved[4]; } __reserved_12;
-
-/*2E0*/ struct { u32 __reserved[4]; } __reserved_13;
-
-/*2F0*/ struct { u32 __reserved[4]; } __reserved_14;
-
-/*300*/ struct { /* Interrupt Command Register 1 */
- u32 vector : 8,
- delivery_mode : 3,
- destination_mode : 1,
- delivery_status : 1,
- __reserved_1 : 1,
- level : 1,
- trigger : 1,
- __reserved_2 : 2,
- shorthand : 2,
- __reserved_3 : 12;
- u32 __reserved_4[3];
- } icr1;
-
-/*310*/ struct { /* Interrupt Command Register 2 */
- union {
- u32 __reserved_1 : 24,
- phys_dest : 4,
- __reserved_2 : 4;
- u32 __reserved_3 : 24,
- logical_dest : 8;
- } dest;
- u32 __reserved_4[3];
- } icr2;
-
-/*320*/ struct { /* LVT - Timer */
- u32 vector : 8,
- __reserved_1 : 4,
- delivery_status : 1,
- __reserved_2 : 3,
- mask : 1,
- timer_mode : 1,
- __reserved_3 : 14;
- u32 __reserved_4[3];
- } lvt_timer;
-
-/*330*/ struct { /* LVT - Thermal Sensor */
- u32 vector : 8,
- delivery_mode : 3,
- __reserved_1 : 1,
- delivery_status : 1,
- __reserved_2 : 3,
- mask : 1,
- __reserved_3 : 15;
- u32 __reserved_4[3];
- } lvt_thermal;
-
-/*340*/ struct { /* LVT - Performance Counter */
- u32 vector : 8,
- delivery_mode : 3,
- __reserved_1 : 1,
- delivery_status : 1,
- __reserved_2 : 3,
- mask : 1,
- __reserved_3 : 15;
- u32 __reserved_4[3];
- } lvt_pc;
-
-/*350*/ struct { /* LVT - LINT0 */
- u32 vector : 8,
- delivery_mode : 3,
- __reserved_1 : 1,
- delivery_status : 1,
- polarity : 1,
- remote_irr : 1,
- trigger : 1,
- mask : 1,
- __reserved_2 : 15;
- u32 __reserved_3[3];
- } lvt_lint0;
-
-/*360*/ struct { /* LVT - LINT1 */
- u32 vector : 8,
- delivery_mode : 3,
- __reserved_1 : 1,
- delivery_status : 1,
- polarity : 1,
- remote_irr : 1,
- trigger : 1,
- mask : 1,
- __reserved_2 : 15;
- u32 __reserved_3[3];
- } lvt_lint1;
-
-/*370*/ struct { /* LVT - Error */
- u32 vector : 8,
- __reserved_1 : 4,
- delivery_status : 1,
- __reserved_2 : 3,
- mask : 1,
- __reserved_3 : 15;
- u32 __reserved_4[3];
- } lvt_error;
-
-/*380*/ struct { /* Timer Initial Count Register */
- u32 initial_count;
- u32 __reserved_2[3];
- } timer_icr;
-
-/*390*/ const
- struct { /* Timer Current Count Register */
- u32 curr_count;
- u32 __reserved_2[3];
- } timer_ccr;
-
-/*3A0*/ struct { u32 __reserved[4]; } __reserved_16;
-
-/*3B0*/ struct { u32 __reserved[4]; } __reserved_17;
-
-/*3C0*/ struct { u32 __reserved[4]; } __reserved_18;
-
-/*3D0*/ struct { u32 __reserved[4]; } __reserved_19;
-
-/*3E0*/ struct { /* Timer Divide Configuration Register */
- u32 divisor : 4,
- __reserved_1 : 28;
- u32 __reserved_2[3];
- } timer_dcr;
-
-/*3F0*/ struct { u32 __reserved[4]; } __reserved_20;
-
-} __attribute__ ((packed));
-
-#undef u32
-
#ifdef CONFIG_X86_32
#define BAD_APICID 0xFFu
#else
#define BAD_APICID 0xFFFFu
#endif
-enum apic_delivery_modes {
- APIC_DELIVERY_MODE_FIXED = 0,
- APIC_DELIVERY_MODE_LOWESTPRIO = 1,
- APIC_DELIVERY_MODE_SMI = 2,
- APIC_DELIVERY_MODE_NMI = 4,
- APIC_DELIVERY_MODE_INIT = 5,
- APIC_DELIVERY_MODE_EXTINT = 7,
-};
-
-#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index b1a98fa38..0e8207451 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -13,6 +13,7 @@
#include <asm/preempt.h>
#include <asm/asm.h>
#include <asm/gsseg.h>
+#include <asm/nospec-branch.h>
#ifndef CONFIG_X86_CMPXCHG64
extern void cmpxchg8b_emu(void);
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index fbcfec4dc..ca8eed1d4 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -113,6 +113,20 @@
#endif
+#ifndef __ASSEMBLY__
+#ifndef __pic__
+static __always_inline __pure void *rip_rel_ptr(void *p)
+{
+ asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p));
+
+ return p;
+}
+#define RIP_REL_REF(var) (*(typeof(&(var)))rip_rel_ptr(&(var)))
+#else
+#define RIP_REL_REF(var) (var)
+#endif
+#endif
+
/*
* Macros to generate condition code outputs from inline assembly,
* The output operand must be type "bool".
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 35389b2af..d0795b5fa 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -79,24 +79,9 @@ do { \
#define __smp_mb__before_atomic() do { } while (0)
#define __smp_mb__after_atomic() do { } while (0)
-#include <asm-generic/barrier.h>
+/* Writing to CR3 provides a full memory barrier in switch_mm(). */
+#define smp_mb__after_switch_mm() do { } while (0)
-/*
- * Make previous memory operations globally visible before
- * a WRMSR.
- *
- * MFENCE makes writes visible, but only affects load/store
- * instructions. WRMSR is unfortunately not a load/store
- * instruction and is unaffected by MFENCE. The LFENCE ensures
- * that the WRMSR is not reordered.
- *
- * Most WRMSRs are full serializing instructions themselves and
- * do not require this barrier. This is only required for the
- * IA32_TSC_DEADLINE and X2APIC MSRs.
- */
-static inline void weak_wrmsr_fence(void)
-{
- asm volatile("mfence; lfence" : : : "memory");
-}
+#include <asm-generic/barrier.h>
#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 2edf68475..990eb686c 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -94,18 +94,17 @@ arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}
-static __always_inline bool
-arch_clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
+static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
+ volatile unsigned long *addr)
{
bool negative;
- asm volatile(LOCK_PREFIX "andb %2,%1"
+ asm volatile(LOCK_PREFIX "xorb %2,%1"
CC_SET(s)
: CC_OUT(s) (negative), WBYTE_ADDR(addr)
- : "ir" ((char) ~(1 << nr)) : "memory");
+ : "iq" ((char)mask) : "memory");
return negative;
}
-#define arch_clear_bit_unlock_is_negative_byte \
- arch_clear_bit_unlock_is_negative_byte
+#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte
static __always_inline void
arch___clear_bit_unlock(long nr, volatile unsigned long *addr)
@@ -293,6 +292,9 @@ static __always_inline unsigned long variable_ffz(unsigned long word)
*/
static __always_inline unsigned long __fls(unsigned long word)
{
+ if (__builtin_constant_p(word))
+ return BITS_PER_LONG - 1 - __builtin_clzl(word);
+
asm("bsr %1,%0"
: "=r" (word)
: "rm" (word));
@@ -360,6 +362,9 @@ static __always_inline int fls(unsigned int x)
{
int r;
+ if (__builtin_constant_p(x))
+ return x ? 32 - __builtin_clz(x) : 0;
+
#ifdef CONFIG_X86_64
/*
* AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
@@ -401,6 +406,9 @@ static __always_inline int fls(unsigned int x)
static __always_inline int fls64(__u64 x)
{
int bitpos = -1;
+
+ if (__builtin_constant_p(x))
+ return x ? 64 - __builtin_clzll(x) : 0;
/*
* AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
* dest reg is undefined if x==0, but their CPU architect says its
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index b3a7cfb0d..a3e0be047 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -81,10 +81,13 @@
#ifndef __ASSEMBLY__
extern unsigned int output_len;
+extern const unsigned long kernel_text_size;
extern const unsigned long kernel_total_size;
unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
void (*error)(char *x));
+
+extern struct boot_params *boot_params_ptr;
#endif
#endif /* _ASM_X86_BOOT_H */
diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h
index ce9685fc7..5aa061199 100644
--- a/arch/x86/include/asm/cacheinfo.h
+++ b/arch/x86/include/asm/cacheinfo.h
@@ -7,9 +7,6 @@ extern unsigned int memory_caching_control;
#define CACHE_MTRR 0x01
#define CACHE_PAT 0x02
-void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu);
-void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu);
-
void cache_disable(void);
void cache_enable(void);
void set_cache_aps_delayed_init(bool val);
diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 58dacd90d..7cd752557 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -7,16 +7,140 @@
*
* Copyright (C) 2022 Google LLC
*/
+#include <linux/bug.h>
+#include <asm/ibt.h>
-#include <linux/cfi.h>
+/*
+ * An overview of the various calling conventions...
+ *
+ * Traditional:
+ *
+ * foo:
+ * ... code here ...
+ * ret
+ *
+ * direct caller:
+ * call foo
+ *
+ * indirect caller:
+ * lea foo(%rip), %r11
+ * ...
+ * call *%r11
+ *
+ *
+ * IBT:
+ *
+ * foo:
+ * endbr64
+ * ... code here ...
+ * ret
+ *
+ * direct caller:
+ * call foo / call foo+4
+ *
+ * indirect caller:
+ * lea foo(%rip), %r11
+ * ...
+ * call *%r11
+ *
+ *
+ * kCFI:
+ *
+ * __cfi_foo:
+ * movl $0x12345678, %eax
+ * # 11 nops when CONFIG_CALL_PADDING
+ * foo:
+ * endbr64 # when IBT
+ * ... code here ...
+ * ret
+ *
+ * direct call:
+ * call foo # / call foo+4 when IBT
+ *
+ * indirect call:
+ * lea foo(%rip), %r11
+ * ...
+ * movl $(-0x12345678), %r10d
+ * addl -4(%r11), %r10d # -15 when CONFIG_CALL_PADDING
+ * jz 1f
+ * ud2
+ * 1:call *%r11
+ *
+ *
+ * FineIBT (builds as kCFI + CALL_PADDING + IBT + RETPOLINE and runtime patches into):
+ *
+ * __cfi_foo:
+ * endbr64
+ * subl 0x12345678, %r10d
+ * jz foo
+ * ud2
+ * nop
+ * foo:
+ * osp nop3 # was endbr64
+ * ... code here ...
+ * ret
+ *
+ * direct caller:
+ * call foo / call foo+4
+ *
+ * indirect caller:
+ * lea foo(%rip), %r11
+ * ...
+ * movl $0x12345678, %r10d
+ * subl $16, %r11
+ * nop4
+ * call *%r11
+ *
+ */
+enum cfi_mode {
+ CFI_DEFAULT, /* FineIBT if hardware has IBT, otherwise kCFI */
+ CFI_OFF, /* Taditional / IBT depending on .config */
+ CFI_KCFI, /* Optionally CALL_PADDING, IBT, RETPOLINE */
+ CFI_FINEIBT, /* see arch/x86/kernel/alternative.c */
+};
+
+extern enum cfi_mode cfi_mode;
+
+struct pt_regs;
#ifdef CONFIG_CFI_CLANG
enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
+#define __bpfcall
+extern u32 cfi_bpf_hash;
+extern u32 cfi_bpf_subprog_hash;
+
+static inline int cfi_get_offset(void)
+{
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ return 16;
+ case CFI_KCFI:
+ if (IS_ENABLED(CONFIG_CALL_PADDING))
+ return 16;
+ return 5;
+ default:
+ return 0;
+ }
+}
+#define cfi_get_offset cfi_get_offset
+
+extern u32 cfi_get_func_hash(void *func);
+
#else
static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
{
return BUG_TRAP_TYPE_NONE;
}
+#define cfi_bpf_hash 0U
+#define cfi_bpf_subprog_hash 0U
+static inline u32 cfi_get_func_hash(void *func)
+{
+ return 0;
+}
#endif /* CONFIG_CFI_CLANG */
+#if HAS_KERNEL_IBT == 1
+#define CFI_NOSEAL(x) asm(IBT_NOSEAL(__stringify(x)))
+#endif
+
#endif /* _ASM_X86_CFI_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index d53636506..5612648b0 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -221,12 +221,18 @@ extern void __add_wrong_size(void)
#define __try_cmpxchg(ptr, pold, new, size) \
__raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
+#define __sync_try_cmpxchg(ptr, pold, new, size) \
+ __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock; ")
+
#define __try_cmpxchg_local(ptr, pold, new, size) \
__raw_try_cmpxchg((ptr), (pold), (new), (size), "")
#define arch_try_cmpxchg(ptr, pold, new) \
__try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+#define arch_sync_try_cmpxchg(ptr, pold, new) \
+ __sync_try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+
#define arch_try_cmpxchg_local(ptr, pold, new) \
__try_cmpxchg_local((ptr), (pold), (new), sizeof(*(ptr)))
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 44b08b53a..c1d6cd58f 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -62,7 +62,7 @@ static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old,
asm volatile(_lock "cmpxchg16b %[ptr]" \
CC_SET(e) \
: CC_OUT(e) (ret), \
- [ptr] "+m" (*ptr), \
+ [ptr] "+m" (*(_ptr)), \
"+a" (o.low), "+d" (o.high) \
: "b" (n.low), "c" (n.high) \
: "memory"); \
diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h
index 6ae2d16a7..a368afce3 100644
--- a/arch/x86/include/asm/coco.h
+++ b/arch/x86/include/asm/coco.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_COCO_H
#define _ASM_X86_COCO_H
+#include <asm/asm.h>
#include <asm/types.h>
enum cc_vendor {
@@ -10,13 +11,21 @@ enum cc_vendor {
CC_VENDOR_INTEL,
};
+#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
extern enum cc_vendor cc_vendor;
+extern u64 cc_mask;
+static inline void cc_set_mask(u64 mask)
+{
+ RIP_REL_REF(cc_mask) = mask;
+}
-#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
-void cc_set_mask(u64 mask);
u64 cc_mkenc(u64 val);
u64 cc_mkdec(u64 val);
+void cc_random_init(void);
#else
+#define cc_vendor (CC_VENDOR_NONE)
+static const u64 cc_mask = 0;
+
static inline u64 cc_mkenc(u64 val)
{
return val;
@@ -26,6 +35,7 @@ static inline u64 cc_mkdec(u64 val)
{
return val;
}
+static inline void cc_random_init(void) { }
#endif
#endif /* _ASM_X86_COCO_H */
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 25050d953..f8f9a9b79 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -23,10 +23,6 @@ static inline void prefill_possible_map(void) {}
#endif /* CONFIG_SMP */
-struct x86_cpu {
- struct cpu cpu;
-};
-
#ifdef CONFIG_HOTPLUG_CPU
extern void soft_restart_cpu(void);
#endif
@@ -71,26 +67,12 @@ static inline void init_ia32_feat_ctl(struct cpuinfo_x86 *c) {}
extern __noendbr void cet_disable(void);
-struct ucode_cpu_info;
-
-int intel_cpu_collect_info(struct ucode_cpu_info *uci);
-
-static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1,
- unsigned int s2, unsigned int p2)
-{
- if (s1 != s2)
- return false;
-
- /* Processor flags are either both 0 ... */
- if (!p1 && !p2)
- return true;
+struct cpu_signature;
- /* ... or they intersect. */
- return p1 & p2;
-}
+void intel_collect_cpu_info(struct cpu_signature *sig);
extern u64 x86_read_arch_cap_msr(void);
-int intel_find_matching_signature(void *mc, unsigned int csig, int cpf);
+bool intel_find_matching_signature(void *mc, struct cpu_signature *sig);
int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type);
extern struct cpumask cpus_stop_mask;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index a26bebbdf..686e92d26 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -33,6 +33,8 @@ enum cpuid_leafs
CPUID_7_EDX,
CPUID_8000_001F_EAX,
CPUID_8000_0021_EAX,
+ CPUID_LNX_5,
+ NR_CPUID_WORDS,
};
#define X86_CAP_FMT_NUM "%d:%d"
@@ -91,8 +93,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \
REQUIRED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 21))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 22))
#define DISABLED_MASK_BIT_SET(feature_bit) \
( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
@@ -116,8 +119,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \
DISABLED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 21))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 22))
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
@@ -168,7 +172,7 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
*/
static __always_inline bool _static_cpu_has(u16 bit)
{
- asm_volatile_goto(
+ asm goto(
ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]")
".pushsection .altinstr_aux,\"ax\"\n"
"6:\n"
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 58cb9495e..76b3b00ff 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
/*
* Defines x86 CPU feature bits
*/
-#define NCAPINTS 21 /* N 32-bit words worth of info */
+#define NCAPINTS 22 /* N 32-bit words worth of info */
#define NBUGINTS 2 /* N 32-bit bug flags */
/*
@@ -81,10 +81,8 @@
#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
-
-/* CPU types for specific tunings: */
#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
-/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */
+#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* "" CPU based on Zen5 microarchitecture */
#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
@@ -97,7 +95,7 @@
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */
-/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */
+#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
@@ -198,6 +196,7 @@
#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* Platform supports being a TDX host */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */
@@ -218,7 +217,7 @@
#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */
#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
-#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */
+#define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */
#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */
#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */
@@ -308,10 +307,14 @@
#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
#define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */
-
#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */
#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */
#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */
+#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */
+#define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */
+#define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */
+#define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */
+#define X86_FEATURE_ZEN1 (11*32+31) /* "" CPU based on Zen1 microarchitecture */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
@@ -443,6 +446,7 @@
/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */
+#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* "" WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */
#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */
#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */
#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */
@@ -453,6 +457,17 @@
#define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */
/*
+ * Extended auxiliary flags: Linux defined - for features scattered in various
+ * CPUID levels like 0x80000022, etc and Linux defined features.
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */
+#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */
+#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */
+#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */
+
+/*
* BUG word(s)
*/
#define X86_BUG(x) (NCAPINTS*32 + (x))
@@ -494,8 +509,11 @@
#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */
#define X86_BUG_GDS X86_BUG(30) /* CPU is affected by Gather Data Sampling */
+#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* CPU may incur #MC if non-TD software does partial write to TDX private memory */
/* BUG word 2 */
#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */
#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */
+#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */
+#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/crash_core.h b/arch/x86/include/asm/crash_core.h
new file mode 100644
index 000000000..041020da8
--- /dev/null
+++ b/arch/x86/include/asm/crash_core.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_CRASH_CORE_H
+#define _X86_CRASH_CORE_H
+
+/* 16M alignment for crash kernel regions */
+#define CRASH_ALIGN SZ_16M
+
+/*
+ * Keep the crash kernel below this limit.
+ *
+ * Earlier 32-bits kernels would limit the kernel to the low 512 MB range
+ * due to mapping restrictions.
+ *
+ * 64-bit kdump kernels need to be restricted to be under 64 TB, which is
+ * the upper limit of system RAM in 4-level paging mode. Since the kdump
+ * jump could be from 5-level paging to 4-level paging, the jump will fail if
+ * the kernel is put above 64 TB, and during the 1st kernel bootup there's
+ * no good way to detect the paging mode of the target kernel which will be
+ * loaded for dumping.
+ */
+extern unsigned long swiotlb_size_or_default(void);
+
+#ifdef CONFIG_X86_32
+# define CRASH_ADDR_LOW_MAX SZ_512M
+# define CRASH_ADDR_HIGH_MAX SZ_512M
+#else
+# define CRASH_ADDR_LOW_MAX SZ_4G
+# define CRASH_ADDR_HIGH_MAX SZ_64T
+#endif
+
+# define DEFAULT_CRASH_KERNEL_LOW_SIZE crash_low_size_default()
+
+static inline unsigned long crash_low_size_default(void)
+{
+#ifdef CONFIG_X86_64
+ return max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20);
+#else
+ return 0;
+#endif
+}
+
+#define HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+
+#endif /* _X86_CRASH_CORE_H */
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index a1168e7b6..dd4b67101 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H
+#include <linux/build_bug.h>
#include <linux/compiler.h>
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 66eb5e1ac..0cec92c43 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -5,6 +5,7 @@
#include <linux/bug.h>
#include <linux/percpu.h>
#include <uapi/asm/debugreg.h>
+#include <asm/cpufeature.h>
DECLARE_PER_CPU(unsigned long, cpu_dr7);
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index f7e7099af..d440a65af 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -8,6 +8,56 @@
* archs.
*/
+/*
+ * Low-level interface mapping flags/field names to bits
+ */
+
+/* Flags for _DESC_S (non-system) descriptors */
+#define _DESC_ACCESSED 0x0001
+#define _DESC_DATA_WRITABLE 0x0002
+#define _DESC_CODE_READABLE 0x0002
+#define _DESC_DATA_EXPAND_DOWN 0x0004
+#define _DESC_CODE_CONFORMING 0x0004
+#define _DESC_CODE_EXECUTABLE 0x0008
+
+/* Common flags */
+#define _DESC_S 0x0010
+#define _DESC_DPL(dpl) ((dpl) << 5)
+#define _DESC_PRESENT 0x0080
+
+#define _DESC_LONG_CODE 0x2000
+#define _DESC_DB 0x4000
+#define _DESC_GRANULARITY_4K 0x8000
+
+/* System descriptors have a numeric "type" field instead of flags */
+#define _DESC_SYSTEM(code) (code)
+
+/*
+ * High-level interface mapping intended usage to low-level combinations
+ * of flags
+ */
+
+#define _DESC_DATA (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \
+ _DESC_DATA_WRITABLE)
+#define _DESC_CODE (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \
+ _DESC_CODE_READABLE | _DESC_CODE_EXECUTABLE)
+
+#define DESC_DATA16 (_DESC_DATA)
+#define DESC_CODE16 (_DESC_CODE)
+
+#define DESC_DATA32 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB)
+#define DESC_DATA32_BIOS (_DESC_DATA | _DESC_DB)
+
+#define DESC_CODE32 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_DB)
+#define DESC_CODE32_BIOS (_DESC_CODE | _DESC_DB)
+
+#define DESC_TSS32 (_DESC_SYSTEM(9) | _DESC_PRESENT)
+
+#define DESC_DATA64 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB)
+#define DESC_CODE64 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_LONG_CODE)
+
+#define DESC_USER (_DESC_DPL(3))
+
#ifndef __ASSEMBLY__
#include <linux/types.h>
@@ -22,19 +72,19 @@ struct desc_struct {
#define GDT_ENTRY_INIT(flags, base, limit) \
{ \
- .limit0 = (u16) (limit), \
- .limit1 = ((limit) >> 16) & 0x0F, \
- .base0 = (u16) (base), \
- .base1 = ((base) >> 16) & 0xFF, \
- .base2 = ((base) >> 24) & 0xFF, \
- .type = (flags & 0x0f), \
- .s = (flags >> 4) & 0x01, \
- .dpl = (flags >> 5) & 0x03, \
- .p = (flags >> 7) & 0x01, \
- .avl = (flags >> 12) & 0x01, \
- .l = (flags >> 13) & 0x01, \
- .d = (flags >> 14) & 0x01, \
- .g = (flags >> 15) & 0x01, \
+ .limit0 = ((limit) >> 0) & 0xFFFF, \
+ .limit1 = ((limit) >> 16) & 0x000F, \
+ .base0 = ((base) >> 0) & 0xFFFF, \
+ .base1 = ((base) >> 16) & 0x00FF, \
+ .base2 = ((base) >> 24) & 0x00FF, \
+ .type = ((flags) >> 0) & 0x000F, \
+ .s = ((flags) >> 4) & 0x0001, \
+ .dpl = ((flags) >> 5) & 0x0003, \
+ .p = ((flags) >> 7) & 0x0001, \
+ .avl = ((flags) >> 12) & 0x0001, \
+ .l = ((flags) >> 13) & 0x0001, \
+ .d = ((flags) >> 14) & 0x0001, \
+ .g = ((flags) >> 15) & 0x0001, \
}
enum {
@@ -94,6 +144,7 @@ struct gate_struct {
typedef struct gate_struct gate_desc;
+#ifndef _SETUP
static inline unsigned long gate_offset(const gate_desc *g)
{
#ifdef CONFIG_X86_64
@@ -108,6 +159,7 @@ static inline unsigned long gate_segment(const gate_desc *g)
{
return g->segment;
}
+#endif
struct desc_ptr {
unsigned short size;
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 702d93fdd..88fcf0845 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -143,6 +143,7 @@
#define DISABLED_MASK18 (DISABLE_IBT)
#define DISABLED_MASK19 0
#define DISABLED_MASK20 0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
+#define DISABLED_MASK21 0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 18fd06f79..1e16bd5ac 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -7,6 +7,7 @@
*/
#include <linux/thread_info.h>
+#include <asm/ia32.h>
#include <asm/ptrace.h>
#include <asm/user.h>
#include <asm/auxvec.h>
@@ -149,7 +150,7 @@ do { \
((x)->e_machine == EM_X86_64)
#define compat_elf_check_arch(x) \
- (elf_check_arch_ia32(x) || \
+ ((elf_check_arch_ia32(x) && ia32_enabled_verbose()) || \
(IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
static inline void elf_common_init(struct thread_struct *t,
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index ce8f50192..7e523bb3d 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -91,7 +91,6 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
static __always_inline void arch_exit_to_user_mode(void)
{
- mds_user_clear_cpu_buffers();
amd_clear_divider();
}
#define arch_exit_to_user_mode arch_exit_to_user_mode
diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h
index 991e31cfd..fe6312045 100644
--- a/arch/x86/include/asm/extable_fixup_types.h
+++ b/arch/x86/include/asm/extable_fixup_types.h
@@ -4,7 +4,7 @@
/*
* Our IMM is signed, as such it must live at the top end of the word. Also,
- * since C99 hex constants are of ambigious type, force cast the mask to 'int'
+ * since C99 hex constants are of ambiguous type, force cast the mask to 'int'
* so that FIELD_GET() will DTRT and sign extend the value when it extracts it.
*/
#define EX_DATA_TYPE_MASK ((int)0x000000FF)
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
index 23873da8f..c3b9582de 100644
--- a/arch/x86/include/asm/fb.h
+++ b/arch/x86/include/asm/fb.h
@@ -2,12 +2,14 @@
#ifndef _ASM_X86_FB_H
#define _ASM_X86_FB_H
+#include <asm/page.h>
+
struct fb_info;
-struct file;
-struct vm_area_struct;
-void fb_pgprotect(struct file *file, struct vm_area_struct *vma, unsigned long off);
-#define fb_pgprotect fb_pgprotect
+pgprot_t pgprot_framebuffer(pgprot_t prot,
+ unsigned long vm_start, unsigned long vm_end,
+ unsigned long offset);
+#define pgprot_framebuffer pgprot_framebuffer
int fb_is_primary_device(struct fb_info *info);
#define fb_is_primary_device fb_is_primary_device
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index eb810074f..ace9aa3b7 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -5,6 +5,8 @@
#ifndef _ASM_X86_FPU_H
#define _ASM_X86_FPU_H
+#include <asm/page_types.h>
+
/*
* The legacy x87 FPU state format, as saved by FSAVE and
* restored by the FRSTOR instructions:
@@ -415,7 +417,7 @@ struct fpu_state_perm {
*
* This master permission field is only to be used when
* task.fpu.fpstate based checks fail to validate whether the task
- * is allowed to expand it's xfeatures set which requires to
+ * is allowed to expand its xfeatures set which requires to
* allocate a larger sized fpstate buffer.
*
* Do not access this field directly. Use the provided helper
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 551829884..b02c3cd3c 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -28,7 +28,7 @@
#include <asm/irq.h>
#include <asm/sections.h>
-#ifdef CONFIG_X86_LOCAL_APIC
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
struct irq_data;
struct pci_dev;
struct msi_desc;
@@ -105,10 +105,10 @@ static inline void irq_complete_move(struct irq_cfg *c) { }
#endif
extern void apic_ack_edge(struct irq_data *data);
-#else /* CONFIG_X86_LOCAL_APIC */
+#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
static inline void lock_vector_lock(void) {}
static inline void unlock_vector_lock(void) {}
-#endif /* CONFIG_X86_LOCAL_APIC */
+#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
/* Statistics */
extern atomic_t irq_err_count;
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 980562947..c7ef6ea2f 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -2,7 +2,6 @@
#ifndef _ASM_X86_IA32_H
#define _ASM_X86_IA32_H
-
#ifdef CONFIG_IA32_EMULATION
#include <linux/compat.h>
@@ -91,4 +90,14 @@ static inline void ia32_disable(void) {}
#endif
+static inline bool ia32_enabled_verbose(void)
+{
+ bool enabled = ia32_enabled();
+
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) && !enabled)
+ pr_notice_once("32-bit emulation disabled. You can reenable with ia32_emulation=on\n");
+
+ return enabled;
+}
+
#endif /* _ASM_X86_IA32_H */
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 5f1d3c421..cc9ccf61b 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -2,6 +2,8 @@
#ifndef _ASM_X86_INIT_H
#define _ASM_X86_INIT_H
+#define __head __section(".head.text")
+
struct x86_mapping_info {
void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
void *context; /* context for alloc_pgt_page */
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 197316121..b65e9c46b 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -162,6 +162,8 @@
#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */
#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */
+#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */
+
/* Xeon Phi */
#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 762388424..294cd2a40 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -242,7 +242,7 @@ static inline void slow_down_io(void)
#endif
-#define BUILDIO(bwl, bw, type) \
+#define BUILDIO(bwl, type) \
static inline void out##bwl##_p(type value, u16 port) \
{ \
out##bwl(value, port); \
@@ -288,9 +288,9 @@ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \
} \
}
-BUILDIO(b, b, u8)
-BUILDIO(w, w, u16)
-BUILDIO(l, , u32)
+BUILDIO(b, u8)
+BUILDIO(w, u16)
+BUILDIO(l, u32)
#undef BUILDIO
#define inb_p inb_p
@@ -379,7 +379,7 @@ static inline void iosubmit_cmds512(void __iomem *dst, const void *src,
const u8 *end = from + count * 64;
while (from < end) {
- movdir64b(dst, from);
+ movdir64b_io(dst, from);
from += 64;
}
}
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
index a1911fea8..af7541c11 100644
--- a/arch/x86/include/asm/iosf_mbi.h
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -111,7 +111,7 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask);
* This function will block all kernel access to the PMIC I2C bus, so that the
* P-Unit can safely access the PMIC over the shared I2C bus.
*
- * Note on these systems the i2c-bus driver will request a sempahore from the
+ * Note on these systems the i2c-bus driver will request a semaphore from the
* P-Unit for exclusive access to the PMIC bus when i2c drivers are accessing
* it, but this does not appear to be sufficient, we still need to avoid making
* certain P-Unit requests during the access window to avoid problems.
diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h
index 800ffce0d..6b4d36c95 100644
--- a/arch/x86/include/asm/irq_work.h
+++ b/arch/x86/include/asm/irq_work.h
@@ -9,7 +9,6 @@ static inline bool arch_irq_work_has_interrupt(void)
{
return boot_cpu_has(X86_FEATURE_APIC);
}
-extern void arch_irq_work_raise(void);
#else
static inline bool arch_irq_work_has_interrupt(void)
{
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 071572e23..cbbef3251 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -24,7 +24,7 @@
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
- asm_volatile_goto("1:"
+ asm goto("1:"
"jmp %l[l_yes] # objtool NOPs this \n\t"
JUMP_TABLE_ENTRY
: : "i" (key), "i" (2 | branch) : : l_yes);
@@ -38,7 +38,7 @@ l_yes:
static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
{
- asm_volatile_goto("1:"
+ asm goto("1:"
".byte " __stringify(BYTES_NOP5) "\n\t"
JUMP_TABLE_ENTRY
: : "i" (key), "i" (branch) : : l_yes);
@@ -52,7 +52,7 @@ l_yes:
static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
{
- asm_volatile_goto("1:"
+ asm goto("1:"
"jmp %l[l_yes]\n\t"
JUMP_TABLE_ENTRY
: : "i" (key), "i" (branch) : : l_yes);
diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h
index 8fa6ac0e2..d91b37f5b 100644
--- a/arch/x86/include/asm/kmsan.h
+++ b/arch/x86/include/asm/kmsan.h
@@ -64,6 +64,7 @@ static inline bool kmsan_virt_addr_valid(void *addr)
{
unsigned long x = (unsigned long)addr;
unsigned long y = x - __START_KERNEL_map;
+ bool ret;
/* use the carry flag to determine if x was < __START_KERNEL_map */
if (unlikely(x > y)) {
@@ -79,7 +80,21 @@ static inline bool kmsan_virt_addr_valid(void *addr)
return false;
}
- return pfn_valid(x >> PAGE_SHIFT);
+ /*
+ * pfn_valid() relies on RCU, and may call into the scheduler on exiting
+ * the critical section. However, this would result in recursion with
+ * KMSAN. Therefore, disable preemption here, and re-enable preemption
+ * below while suppressing reschedules to avoid recursion.
+ *
+ * Note, this sacrifices occasionally breaking scheduling guarantees.
+ * Although, a kernel compiled with KMSAN has already given up on any
+ * performance guarantees due to being heavily instrumented.
+ */
+ preempt_disable();
+ ret = pfn_valid(x >> PAGE_SHIFT);
+ preempt_enable_no_resched();
+
+ return ret;
}
#endif /* !MODULE */
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index a2e9317aa..5939694df 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -113,8 +113,6 @@ struct kprobe_ctlblk {
};
extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
-extern int kprobe_exceptions_notify(struct notifier_block *self,
- unsigned long val, void *data);
extern int kprobe_int3_handler(struct pt_regs *regs);
#else
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 9b419f0de..378ed944b 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -55,8 +55,10 @@ KVM_X86_OP(set_rflags)
KVM_X86_OP(get_if_flag)
KVM_X86_OP(flush_tlb_all)
KVM_X86_OP(flush_tlb_current)
+#if IS_ENABLED(CONFIG_HYPERV)
KVM_X86_OP_OPTIONAL(flush_remote_tlbs)
KVM_X86_OP_OPTIONAL(flush_remote_tlbs_range)
+#endif
KVM_X86_OP(flush_tlb_gva)
KVM_X86_OP(flush_tlb_guest)
KVM_X86_OP(vcpu_pre_run)
@@ -127,7 +129,7 @@ KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
KVM_X86_OP_OPTIONAL(vm_move_enc_context_from)
KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
KVM_X86_OP(get_msr_feature)
-KVM_X86_OP(can_emulate_instruction)
+KVM_X86_OP(check_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
KVM_X86_OP_OPTIONAL(migrate_timers)
@@ -135,6 +137,7 @@ KVM_X86_OP(msr_filter_changed)
KVM_X86_OP(complete_emulated_msr)
KVM_X86_OP(vcpu_deliver_sipi_vector)
KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
+KVM_X86_OP_OPTIONAL(get_untagged_addr)
#undef KVM_X86_OP
#undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fb9f5fa96..b7a539ea2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -39,7 +39,15 @@
#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
+/*
+ * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n, provide a dummy max if
+ * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS).
+ */
+#ifdef CONFIG_KVM_MAX_NR_VCPUS
+#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS
+#else
#define KVM_MAX_VCPUS 1024
+#endif
/*
* In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
@@ -125,7 +133,8 @@
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
| X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
| X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
- | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP))
+ | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \
+ | X86_CR4_LAM_SUP))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -492,8 +501,23 @@ struct kvm_pmc {
u8 idx;
bool is_paused;
bool intr;
+ /*
+ * Base value of the PMC counter, relative to the *consumed* count in
+ * the associated perf_event. This value includes counter updates from
+ * the perf_event and emulated_count since the last time the counter
+ * was reprogrammed, but it is *not* the current value as seen by the
+ * guest or userspace.
+ *
+ * The count is relative to the associated perf_event so that KVM
+ * doesn't need to reprogram the perf_event every time the guest writes
+ * to the counter.
+ */
u64 counter;
- u64 prev_counter;
+ /*
+ * PMC events triggered by KVM emulation that haven't been fully
+ * processed, i.e. haven't undergone overflow detection.
+ */
+ u64 emulated_counter;
u64 eventsel;
struct perf_event *perf_event;
struct kvm_vcpu *vcpu;
@@ -679,6 +703,7 @@ struct kvm_hypervisor_cpuid {
u32 limit;
};
+#ifdef CONFIG_KVM_XEN
/* Xen HVM per vcpu emulation context */
struct kvm_vcpu_xen {
u64 hypercall_rip;
@@ -701,6 +726,7 @@ struct kvm_vcpu_xen {
struct timer_list poll_timer;
struct kvm_hypervisor_cpuid cpuid;
};
+#endif
struct kvm_queued_exception {
bool pending;
@@ -828,6 +854,7 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 *cpuid_entries;
struct kvm_hypervisor_cpuid kvm_cpuid;
+ bool is_amd_compatible;
/*
* FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly
@@ -927,10 +954,13 @@ struct kvm_vcpu_arch {
/* used for guest single stepping over the given code position */
unsigned long singlestep_rip;
+#ifdef CONFIG_KVM_HYPERV
bool hyperv_enabled;
struct kvm_vcpu_hv *hyperv;
+#endif
+#ifdef CONFIG_KVM_XEN
struct kvm_vcpu_xen xen;
-
+#endif
cpumask_var_t wbinvd_dirty_mask;
unsigned long last_retry_eip;
@@ -1084,6 +1114,7 @@ enum hv_tsc_page_status {
HV_TSC_PAGE_BROKEN,
};
+#ifdef CONFIG_KVM_HYPERV
/* Hyper-V emulation context */
struct kvm_hv {
struct mutex hv_lock;
@@ -1114,9 +1145,11 @@ struct kvm_hv {
*/
unsigned int synic_auto_eoi_used;
- struct hv_partition_assist_pg *hv_pa_pg;
struct kvm_hv_syndbg hv_syndbg;
+
+ bool xsaves_xsavec_checked;
};
+#endif
struct msr_bitmap_range {
u32 flags;
@@ -1125,6 +1158,7 @@ struct msr_bitmap_range {
unsigned long *bitmap;
};
+#ifdef CONFIG_KVM_XEN
/* Xen emulation context */
struct kvm_xen {
struct mutex xen_lock;
@@ -1136,6 +1170,7 @@ struct kvm_xen {
struct idr evtchn_ports;
unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
};
+#endif
enum kvm_irqchip_mode {
KVM_IRQCHIP_NONE,
@@ -1244,6 +1279,7 @@ enum kvm_apicv_inhibit {
};
struct kvm_arch {
+ unsigned long vm_type;
unsigned long n_used_mmu_pages;
unsigned long n_requested_mmu_pages;
unsigned long n_max_mmu_pages;
@@ -1275,7 +1311,6 @@ struct kvm_arch {
*/
spinlock_t mmu_unsync_pages_lock;
- struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;
bool iommu_noncoherent;
#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
@@ -1323,6 +1358,7 @@ struct kvm_arch {
int nr_vcpus_matched_tsc;
u32 default_tsc_khz;
+ bool user_set_tsc;
seqcount_raw_spinlock_t pvclock_sc;
bool use_master_clock;
@@ -1336,8 +1372,13 @@ struct kvm_arch {
/* reads protected by irq_srcu, writes by irq_lock */
struct hlist_head mask_notifier_list;
+#ifdef CONFIG_KVM_HYPERV
struct kvm_hv hyperv;
+#endif
+
+#ifdef CONFIG_KVM_XEN
struct kvm_xen xen;
+#endif
bool backwards_tsc_observed;
bool boot_vcpu_runs_old_kvmclock;
@@ -1395,9 +1436,8 @@ struct kvm_arch {
* the MMU lock in read mode + RCU or
* the MMU lock in write mode
*
- * For writes, this list is protected by:
- * the MMU lock in read mode + the tdp_mmu_pages_lock or
- * the MMU lock in write mode
+ * For writes, this list is protected by tdp_mmu_pages_lock; see
+ * below for the details.
*
* Roots will remain in the list until their tdp_mmu_root_count
* drops to zero, at which point the thread that decremented the
@@ -1414,8 +1454,10 @@ struct kvm_arch {
* - possible_nx_huge_pages;
* - the possible_nx_huge_page_link field of kvm_mmu_page structs used
* by the TDP MMU
- * It is acceptable, but not necessary, to acquire this lock when
- * the thread holds the MMU lock in write mode.
+ * Because the lock is only taken within the MMU lock, strictly
+ * speaking it is redundant to acquire this lock when the thread
+ * holds the MMU lock in write mode. However it often simplifies
+ * the code to do so.
*/
spinlock_t tdp_mmu_pages_lock;
#endif /* CONFIG_X86_64 */
@@ -1430,6 +1472,7 @@ struct kvm_arch {
#if IS_ENABLED(CONFIG_HYPERV)
hpa_t hv_root_tdp;
spinlock_t hv_root_tdp_lock;
+ struct hv_partition_assist_pg *hv_pa_pg;
#endif
/*
* VM-scope maximum vCPU ID. Used to determine the size of structures
@@ -1602,9 +1645,11 @@ struct kvm_x86_ops {
void (*flush_tlb_all)(struct kvm_vcpu *vcpu);
void (*flush_tlb_current)(struct kvm_vcpu *vcpu);
+#if IS_ENABLED(CONFIG_HYPERV)
int (*flush_remote_tlbs)(struct kvm *kvm);
int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn,
gfn_t nr_pages);
+#endif
/*
* Flush any TLB entries associated with the given GVA.
@@ -1641,7 +1686,7 @@ struct kvm_x86_ops {
/* Whether or not a virtual NMI is pending in hardware. */
bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu);
/*
- * Attempt to pend a virtual NMI in harware. Returns %true on success
+ * Attempt to pend a virtual NMI in hardware. Returns %true on success
* to allow using static_call_ret0 as the fallback.
*/
bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu);
@@ -1691,7 +1736,7 @@ struct kvm_x86_ops {
void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
- void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+ void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);
/*
* Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
@@ -1734,8 +1779,8 @@ struct kvm_x86_ops {
int (*get_msr_feature)(struct kvm_msr_entry *entry);
- bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len);
+ int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu);
@@ -1750,6 +1795,8 @@ struct kvm_x86_ops {
* Returns vCPU specific APICv inhibit reasons
*/
unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
+
+ gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
};
struct kvm_x86_nested_ops {
@@ -1813,6 +1860,7 @@ static inline struct kvm *kvm_arch_alloc_vm(void)
#define __KVM_HAVE_ARCH_VM_FREE
void kvm_arch_free_vm(struct kvm *kvm);
+#if IS_ENABLED(CONFIG_HYPERV)
#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
@@ -1824,6 +1872,15 @@ static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
}
#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
+static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn,
+ u64 nr_pages)
+{
+ if (!kvm_x86_ops.flush_remote_tlbs_range)
+ return -EOPNOTSUPP;
+
+ return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
+}
+#endif /* CONFIG_HYPERV */
#define kvm_arch_pmi_in_guest(vcpu) \
((vcpu) && (vcpu)->arch.handling_intr_from_guest)
@@ -1837,6 +1894,9 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
+void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
+
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -2075,6 +2135,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
+#ifdef CONFIG_KVM_PRIVATE_MEM
+#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM)
+#else
+#define kvm_arch_has_private_mem(kvm) false
+#endif
+
static inline u16 kvm_read_ldt(void)
{
u16 ldt;
@@ -2122,16 +2188,15 @@ enum {
#define HF_SMM_MASK (1 << 1)
#define HF_SMM_INSIDE_NMI_MASK (1 << 2)
-# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
-# define KVM_ADDRESS_SPACE_NUM 2
+# define KVM_MAX_NR_ADDRESS_SPACES 2
+/* SMM is currently unsupported for guests with private memory. */
+# define kvm_arch_nr_memslot_as_ids(kvm) (kvm_arch_has_private_mem(kvm) ? 1 : 2)
# define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
#else
# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0)
#endif
-#define KVM_ARCH_WANT_MMU_NOTIFIER
-
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_cpu_has_extint(struct kvm_vcpu *v);
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 635132a12..73dba8b94 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -135,28 +135,27 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new)
#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
/**
- * local_add_unless - add unless the number is a given value
+ * local_add_unless - add unless the number is already a given value
* @l: pointer of type local_t
* @a: the amount to add to l...
* @u: ...unless l is equal to u.
*
- * Atomically adds @a to @l, so long as it was not @u.
- * Returns non-zero if @l was not @u, and zero otherwise.
+ * Atomically adds @a to @l, if @v was not already @u.
+ * Returns true if the addition was done.
*/
-#define local_add_unless(l, a, u) \
-({ \
- long c, old; \
- c = local_read((l)); \
- for (;;) { \
- if (unlikely(c == (u))) \
- break; \
- old = local_cmpxchg((l), c, c + (a)); \
- if (likely(old == c)) \
- break; \
- c = old; \
- } \
- c != (u); \
-})
+static __always_inline bool
+local_add_unless(local_t *l, long a, long u)
+{
+ long c = local_read(l);
+
+ do {
+ if (unlikely(c == u))
+ return false;
+ } while (!local_try_cmpxchg(l, &c, c + a));
+
+ return true;
+}
+
#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
/* On x86_32, these are no better than the atomic variants.
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 180b1cbfc..de3118305 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -245,7 +245,7 @@ static inline void cmci_recheck(void) {}
int mce_available(struct cpuinfo_x86 *c);
bool mce_is_memory_error(struct mce *m);
bool mce_is_correctable(struct mce *m);
-int mce_usable_address(struct mce *m);
+bool mce_usable_address(struct mce *m);
DECLARE_PER_CPU(unsigned, mce_exception_count);
DECLARE_PER_CPU(unsigned, mce_poll_count);
@@ -311,6 +311,7 @@ enum smca_bank_types {
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_UMC_V2,
+ SMCA_MA_LLC, /* Memory Attached Last Level Cache */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
SMCA_PSP_V2,
@@ -326,6 +327,8 @@ enum smca_bank_types {
SMCA_SHUB, /* System HUB Unit */
SMCA_SATA, /* SATA Unit */
SMCA_USB, /* USB Unit */
+ SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */
+ SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */
SMCA_GMI_PCS, /* GMI PCS Unit */
SMCA_XGMI_PHY, /* xGMI PHY Unit */
SMCA_WAFL_PHY, /* WAFL PHY Unit */
@@ -333,7 +336,6 @@ enum smca_bank_types {
N_SMCA_BANK_TYPES
};
-extern const char *smca_get_long_name(enum smca_bank_types t);
extern bool amd_mce_is_memory_error(struct mce *m);
extern int mce_threshold_create_device(unsigned int cpu);
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 473b16d73..f922b682b 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -15,12 +15,15 @@
#include <linux/init.h>
#include <linux/cc_platform.h>
-#include <asm/bootparam.h>
+#include <asm/asm.h>
+struct boot_params;
#ifdef CONFIG_X86_MEM_ENCRYPT
void __init mem_encrypt_init(void);
+void __init mem_encrypt_setup_arch(void);
#else
static inline void mem_encrypt_init(void) { }
+static inline void __init mem_encrypt_setup_arch(void) { }
#endif
#ifdef CONFIG_AMD_MEM_ENCRYPT
@@ -43,10 +46,9 @@ void __init sme_map_bootdata(char *real_mode_data);
void __init sme_unmap_bootdata(char *real_mode_data);
void __init sme_early_init(void);
-void __init sev_setup_arch(void);
-void __init sme_encrypt_kernel(struct boot_params *bp);
-void __init sme_enable(struct boot_params *bp);
+void sme_encrypt_kernel(struct boot_params *bp);
+void sme_enable(struct boot_params *bp);
int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
@@ -57,6 +59,11 @@ void __init mem_encrypt_free_decrypted_mem(void);
void __init sev_es_init_vc_handling(void);
+static inline u64 sme_get_me_mask(void)
+{
+ return RIP_REL_REF(sme_me_mask);
+}
+
#define __bss_decrypted __section(".bss..decrypted")
#else /* !CONFIG_AMD_MEM_ENCRYPT */
@@ -73,10 +80,9 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { }
static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
static inline void __init sme_early_init(void) { }
-static inline void __init sev_setup_arch(void) { }
-static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
-static inline void __init sme_enable(struct boot_params *bp) { }
+static inline void sme_encrypt_kernel(struct boot_params *bp) { }
+static inline void sme_enable(struct boot_params *bp) { }
static inline void sev_es_init_vc_handling(void) { }
@@ -89,6 +95,8 @@ early_set_mem_enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool en
static inline void mem_encrypt_free_decrypted_mem(void) { }
+static inline u64 sme_get_me_mask(void) { return 0; }
+
#define __bss_decrypted
#endif /* CONFIG_AMD_MEM_ENCRYPT */
@@ -106,11 +114,6 @@ void add_encrypt_protection_map(void);
extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
-static inline u64 sme_get_me_mask(void)
-{
- return sme_me_mask;
-}
-
#endif /* __ASSEMBLY__ */
#endif /* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index bbbe9d744..695e56915 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -23,6 +23,8 @@ static inline void load_ucode_ap(void) { }
static inline void microcode_bsp_resume(void) { }
#endif
+extern unsigned long initrd_start_early;
+
#ifdef CONFIG_CPU_SUP_INTEL
/* Intel specific microcode defines. Public for IFS */
struct microcode_header_intel {
@@ -36,7 +38,8 @@ struct microcode_header_intel {
unsigned int datasize;
unsigned int totalsize;
unsigned int metasize;
- unsigned int reserved[2];
+ unsigned int min_req_ver;
+ unsigned int reserved;
};
struct microcode_intel {
@@ -68,11 +71,19 @@ static inline u32 intel_get_microcode_revision(void)
return rev;
}
+#endif /* !CONFIG_CPU_SUP_INTEL */
-void show_ucode_info_early(void);
+bool microcode_nmi_handler(void);
+void microcode_offline_nmi_handler(void);
-#else /* CONFIG_CPU_SUP_INTEL */
-static inline void show_ucode_info_early(void) { }
-#endif /* !CONFIG_CPU_SUP_INTEL */
+#ifdef CONFIG_MICROCODE_LATE_LOADING
+DECLARE_STATIC_KEY_FALSE(microcode_nmi_handler_enable);
+static __always_inline bool microcode_nmi_handler_enabled(void)
+{
+ return static_branch_unlikely(&microcode_nmi_handler_enable);
+}
+#else
+static __always_inline bool microcode_nmi_handler_enabled(void) { return false; }
+#endif
#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index f46df8349..4b0f98a8d 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -37,7 +37,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES];
extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-extern unsigned int boot_cpu_physical_apicid;
+extern u32 boot_cpu_physical_apicid;
extern u8 boot_cpu_apic_version;
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 896445edc..ce4ce8720 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -276,11 +276,11 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
#ifdef CONFIG_AMD_MEM_ENCRYPT
bool hv_ghcb_negotiate_protocol(void);
void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason);
-int hv_snp_boot_ap(int cpu, unsigned long start_ip);
+int hv_snp_boot_ap(u32 cpu, unsigned long start_ip);
#else
static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
-static inline int hv_snp_boot_ap(int cpu, unsigned long start_ip) { return 0; }
+static inline int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) { return 0; }
#endif
#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 389f95947..d65f8ce6b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -50,10 +50,13 @@
#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
#define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */
#define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT)
+#define SPEC_CTRL_BHI_DIS_S_SHIFT 10 /* Disable Branch History Injection behavior */
+#define SPEC_CTRL_BHI_DIS_S BIT(SPEC_CTRL_BHI_DIS_S_SHIFT)
/* A mask for bits which the kernel toggles when controlling mitigations */
#define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \
- | SPEC_CTRL_RRSBA_DIS_S)
+ | SPEC_CTRL_RRSBA_DIS_S \
+ | SPEC_CTRL_BHI_DIS_S)
#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
@@ -152,6 +155,10 @@
* are restricted to targets in
* kernel.
*/
+#define ARCH_CAP_BHI_NO BIT(20) /*
+ * CPU is not affected by Branch
+ * History Injection.
+ */
#define ARCH_CAP_PBRSB_NO BIT(24) /*
* Not susceptible to Post-Barrier
* Return Stack Buffer Predictions.
@@ -165,6 +172,14 @@
* CPU is not vulnerable to Gather
* Data Sampling (GDS).
*/
+#define ARCH_CAP_RFDS_NO BIT(27) /*
+ * Not susceptible to Register
+ * File Data Sampling.
+ */
+#define ARCH_CAP_RFDS_CLEAR BIT(28) /*
+ * VERW clears CPU Register
+ * File.
+ */
#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
* IA32_XAPIC_DISABLE_STATUS MSR
@@ -222,6 +237,7 @@
#define MSR_INTEGRITY_CAPS_ARRAY_BIST BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT)
#define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4
#define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT)
+#define MSR_INTEGRITY_CAPS_SAF_GEN_MASK GENMASK_ULL(10, 9)
#define MSR_LBR_NHM_FROM 0x00000680
#define MSR_LBR_NHM_TO 0x000006c0
@@ -236,6 +252,11 @@
#define LBR_INFO_CYCLES 0xffff
#define LBR_INFO_BR_TYPE_OFFSET 56
#define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET)
+#define LBR_INFO_BR_CNTR_OFFSET 32
+#define LBR_INFO_BR_CNTR_NUM 4
+#define LBR_INFO_BR_CNTR_BITS 2
+#define LBR_INFO_BR_CNTR_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_BITS - 1, 0)
+#define LBR_INFO_BR_CNTR_FULL_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_NUM * LBR_INFO_BR_CNTR_BITS - 1, 0)
#define MSR_ARCH_LBR_CTL 0x000014ce
#define ARCH_LBR_CTL_LBREN BIT(0)
@@ -535,6 +556,9 @@
#define MSR_RELOAD_PMC0 0x000014c1
#define MSR_RELOAD_FIXED_CTR0 0x00001309
+/* KeyID partitioning between MKTME and TDX */
+#define MSR_IA32_MKTME_KEYID_PARTITIONING 0x00000087
+
/*
* AMD64 MSRs. Not complete. See the architecture manual for a more
* complete list.
@@ -639,12 +663,16 @@
#define MSR_AMD64_LBR_SELECT 0xc000010e
/* Zen4 */
-#define MSR_ZEN4_BP_CFG 0xc001102e
+#define MSR_ZEN4_BP_CFG 0xc001102e
#define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5
+/* Fam 19h MSRs */
+#define MSR_F19H_UMC_PERF_CTL 0xc0010800
+#define MSR_F19H_UMC_PERF_CTR 0xc0010801
+
/* Zen 2 */
-#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3
-#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1)
+#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3
+#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1)
/* Fam 17h MSRs */
#define MSR_F17H_IRPERF 0xc00000e9
@@ -1118,12 +1146,16 @@
#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F
-/* AMD-V MSRs */
+/* AMD-V MSRs */
#define MSR_VM_CR 0xc0010114
#define MSR_VM_IGNNE 0xc0010115
#define MSR_VM_HSAVE_PA 0xc0010117
+#define SVM_VM_CR_VALID_MASK 0x001fULL
+#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
+#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
+
/* Hardware Feedback Interface */
#define MSR_IA32_HW_FEEDBACK_PTR 0x17d0
#define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index bae838105..920426d69 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -87,6 +87,15 @@ static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx,
:: "a" (eax), "b" (ebx), "c" (ecx));
}
+/*
+ * Re-enable interrupts right upon calling mwait in such a way that
+ * no interrupt can fire _before_ the execution of mwait, ie: no
+ * instruction must be placed between "sti" and "mwait".
+ *
+ * This is necessary because if an interrupt queues a timer before
+ * executing mwait, it would otherwise go unnoticed and the next tick
+ * would not be reprogrammed accordingly before mwait ever wakes up.
+ */
static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
mds_idle_clear_cpu_buffers();
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 0396458c2..3ff081a74 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -49,7 +49,7 @@
* but there is still a cushion vs. the RSB depth. The algorithm does not
* claim to be perfect and it can be speculated around by the CPU, but it
* is considered that it obfuscates the problem enough to make exploitation
- * extremly difficult.
+ * extremely difficult.
*/
#define RET_DEPTH_SHIFT 5
#define RSB_RET_STUFF_LOOPS 16
@@ -208,7 +208,7 @@
/*
* Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should
- * eventually turn into it's own annotation.
+ * eventually turn into its own annotation.
*/
.macro VALIDATE_UNRET_END
#if defined(CONFIG_NOINSTR_VALIDATION) && \
@@ -271,11 +271,20 @@
.Lskip_rsb_\@:
.endm
+/*
+ * The CALL to srso_alias_untrain_ret() must be patched in directly at
+ * the spot where untraining must be done, ie., srso_alias_untrain_ret()
+ * must be the target of a CALL instruction instead of indirectly
+ * jumping to a wrapper which then calls it. Therefore, this macro is
+ * called outside of __UNTRAIN_RET below, for the time being, before the
+ * kernel can support nested alternatives with arbitrary nesting.
+ */
+.macro CALL_UNTRAIN_RET
#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)
-#define CALL_UNTRAIN_RET "call entry_untrain_ret"
-#else
-#define CALL_UNTRAIN_RET ""
+ ALTERNATIVE_2 "", "call entry_untrain_ret", X86_FEATURE_UNRET, \
+ "call srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS
#endif
+.endm
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the
@@ -288,38 +297,24 @@
* As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point
* where we have a stack but before any RET instruction.
*/
-.macro UNTRAIN_RET
-#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
- defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO)
+.macro __UNTRAIN_RET ibpb_feature, call_depth_insns
+#if defined(CONFIG_RETHUNK) || defined(CONFIG_CPU_IBPB_ENTRY)
VALIDATE_UNRET_END
- ALTERNATIVE_3 "", \
- CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \
- "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
- __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
+ CALL_UNTRAIN_RET
+ ALTERNATIVE_2 "", \
+ "call entry_ibpb", \ibpb_feature, \
+ __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH
#endif
.endm
-.macro UNTRAIN_RET_VM
-#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
- defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO)
- VALIDATE_UNRET_END
- ALTERNATIVE_3 "", \
- CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \
- "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT, \
- __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
-#endif
-.endm
+#define UNTRAIN_RET \
+ __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH)
-.macro UNTRAIN_RET_FROM_CALL
-#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
- defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO)
- VALIDATE_UNRET_END
- ALTERNATIVE_3 "", \
- CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \
- "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
- __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH
-#endif
-.endm
+#define UNTRAIN_RET_VM \
+ __UNTRAIN_RET X86_FEATURE_IBPB_ON_VMEXIT, __stringify(RESET_CALL_DEPTH)
+
+#define UNTRAIN_RET_FROM_CALL \
+ __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH_FROM_CALL)
.macro CALL_DEPTH_ACCOUNT
@@ -329,6 +324,30 @@
#endif
.endm
+/*
+ * Macro to execute VERW instruction that mitigate transient data sampling
+ * attacks such as MDS. On affected systems a microcode update overloaded VERW
+ * instruction to also clear the CPU buffers. VERW clobbers CFLAGS.ZF.
+ *
+ * Note: Only the memory operand variant of VERW clears the CPU buffers.
+ */
+.macro CLEAR_CPU_BUFFERS
+ ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF
+.endm
+
+#ifdef CONFIG_X86_64
+.macro CLEAR_BRANCH_HISTORY
+ ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP
+.endm
+
+.macro CLEAR_BRANCH_HISTORY_VMEXIT
+ ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT
+.endm
+#else
+#define CLEAR_BRANCH_HISTORY
+#define CLEAR_BRANCH_HISTORY_VMEXIT
+#endif
+
#else /* __ASSEMBLY__ */
#define ANNOTATE_RETPOLINE_SAFE \
@@ -348,26 +367,37 @@ extern void __x86_return_thunk(void);
static inline void __x86_return_thunk(void) {}
#endif
+#ifdef CONFIG_CPU_UNRET_ENTRY
extern void retbleed_return_thunk(void);
+#else
+static inline void retbleed_return_thunk(void) {}
+#endif
+
+extern void srso_alias_untrain_ret(void);
+
+#ifdef CONFIG_CPU_SRSO
extern void srso_return_thunk(void);
extern void srso_alias_return_thunk(void);
+#else
+static inline void srso_return_thunk(void) {}
+static inline void srso_alias_return_thunk(void) {}
+#endif
-extern void retbleed_untrain_ret(void);
-extern void srso_untrain_ret(void);
-extern void srso_alias_untrain_ret(void);
+extern void retbleed_return_thunk(void);
+extern void srso_return_thunk(void);
+extern void srso_alias_return_thunk(void);
extern void entry_untrain_ret(void);
extern void entry_ibpb(void);
+#ifdef CONFIG_X86_64
+extern void clear_bhb_loop(void);
+#endif
+
extern void (*x86_return_thunk)(void);
#ifdef CONFIG_CALL_DEPTH_TRACKING
-extern void __x86_return_skl(void);
-
-static inline void x86_set_skl_return_thunk(void)
-{
- x86_return_thunk = &__x86_return_skl;
-}
+extern void call_depth_return_thunk(void);
#define CALL_DEPTH_ACCOUNT \
ALTERNATIVE("", \
@@ -380,12 +410,12 @@ DECLARE_PER_CPU(u64, __x86_ret_count);
DECLARE_PER_CPU(u64, __x86_stuffs_count);
DECLARE_PER_CPU(u64, __x86_ctxsw_count);
#endif
-#else
-static inline void x86_set_skl_return_thunk(void) {}
+#else /* !CONFIG_CALL_DEPTH_TRACKING */
+static inline void call_depth_return_thunk(void) {}
#define CALL_DEPTH_ACCOUNT ""
-#endif
+#endif /* CONFIG_CALL_DEPTH_TRACKING */
#ifdef CONFIG_RETPOLINE
@@ -538,13 +568,14 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
-DECLARE_STATIC_KEY_FALSE(mds_user_clear);
DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear);
+extern u16 mds_verw_sel;
+
#include <asm/segment.h>
/**
@@ -571,17 +602,6 @@ static __always_inline void mds_clear_cpu_buffers(void)
}
/**
- * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
- *
- * Clear CPU buffers if the corresponding static key is enabled
- */
-static __always_inline void mds_user_clear_cpu_buffers(void)
-{
- if (static_branch_likely(&mds_user_clear))
- mds_clear_cpu_buffers();
-}
-
-/**
* mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
*
* Clear CPU buffers if the corresponding static key is enabled
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index d18e5c332..1b93ff80b 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -66,10 +66,14 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
* virt_addr_valid(kaddr) returns true.
*/
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
-#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
extern bool __virt_addr_valid(unsigned long kaddr);
#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr))
+static __always_inline void *pfn_to_kaddr(unsigned long pfn)
+{
+ return __va(pfn << PAGE_SHIFT);
+}
+
static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits)
{
return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits);
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 6c8ff1214..d4eb9e1d6 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -6,6 +6,10 @@
#include <asm/paravirt_types.h>
+#ifndef __ASSEMBLY__
+struct mm_struct;
+#endif
+
#ifdef CONFIG_PARAVIRT
#include <asm/pgtable_types.h>
#include <asm/asm.h>
@@ -142,8 +146,7 @@ static inline void write_cr0(unsigned long x)
static __always_inline unsigned long read_cr2(void)
{
return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
- "mov %%cr2, %%rax;",
- ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%cr2, %%rax;", ALT_NOT_XEN);
}
static __always_inline void write_cr2(unsigned long x)
@@ -154,13 +157,12 @@ static __always_inline void write_cr2(unsigned long x)
static inline unsigned long __read_cr3(void)
{
return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3,
- "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%cr3, %%rax;", ALT_NOT_XEN);
}
static inline void write_cr3(unsigned long x)
{
- PVOP_ALT_VCALL1(mmu.write_cr3, x,
- "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN);
}
static inline void __write_cr4(unsigned long x)
@@ -182,7 +184,7 @@ extern noinstr void pv_native_wbinvd(void);
static __always_inline void wbinvd(void)
{
- PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN);
}
static inline u64 paravirt_read_msr(unsigned msr)
@@ -390,27 +392,25 @@ static inline void paravirt_release_p4d(unsigned long pfn)
static inline pte_t __pte(pteval_t val)
{
return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pteval_t pte_val(pte_t pte)
{
return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline pgd_t __pgd(pgdval_t val)
{
return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pgdval_t pgd_val(pgd_t pgd)
{
return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
@@ -444,14 +444,13 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
static inline pmd_t __pmd(pmdval_t val)
{
return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pmdval_t pmd_val(pmd_t pmd)
{
return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline void set_pud(pud_t *pudp, pud_t pud)
@@ -464,7 +463,7 @@ static inline pud_t __pud(pudval_t val)
pudval_t ret;
ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
return (pud_t) { ret };
}
@@ -472,7 +471,7 @@ static inline pud_t __pud(pudval_t val)
static inline pudval_t pud_val(pud_t pud)
{
return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline void pud_clear(pud_t *pudp)
@@ -492,8 +491,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
static inline p4d_t __p4d(p4dval_t val)
{
p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
return (p4d_t) { ret };
}
@@ -501,7 +499,7 @@ static inline p4d_t __p4d(p4dval_t val)
static inline p4dval_t p4d_val(p4d_t p4d)
{
return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
@@ -687,17 +685,17 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
static __always_inline unsigned long arch_local_save_flags(void)
{
return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;",
- ALT_NOT(X86_FEATURE_XENPV));
+ ALT_NOT_XEN);
}
static __always_inline void arch_local_irq_disable(void)
{
- PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN);
}
static __always_inline void arch_local_irq_enable(void)
{
- PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN);
}
static __always_inline unsigned long arch_local_irq_save(void)
@@ -726,52 +724,25 @@ static __always_inline unsigned long arch_local_irq_save(void)
#undef PVOP_VCALL4
#undef PVOP_CALL4
-#define DEFINE_PARAVIRT_ASM(func, instr, sec) \
- asm (".pushsection " #sec ", \"ax\"\n" \
- ".global " #func "\n\t" \
- ".type " #func ", @function\n\t" \
- ASM_FUNC_ALIGN "\n" \
- #func ":\n\t" \
- ASM_ENDBR \
- instr "\n\t" \
- ASM_RET \
- ".size " #func ", . - " #func "\n\t" \
- ".popsection")
-
extern void default_banner(void);
void native_pv_lock_init(void) __init;
#else /* __ASSEMBLY__ */
-#define _PVSITE(ptype, ops, word, algn) \
-771:; \
- ops; \
-772:; \
- .pushsection .parainstructions,"a"; \
- .align algn; \
- word 771b; \
- .byte ptype; \
- .byte 772b-771b; \
- _ASM_ALIGN; \
- .popsection
-
-
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT_XXL
+#ifdef CONFIG_DEBUG_ENTRY
-#define PARA_PATCH(off) ((off) / 8)
-#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
-#ifdef CONFIG_DEBUG_ENTRY
.macro PARA_IRQ_save_fl
- PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
- ANNOTATE_RETPOLINE_SAFE;
- call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);)
+ ANNOTATE_RETPOLINE_SAFE;
+ call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);
.endm
-#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \
- ALT_NOT(X86_FEATURE_XENPV)
+#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \
+ "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \
+ "pushf; pop %rax;", ALT_NOT_XEN
#endif
#endif /* CONFIG_PARAVIRT_XXL */
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 772d03487..8d4fbe1be 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -2,18 +2,10 @@
#ifndef _ASM_X86_PARAVIRT_TYPES_H
#define _ASM_X86_PARAVIRT_TYPES_H
-#ifndef __ASSEMBLY__
-/* These all sit in the .parainstructions section to tell us what to patch. */
-struct paravirt_patch_site {
- u8 *instr; /* original instructions */
- u8 type; /* type of this instruction */
- u8 len; /* length of original instruction */
-};
-#endif
-
#ifdef CONFIG_PARAVIRT
#ifndef __ASSEMBLY__
+#include <linux/types.h>
#include <asm/desc_defs.h>
#include <asm/pgtable_types.h>
@@ -250,43 +242,11 @@ struct paravirt_patch_template {
extern struct pv_info pv_info;
extern struct paravirt_patch_template pv_ops;
-#define PARAVIRT_PATCH(x) \
- (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
-
-#define paravirt_type(op) \
- [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
- [paravirt_opptr] "m" (pv_ops.op)
-/*
- * Generate some code, and mark it as patchable by the
- * apply_paravirt() alternate instruction patcher.
- */
-#define _paravirt_alt(insn_string, type) \
- "771:\n\t" insn_string "\n" "772:\n" \
- ".pushsection .parainstructions,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR " 771b\n" \
- " .byte " type "\n" \
- " .byte 772b-771b\n" \
- _ASM_ALIGN "\n" \
- ".popsection\n"
-
-/* Generate patchable code, with the default asm parameters. */
-#define paravirt_alt(insn_string) \
- _paravirt_alt(insn_string, "%c[paravirt_typenum]")
-
-/* Simple instruction patching code. */
-#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
-
-unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len);
+#define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op)
int paravirt_disable_iospace(void);
-/*
- * This generates an indirect call based on the operation type number.
- * The type number, computed in PARAVIRT_PATCH, is derived from the
- * offset into the paravirt_patch_template structure, and can therefore be
- * freely converted back into a structure offset.
- */
+/* This generates an indirect call based on the operation type number. */
#define PARAVIRT_CALL \
ANNOTATE_RETPOLINE_SAFE \
"call *%[paravirt_opptr];"
@@ -319,12 +279,6 @@ int paravirt_disable_iospace(void);
* However, x86_64 also has to clobber all caller saved registers, which
* unfortunately, are quite a bit (r8 - r11)
*
- * The call instruction itself is marked by placing its start address
- * and size into the .parainstructions section, so that
- * apply_paravirt() in arch/i386/kernel/alternative.c can do the
- * appropriate patching under the control of the backend pv_init_ops
- * implementation.
- *
* Unfortunately there's no way to get gcc to generate the args setup
* for the call, and then allow the call itself to be generated by an
* inline asm. Because of this, we must do the complete arg setup and
@@ -423,14 +377,27 @@ int paravirt_disable_iospace(void);
__mask & __eax; \
})
-
+/*
+ * Use alternative patching for paravirt calls:
+ * - For replacing an indirect call with a direct one, use the "normal"
+ * ALTERNATIVE() macro with the indirect call as the initial code sequence,
+ * which will be replaced with the related direct call by using the
+ * ALT_FLAG_DIRECT_CALL special case and the "always on" feature.
+ * - In case the replacement is either a direct call or a short code sequence
+ * depending on a feature bit, the ALTERNATIVE_2() macro is being used.
+ * The indirect call is the initial code sequence again, while the special
+ * code sequence is selected with the specified feature bit. In case the
+ * feature is not active, the direct call is used as above via the
+ * ALT_FLAG_DIRECT_CALL special case and the "always on" feature.
+ */
#define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \
({ \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- asm volatile(paravirt_alt(PARAVIRT_CALL) \
+ asm volatile(ALTERNATIVE(PARAVIRT_CALL, ALT_CALL_INSTR, \
+ ALT_CALL_ALWAYS) \
: call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
+ : paravirt_ptr(op), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
ret; \
@@ -441,10 +408,11 @@ int paravirt_disable_iospace(void);
({ \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \
- alt, cond) \
+ asm volatile(ALTERNATIVE_2(PARAVIRT_CALL, \
+ ALT_CALL_INSTR, ALT_CALL_ALWAYS, \
+ alt, cond) \
: call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
+ : paravirt_ptr(op), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
ret; \
@@ -542,8 +510,6 @@ int paravirt_disable_iospace(void);
__PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
-void _paravirt_nop(void);
-void paravirt_BUG(void);
unsigned long paravirt_ret0(void);
#ifdef CONFIG_PARAVIRT_XXL
u64 _paravirt_ident_64(u64);
@@ -553,11 +519,11 @@ void pv_native_irq_enable(void);
unsigned long pv_native_read_cr2(void);
#endif
-#define paravirt_nop ((void *)_paravirt_nop)
-
-extern struct paravirt_patch_site __parainstructions[],
- __parainstructions_end[];
+#define paravirt_nop ((void *)nop_func)
#endif /* __ASSEMBLY__ */
+
+#define ALT_NOT_XEN ALT_NOT(X86_FEATURE_XENPV)
+
#endif /* CONFIG_PARAVIRT */
#endif /* _ASM_X86_PARAVIRT_TYPES_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 34734d730..5e01883eb 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -24,8 +24,8 @@
#else /* ...!ASSEMBLY */
-#include <linux/kernel.h>
#include <linux/stringify.h>
+#include <asm/asm.h>
#ifdef CONFIG_SMP
#define __percpu_prefix "%%"__stringify(__percpu_seg)":"
@@ -210,6 +210,25 @@ do { \
(typeof(_var))(unsigned long) pco_old__; \
})
+#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval) \
+({ \
+ bool success; \
+ __pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
+ __pcpu_type_##size pco_old__ = *pco_oval__; \
+ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \
+ asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \
+ __percpu_arg([var])) \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [oval] "+a" (pco_old__), \
+ [var] "+m" (_var) \
+ : [nval] __pcpu_reg_##size(, pco_new__) \
+ : "memory"); \
+ if (unlikely(!success)) \
+ *pco_oval__ = pco_old__; \
+ likely(success); \
+})
+
#if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
#define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \
({ \
@@ -223,26 +242,63 @@ do { \
old__.var = _oval; \
new__.var = _nval; \
\
- asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \
+ asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
"cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
: [var] "+m" (_var), \
"+a" (old__.low), \
"+d" (old__.high) \
: "b" (new__.low), \
- "c" (new__.high) \
- : "memory", "esi"); \
+ "c" (new__.high), \
+ "S" (&(_var)) \
+ : "memory"); \
\
old__.var; \
})
#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval)
#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval) \
+({ \
+ bool success; \
+ u64 *_oval = (u64 *)(_ovalp); \
+ union { \
+ u64 var; \
+ struct { \
+ u32 low, high; \
+ }; \
+ } old__, new__; \
+ \
+ old__.var = *_oval; \
+ new__.var = _nval; \
+ \
+ asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
+ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [var] "+m" (_var), \
+ "+a" (old__.low), \
+ "+d" (old__.high) \
+ : "b" (new__.low), \
+ "c" (new__.high), \
+ "S" (&(_var)) \
+ : "memory"); \
+ if (unlikely(!success)) \
+ *_oval = old__.var; \
+ likely(success); \
+})
+
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval)
#endif
#ifdef CONFIG_X86_64
#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval);
#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval);
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval);
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval);
+
#define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval) \
({ \
union { \
@@ -255,20 +311,54 @@ do { \
old__.var = _oval; \
new__.var = _nval; \
\
- asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \
+ asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
"cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
: [var] "+m" (_var), \
"+a" (old__.low), \
"+d" (old__.high) \
: "b" (new__.low), \
- "c" (new__.high) \
- : "memory", "rsi"); \
+ "c" (new__.high), \
+ "S" (&(_var)) \
+ : "memory"); \
\
old__.var; \
})
#define raw_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, , pcp, oval, nval)
#define this_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval) \
+({ \
+ bool success; \
+ u128 *_oval = (u128 *)(_ovalp); \
+ union { \
+ u128 var; \
+ struct { \
+ u64 low, high; \
+ }; \
+ } old__, new__; \
+ \
+ old__.var = *_oval; \
+ new__.var = _nval; \
+ \
+ asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
+ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [var] "+m" (_var), \
+ "+a" (old__.low), \
+ "+d" (old__.high) \
+ : "b" (new__.low), \
+ "c" (new__.high), \
+ "S" (&(_var)) \
+ : "memory"); \
+ if (unlikely(!success)) \
+ *_oval = old__.var; \
+ likely(success); \
+})
+
+#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval)
#endif
/*
@@ -343,6 +433,9 @@ do { \
#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval)
#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval)
#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val)
#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val)
@@ -350,6 +443,9 @@ do { \
#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
/*
* Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -364,6 +460,7 @@ do { \
#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val)
#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval)
#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp)
#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val)
@@ -373,6 +470,7 @@ do { \
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
#endif
static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 85a9fd5a3..3736b8a46 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -31,6 +31,7 @@
#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
#define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23)
#define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL
+#define ARCH_PERFMON_EVENTSEL_BR_CNTR (1ULL << 35)
#define INTEL_FIXED_BITS_MASK 0xFULL
#define INTEL_FIXED_BITS_STRIDE 4
@@ -112,6 +113,13 @@
(AMD64_PERFMON_V2_EVENTSEL_EVENT_NB | \
AMD64_PERFMON_V2_EVENTSEL_UMASK_NB)
+#define AMD64_PERFMON_V2_ENABLE_UMC BIT_ULL(31)
+#define AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC GENMASK_ULL(7, 0)
+#define AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC GENMASK_ULL(9, 8)
+#define AMD64_PERFMON_V2_RAW_EVENT_MASK_UMC \
+ (AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC | \
+ AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC)
+
#define AMD64_NUM_COUNTERS 4
#define AMD64_NUM_COUNTERS_CORE 6
#define AMD64_NUM_COUNTERS_NB 4
@@ -216,6 +224,9 @@ union cpuid28_ecx {
unsigned int lbr_timed_lbr:1;
/* Branch Type Field Supported */
unsigned int lbr_br_type:1;
+ unsigned int reserved:13;
+ /* Branch counters (Event Logging) Supported */
+ unsigned int lbr_counters:4;
} split;
unsigned int full;
};
@@ -232,6 +243,8 @@ union cpuid_0x80000022_ebx {
unsigned int lbr_v2_stack_sz:6;
/* Number of Data Fabric Counters */
unsigned int num_df_pmc:6;
+ /* Number of Unified Memory Controller Counters */
+ unsigned int num_umc_pmc:6;
} split;
unsigned int full;
};
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e02b179ec..9d077bca6 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -141,6 +141,7 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
+#define pmd_dirty pmd_dirty
static inline bool pmd_dirty(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
@@ -1679,12 +1680,6 @@ static inline bool arch_has_pfn_modify_check(void)
return boot_cpu_has_bug(X86_BUG_L1TF);
}
-#define arch_has_hw_pte_young arch_has_hw_pte_young
-static inline bool arch_has_hw_pte_young(void)
-{
- return true;
-}
-
#define arch_check_zapped_pte arch_check_zapped_pte
void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte);
@@ -1716,6 +1711,14 @@ static inline bool pud_user_accessible_page(pud_t pud)
}
#endif
+#ifdef CONFIG_X86_SGX
+int arch_memory_failure(unsigned long pfn, int flags);
+#define arch_memory_failure arch_memory_failure
+
+bool arch_is_platform_page(u64 paddr);
+#define arch_is_platform_page arch_is_platform_page
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index a629b1b9f..24af25b15 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -203,7 +203,7 @@ static inline void native_pgd_clear(pgd_t *pgd)
* F (2) in swp entry is used to record when a pagetable is
* writeprotected by userfaultfd WP support.
*
- * E (3) in swp entry is used to rememeber PG_anon_exclusive.
+ * E (3) in swp entry is used to remember PG_anon_exclusive.
*
* Bit 7 in swp entry should be 0 because pmd_present checks not only P,
* but also L and G.
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 0b748ee16..b78644962 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -148,7 +148,7 @@
#define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | \
_PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \
- _PAGE_DEVMAP | _PAGE_ENC | _PAGE_UFFD_WP)
+ _PAGE_DEVMAP | _PAGE_CC | _PAGE_UFFD_WP)
#define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT)
#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE)
@@ -173,6 +173,7 @@ enum page_cache_mode {
};
#endif
+#define _PAGE_CC (_AT(pteval_t, cc_mask))
#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
#define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
@@ -566,6 +567,8 @@ static inline void update_page_count(int level, unsigned long pages) { }
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level);
+pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
+ unsigned int *level, bool *nx, bool *rw);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 2d13f25b1..af77235fd 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -6,7 +6,6 @@
#include <asm/percpu.h>
#include <asm/current.h>
-#include <linux/thread_info.h>
#include <linux/static_call_types.h>
/* We use the MSB mostly because its available */
@@ -31,11 +30,11 @@ static __always_inline void preempt_count_set(int pc)
{
int old, new;
+ old = raw_cpu_read_4(pcpu_hot.preempt_count);
do {
- old = raw_cpu_read_4(pcpu_hot.preempt_count);
new = (old & PREEMPT_NEED_RESCHED) |
(pc & ~PREEMPT_NEED_RESCHED);
- } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
+ } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
}
/*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a3669a777..5636ad697 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -75,11 +75,36 @@ extern u16 __read_mostly tlb_lld_4m[NR_INFO];
extern u16 __read_mostly tlb_lld_1g[NR_INFO];
/*
- * CPU type and hardware bug flags. Kept separately for each CPU.
- * Members of this structure are referenced in head_32.S, so think twice
- * before touching them. [mj]
+ * CPU type and hardware bug flags. Kept separately for each CPU.
*/
+struct cpuinfo_topology {
+ // Real APIC ID read from the local APIC
+ u32 apicid;
+ // The initial APIC ID provided by CPUID
+ u32 initial_apicid;
+
+ // Physical package ID
+ u32 pkg_id;
+
+ // Physical die ID on AMD, Relative on Intel
+ u32 die_id;
+
+ // Compute unit ID - AMD specific
+ u32 cu_id;
+
+ // Core ID relative to the package
+ u32 core_id;
+
+ // Logical ID mappings
+ u32 logical_pkg_id;
+ u32 logical_die_id;
+
+ // Cache level topology IDs
+ u32 llc_id;
+ u32 l2c_id;
+};
+
struct cpuinfo_x86 {
__u8 x86; /* CPU family */
__u8 x86_vendor; /* CPU vendor */
@@ -96,7 +121,6 @@ struct cpuinfo_x86 {
__u8 x86_phys_bits;
/* CPUID returned core id bits: */
__u8 x86_coreid_bits;
- __u8 cu_id;
/* Max extended CPUID function supported: */
__u32 extended_cpuid_level;
/* Maximum supported CPUID level, -1=no CPUID: */
@@ -112,6 +136,7 @@ struct cpuinfo_x86 {
};
char x86_vendor_id[16];
char x86_model_id[64];
+ struct cpuinfo_topology topo;
/* in KB - valid for CPUS which support this call: */
unsigned int x86_cache_size;
int x86_cache_alignment; /* In bytes */
@@ -125,19 +150,9 @@ struct cpuinfo_x86 {
u64 ppin;
/* cpuid returned max cores value: */
u16 x86_max_cores;
- u16 apicid;
- u16 initial_apicid;
u16 x86_clflush_size;
/* number of cores as seen by the OS: */
u16 booted_cores;
- /* Physical processor id: */
- u16 phys_proc_id;
- /* Logical processor id: */
- u16 logical_proc_id;
- /* Core id: */
- u16 cpu_core_id;
- u16 cpu_die_id;
- u16 logical_die_id;
/* Index into per_cpu list: */
u16 cpu_index;
/* Is SMT active on this core? */
@@ -399,7 +414,7 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu)
return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
}
-extern asmlinkage void ignore_sysret(void);
+extern asmlinkage void entry_SYSCALL32_ignore(void);
/* Save actual FS/GS selectors and bases to current->thread */
void current_save_fsgs(void);
@@ -464,7 +479,6 @@ struct thread_struct {
unsigned long iopl_emul;
unsigned int iopl_warn:1;
- unsigned int sig_on_uaccess_err:1;
/*
* Protection Keys Register for Userspace. Loaded immediately on
@@ -678,7 +692,15 @@ extern int set_tsc_mode(unsigned int val);
DECLARE_PER_CPU(u64, msr_misc_features_shadow);
-extern u16 get_llc_id(unsigned int cpu);
+static inline u32 per_cpu_llc_id(unsigned int cpu)
+{
+ return per_cpu(cpu_info.topo.llc_id, cpu);
+}
+
+static inline u32 per_cpu_l2c_id(unsigned int cpu)
+{
+ return per_cpu(cpu_info.topo.l2c_id, cpu);
+}
#ifdef CONFIG_CPU_SUP_AMD
extern u32 amd_get_nodes_per_socket(void);
@@ -724,14 +746,24 @@ enum mds_mitigations {
MDS_MITIGATION_VMWERV,
};
-#ifdef CONFIG_X86_SGX
-int arch_memory_failure(unsigned long pfn, int flags);
-#define arch_memory_failure arch_memory_failure
-
-bool arch_is_platform_page(u64 paddr);
-#define arch_is_platform_page arch_is_platform_page
-#endif
-
extern bool gds_ucode_mitigated(void);
+/*
+ * Make previous memory operations globally visible before
+ * a WRMSR.
+ *
+ * MFENCE makes writes visible, but only affects load/store
+ * instructions. WRMSR is unfortunately not a load/store
+ * instruction and is unaffected by MFENCE. The LFENCE ensures
+ * that the WRMSR is not reordered.
+ *
+ * Most WRMSRs are full serializing instructions themselves and
+ * do not require this barrier. This is only required for the
+ * IA32_TSC_DEADLINE and X2APIC MSRs.
+ */
+static inline void weak_wrmsr_fence(void)
+{
+ alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE));
+}
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index b716d291d..65dee2420 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -31,6 +31,11 @@ static inline void x86_dtb_init(void) { }
#define of_ioapic 0
#endif
+#ifdef CONFIG_OF_EARLY_FLATTREE
+void x86_flattree_get_config(void);
+#else
+static inline void x86_flattree_get_config(void) { }
+#endif
extern char cmd_line[COMMAND_LINE_SIZE];
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 84294b66b..484f4f013 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -32,6 +32,9 @@ void entry_SYSCALL_compat(void);
void entry_SYSCALL_compat_safe_stack(void);
void entry_SYSRETL_compat_unsafe_stack(void);
void entry_SYSRETL_compat_end(void);
+#else /* !CONFIG_IA32_EMULATION */
+#define entry_SYSCALL_compat NULL
+#define entry_SYSENTER_compat NULL
#endif
void x86_configure_nx(void);
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 85b6e3609..ef9697f20 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -56,8 +56,8 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
"pop %rdx\n\t" \
FRAME_END
-DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock,
- PV_UNLOCK_ASM, .spinlock.text);
+DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock,
+ PV_UNLOCK_ASM, .spinlock.text);
#else /* CONFIG_64BIT */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 7ba1726b7..e9187ddd3 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -99,6 +99,7 @@
#define REQUIRED_MASK18 0
#define REQUIRED_MASK19 0
#define REQUIRED_MASK20 0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
+#define REQUIRED_MASK21 0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22)
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 4b081e0d3..363266cbc 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -13,7 +13,7 @@
#define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \
({ \
bool c = false; \
- asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \
+ asm goto (fullop "; j" #cc " %l[cc_label]" \
: : [var] "m" (_var), ## __VA_ARGS__ \
: clobbers : cc_label); \
if (0) { \
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index a5e89641b..9aee31862 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -47,6 +47,7 @@ int set_memory_uc(unsigned long addr, int numpages);
int set_memory_wc(unsigned long addr, int numpages);
int set_memory_wb(unsigned long addr, int numpages);
int set_memory_np(unsigned long addr, int numpages);
+int set_memory_p(unsigned long addr, int numpages);
int set_memory_4k(unsigned long addr, int numpages);
int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index f3495623a..5c83729c8 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -31,8 +31,6 @@
#include <asm/bootparam.h>
#include <asm/x86_init.h>
-extern u64 relocated_ramdisk;
-
/* Interrupt control for vSMPowered x86_64 systems */
#ifdef CONFIG_X86_64
void vsmp_init(void);
@@ -126,6 +124,7 @@ void clear_bss(void);
#ifdef __i386__
asmlinkage void __init __noreturn i386_start_kernel(void);
+void __init mk_early_pgtbl_32(void);
#else
asmlinkage void __init __noreturn x86_64_start_kernel(char *real_mode);
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 5b4a1ce3d..75a5388d4 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -199,16 +199,16 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
struct snp_guest_request_ioctl;
void setup_ghcb(void);
-void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
- unsigned long npages);
-void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
- unsigned long npages);
-void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op);
+void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages);
+void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages);
void snp_set_memory_shared(unsigned long vaddr, unsigned long npages);
void snp_set_memory_private(unsigned long vaddr, unsigned long npages);
void snp_set_wakeup_secondary_cpu(void);
bool snp_init(struct boot_params *bp);
-void __init __noreturn snp_abort(void);
+void __noreturn snp_abort(void);
+void snp_dmi_setup(void);
int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
u64 snp_get_unsupported_features(u64 status);
@@ -227,12 +227,12 @@ static inline void __init
early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { }
static inline void __init
early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { }
-static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { }
static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) { }
static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { }
static inline void snp_set_wakeup_secondary_cpu(void) { }
static inline bool snp_init(struct boot_params *bp) { return false; }
static inline void snp_abort(void) { }
+static inline void snp_dmi_setup(void) { }
static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio)
{
return -ENOTTY;
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index 7513b3bb6..fdfd41511 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -11,45 +11,91 @@
#define TDX_IDENT "IntelTDX "
/* TDX module Call Leaf IDs */
-#define TDX_GET_INFO 1
-#define TDX_GET_VEINFO 3
-#define TDX_GET_REPORT 4
-#define TDX_ACCEPT_PAGE 6
-#define TDX_WR 8
+#define TDG_VP_VMCALL 0
+#define TDG_VP_INFO 1
+#define TDG_VP_VEINFO_GET 3
+#define TDG_MR_REPORT 4
+#define TDG_MEM_PAGE_ACCEPT 6
+#define TDG_VM_WR 8
/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */
#define TDCS_NOTIFY_ENABLES 0x9100000000000010
/* TDX hypercall Leaf IDs */
#define TDVMCALL_MAP_GPA 0x10001
+#define TDVMCALL_GET_QUOTE 0x10002
#define TDVMCALL_REPORT_FATAL_ERROR 0x10003
+#define TDVMCALL_STATUS_RETRY 1
+
+/*
+ * Bitmasks of exposed registers (with VMM).
+ */
+#define TDX_RDX BIT(2)
+#define TDX_RBX BIT(3)
+#define TDX_RSI BIT(6)
+#define TDX_RDI BIT(7)
+#define TDX_R8 BIT(8)
+#define TDX_R9 BIT(9)
+#define TDX_R10 BIT(10)
+#define TDX_R11 BIT(11)
+#define TDX_R12 BIT(12)
+#define TDX_R13 BIT(13)
+#define TDX_R14 BIT(14)
+#define TDX_R15 BIT(15)
+
+/*
+ * These registers are clobbered to hold arguments for each
+ * TDVMCALL. They are safe to expose to the VMM.
+ * Each bit in this mask represents a register ID. Bit field
+ * details can be found in TDX GHCI specification, section
+ * titled "TDCALL [TDG.VP.VMCALL] leaf".
+ */
+#define TDVMCALL_EXPOSE_REGS_MASK \
+ (TDX_RDX | TDX_RBX | TDX_RSI | TDX_RDI | TDX_R8 | TDX_R9 | \
+ TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15)
+
+/* TDX supported page sizes from the TDX module ABI. */
+#define TDX_PS_4K 0
+#define TDX_PS_2M 1
+#define TDX_PS_1G 2
+#define TDX_PS_NR (TDX_PS_1G + 1)
+
#ifndef __ASSEMBLY__
+#include <linux/compiler_attributes.h>
+
/*
- * Used in __tdx_hypercall() to pass down and get back registers' values of
- * the TDCALL instruction when requesting services from the VMM.
- *
- * This is a software only structure and not part of the TDX module/VMM ABI.
+ * Used in __tdcall*() to gather the input/output registers' values of the
+ * TDCALL instruction when requesting services from the TDX module. This is a
+ * software only structure and not part of the TDX module/VMM ABI
*/
-struct tdx_hypercall_args {
+struct tdx_module_args {
+ /* callee-clobbered */
+ u64 rcx;
+ u64 rdx;
u64 r8;
u64 r9;
+ /* extra callee-clobbered */
u64 r10;
u64 r11;
+ /* callee-saved + rdi/rsi */
u64 r12;
u64 r13;
u64 r14;
u64 r15;
+ u64 rbx;
u64 rdi;
u64 rsi;
- u64 rbx;
- u64 rdx;
};
+/* Used to communicate with the TDX module */
+u64 __tdcall(u64 fn, struct tdx_module_args *args);
+u64 __tdcall_ret(u64 fn, struct tdx_module_args *args);
+u64 __tdcall_saved_ret(u64 fn, struct tdx_module_args *args);
+
/* Used to request services from the VMM */
-u64 __tdx_hypercall(struct tdx_hypercall_args *args);
-u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args);
+u64 __tdx_hypercall(struct tdx_module_args *args);
/*
* Wrapper for standard use of __tdx_hypercall with no output aside from
@@ -57,7 +103,7 @@ u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args);
*/
static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
{
- struct tdx_hypercall_args args = {
+ struct tdx_module_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = fn,
.r12 = r12,
@@ -71,25 +117,7 @@ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
/* Called from __tdx_hypercall() for unrecoverable failure */
-void __tdx_hypercall_failed(void);
-
-/*
- * Used in __tdx_module_call() to gather the output registers' values of the
- * TDCALL instruction when requesting services from the TDX module. This is a
- * software only structure and not part of the TDX module/VMM ABI
- */
-struct tdx_module_output {
- u64 rcx;
- u64 rdx;
- u64 r8;
- u64 r9;
- u64 r10;
- u64 r11;
-};
-
-/* Used to communicate with the TDX module */
-u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
- struct tdx_module_output *out);
+void __noreturn __tdx_hypercall_failed(void);
bool tdx_accept_memory(phys_addr_t start, phys_addr_t end);
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index c31c63341..4fab2ed45 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -17,10 +17,8 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
/* cpus sharing the last level cache: */
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
-DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
-DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
-DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
struct task_struct;
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 1be13b2df..64df897c0 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -37,8 +37,6 @@ extern int phys_to_target_node(phys_addr_t start);
#define phys_to_target_node phys_to_target_node
extern int memory_add_physaddr_to_nid(u64 start);
#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
-extern int numa_fill_memblks(u64 start, u64 end);
-#define numa_fill_memblks numa_fill_memblks
#endif
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index cb0386fc4..c648502e4 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -4,6 +4,7 @@
#include <linux/thread_info.h>
#include <asm/nospec-branch.h>
+#include <asm/msr.h>
/*
* On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
@@ -76,6 +77,16 @@ static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
}
+/*
+ * This can be used in noinstr functions & should only be called in bare
+ * metal context.
+ */
+static __always_inline void __update_spec_ctrl(u64 val)
+{
+ __this_cpu_write(x86_spec_ctrl_current, val);
+ native_wrmsrl(MSR_IA32_SPEC_CTRL, val);
+}
+
#ifdef CONFIG_SMP
extern void speculative_store_bypass_ht_init(void);
#else
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index d6cd9344f..09a5461d7 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -205,7 +205,7 @@ static inline void clwb(volatile void *__p)
#ifdef CONFIG_X86_USER_SHADOW_STACK
static inline int write_user_shstk_64(u64 __user *addr, u64 val)
{
- asm_volatile_goto("1: wrussq %[val], (%[addr])\n"
+ asm goto("1: wrussq %[val], (%[addr])\n"
_ASM_EXTABLE(1b, %l[fail])
:: [addr] "r" (addr), [val] "r" (val)
:: fail);
@@ -224,10 +224,10 @@ static inline void serialize(void)
}
/* The dst parameter must be 64-bytes aligned */
-static inline void movdir64b(void __iomem *dst, const void *src)
+static inline void movdir64b(void *dst, const void *src)
{
const struct { char _[64]; } *__src = src;
- struct { char _[64]; } __iomem *__dst = dst;
+ struct { char _[64]; } *__dst = dst;
/*
* MOVDIR64B %(rdx), rax.
@@ -245,6 +245,11 @@ static inline void movdir64b(void __iomem *dst, const void *src)
: "m" (*__src), "a" (__dst), "d" (__src));
}
+static inline void movdir64b_io(void __iomem *dst, const void *src)
+{
+ movdir64b((void __force *)dst, src);
+}
+
/**
* enqcmds - Enqueue a command in supervisor (CPL0) mode
* @dst: destination, in MMIO space (must be 512-bit aligned)
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index a800abb1a..d8416b3bf 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -12,11 +12,6 @@
/* image of the saved processor state */
struct saved_context {
- /*
- * On x86_32, all segment registers except gs are saved at kernel
- * entry in pt_regs.
- */
- u16 gs;
unsigned long cr0, cr2, cr3, cr4;
u64 misc_enable;
struct saved_msrs saved_msrs;
@@ -27,6 +22,11 @@ struct saved_context {
unsigned long tr;
unsigned long safety;
unsigned long return_address;
+ /*
+ * On x86_32, all segment registers except gs are saved at kernel
+ * entry in pt_regs.
+ */
+ u16 gs;
bool misc_enable_saved;
} __attribute__((packed));
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 3ac0ffc4f..87a7b917d 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -229,10 +229,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
-#define SVM_VM_CR_VALID_MASK 0x001fULL
-#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
-#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
-
#define SVM_NESTED_CTL_NP_ENABLE BIT(0)
#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
#define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2)
@@ -572,8 +568,6 @@ struct vmcb {
#define SVM_CPUID_FUNC 0x8000000a
-#define SVM_VM_CR_SVM_DISABLE 4
-
#define SVM_SELECTOR_S_SHIFT 4
#define SVM_SELECTOR_DPL_SHIFT 5
#define SVM_SELECTOR_P_SHIFT 7
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 4fb36fba4..2fc7bc386 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -16,19 +16,17 @@
#include <asm/thread_info.h> /* for TS_COMPAT */
#include <asm/unistd.h>
+/* This is used purely for kernel/trace/trace_syscalls.c */
typedef long (*sys_call_ptr_t)(const struct pt_regs *);
extern const sys_call_ptr_t sys_call_table[];
-#if defined(CONFIG_X86_32)
-#define ia32_sys_call_table sys_call_table
-#else
/*
* These may not exist, but still put the prototypes in so we
* can use IS_ENABLED().
*/
-extern const sys_call_ptr_t ia32_sys_call_table[];
-extern const sys_call_ptr_t x32_sys_call_table[];
-#endif
+extern long ia32_sys_call(const struct pt_regs *, unsigned int nr);
+extern long x32_sys_call(const struct pt_regs *, unsigned int nr);
+extern long x64_sys_call(const struct pt_regs *, unsigned int nr);
/*
* Only the low 32 bits of orig_ax are meaningful, so we return int.
@@ -126,12 +124,13 @@ static inline int syscall_get_arch(struct task_struct *task)
? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
}
-void do_syscall_64(struct pt_regs *regs, int nr);
+bool do_syscall_64(struct pt_regs *regs, int nr);
+void do_int80_emulation(struct pt_regs *regs);
#endif /* CONFIG_X86_32 */
void do_int80_syscall_32(struct pt_regs *regs);
-long do_fast_syscall_32(struct pt_regs *regs);
-long do_SYSENTER_32(struct pt_regs *regs);
+bool do_fast_syscall_32(struct pt_regs *regs);
+bool do_SYSENTER_32(struct pt_regs *regs);
#endif /* _ASM_X86_SYSCALL_H */
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 603e6d1e9..eba178996 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -8,6 +8,7 @@
#include <asm/errno.h>
#include <asm/ptrace.h>
+#include <asm/trapnr.h>
#include <asm/shared/tdx.h>
/*
@@ -20,8 +21,19 @@
#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40))
#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000))
+#define TDX_SEAMCALL_GP (TDX_SW_ERROR | X86_TRAP_GP)
+#define TDX_SEAMCALL_UD (TDX_SW_ERROR | X86_TRAP_UD)
+
+/*
+ * TDX module SEAMCALL leaf function error codes
+ */
+#define TDX_SUCCESS 0ULL
+#define TDX_RND_NO_ENTROPY 0x8000020300000000ULL
+
#ifndef __ASSEMBLY__
+#include <uapi/asm/mce.h>
+
/*
* Used by the #VE exception handler to gather the #VE exception
* info from the TDX module. This is a software only structure
@@ -52,6 +64,8 @@ bool tdx_early_handle_ve(struct pt_regs *regs);
int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);
+u64 tdx_hcall_get_quote(u8 *buf, size_t size);
+
#else
static inline void tdx_early_init(void) { };
@@ -72,5 +86,42 @@ static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1,
return -ENODEV;
}
#endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */
+
+#ifdef CONFIG_INTEL_TDX_HOST
+u64 __seamcall(u64 fn, struct tdx_module_args *args);
+u64 __seamcall_ret(u64 fn, struct tdx_module_args *args);
+u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args);
+void tdx_init(void);
+
+#include <asm/archrandom.h>
+
+typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
+
+static inline u64 sc_retry(sc_func_t func, u64 fn,
+ struct tdx_module_args *args)
+{
+ int retry = RDRAND_RETRY_LOOPS;
+ u64 ret;
+
+ do {
+ ret = func(fn, args);
+ } while (ret == TDX_RND_NO_ENTROPY && --retry);
+
+ return ret;
+}
+
+#define seamcall(_fn, _args) sc_retry(__seamcall, (_fn), (_args))
+#define seamcall_ret(_fn, _args) sc_retry(__seamcall_ret, (_fn), (_args))
+#define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args))
+int tdx_cpu_enable(void);
+int tdx_enable(void);
+const char *tdx_dump_mce_info(struct mce *m);
+#else
+static inline void tdx_init(void) { }
+static inline int tdx_cpu_enable(void) { return -ENODEV; }
+static inline int tdx_enable(void) { return -ENODEV; }
+static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
+#endif /* CONFIG_INTEL_TDX_HOST */
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_TDX_H */
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 29832c338..0b70653a9 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -6,18 +6,6 @@
#include <linux/stddef.h>
#include <asm/ptrace.h>
-struct paravirt_patch_site;
-#ifdef CONFIG_PARAVIRT
-void apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end);
-#else
-static inline void apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end)
-{}
-#define __parainstructions NULL
-#define __parainstructions_end NULL
-#endif
-
/*
* Currently, the max observed size in the kernel code is
* JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5.
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 3235ba1e5..5f87f6b9b 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -105,17 +105,17 @@ static inline void setup_node_to_cpumask_map(void) { }
extern const struct cpumask *cpu_coregroup_mask(int cpu);
extern const struct cpumask *cpu_clustergroup_mask(int cpu);
-#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id)
-#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
-#define topology_logical_die_id(cpu) (cpu_data(cpu).logical_die_id)
-#define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id)
-#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
+#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id)
+#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id)
+#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id)
+#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id)
+#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id)
#define topology_ppin(cpu) (cpu_data(cpu).ppin)
extern unsigned int __max_die_per_package;
#ifdef CONFIG_SMP
-#define topology_cluster_id(cpu) (per_cpu(cpu_l2c_id, cpu))
+#define topology_cluster_id(cpu) (cpu_data(cpu).topo.l2c_id)
#define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu))
#define topology_cluster_cpumask(cpu) (cpu_clustergroup_mask(cpu))
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index b1c9cea6b..1f1deaecd 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -14,7 +14,6 @@
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
asmlinkage __visible notrace
struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs);
-void __init trap_init(void);
asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
#endif
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 5c367c129..237dc8cdd 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -133,7 +133,7 @@ extern int __get_user_bad(void);
#ifdef CONFIG_X86_32
#define __put_user_goto_u64(x, addr, label) \
- asm_volatile_goto("\n" \
+ asm goto("\n" \
"1: movl %%eax,0(%1)\n" \
"2: movl %%edx,4(%1)\n" \
_ASM_EXTABLE_UA(1b, %l2) \
@@ -295,7 +295,7 @@ do { \
} while (0)
#define __get_user_asm(x, addr, itype, ltype, label) \
- asm_volatile_goto("\n" \
+ asm_goto_output("\n" \
"1: mov"itype" %[umem],%[output]\n" \
_ASM_EXTABLE_UA(1b, %l2) \
: [output] ltype(x) \
@@ -375,7 +375,7 @@ do { \
__typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
__typeof__(*(_ptr)) __old = *_old; \
__typeof__(*(_ptr)) __new = (_new); \
- asm_volatile_goto("\n" \
+ asm_goto_output("\n" \
"1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
_ASM_EXTABLE_UA(1b, %l[label]) \
: CC_OUT(z) (success), \
@@ -394,7 +394,7 @@ do { \
__typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
__typeof__(*(_ptr)) __old = *_old; \
__typeof__(*(_ptr)) __new = (_new); \
- asm_volatile_goto("\n" \
+ asm_goto_output("\n" \
"1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \
_ASM_EXTABLE_UA(1b, %l[label]) \
: CC_OUT(z) (success), \
@@ -477,7 +477,7 @@ struct __large_struct { unsigned long buf[100]; };
* aliasing issues.
*/
#define __put_user_goto(x, addr, itype, ltype, label) \
- asm_volatile_goto("\n" \
+ asm goto("\n" \
"1: mov"itype" %0,%1\n" \
_ASM_EXTABLE_UA(1b, %l2) \
: : ltype(x), "m" (__m(addr)) \
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 5fa76c2ce..ea877fd83 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -653,7 +653,7 @@ static inline int uv_blade_to_node(int blade)
return uv_socket_to_node(blade);
}
-/* Blade number of current cpu. Numnbered 0 .. <#blades -1> */
+/* Blade number of current cpu. Numbered 0 .. <#blades -1> */
static inline int uv_numa_blade_id(void)
{
return uv_hub_info->numa_blade_id;
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index c81858d90..8e048ca98 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -321,7 +321,7 @@ static __always_inline
u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
{
/*
- * Due to the MSB/Sign-bit being used as invald marker (see
+ * Due to the MSB/Sign-bit being used as invalid marker (see
* arch_vdso_cycles_valid() above), the effective mask is S64_MAX.
*/
u64 delta = (cycles - last) & S64_MAX;
@@ -337,8 +337,6 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
}
#define vdso_calc_delta vdso_calc_delta
-int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts);
-
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index ab60a71a8..472f0263d 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -4,6 +4,7 @@
#include <linux/seqlock.h>
#include <uapi/asm/vsyscall.h>
+#include <asm/page_types.h>
#ifdef CONFIG_X86_VSYSCALL_EMULATION
extern void map_vsyscall(void);
@@ -24,4 +25,13 @@ static inline bool emulate_vsyscall(unsigned long error_code,
}
#endif
+/*
+ * The (legacy) vsyscall page is the long page in the kernel portion
+ * of the address space that has user-accessible permissions.
+ */
+static inline bool is_vsyscall_vaddr(unsigned long vaddr)
+{
+ return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
+}
+
#endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 5240d88db..550dcbbbb 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -30,12 +30,13 @@ struct x86_init_mpparse {
* @reserve_resources: reserve the standard resources for the
* platform
* @memory_setup: platform specific memory setup
- *
+ * @dmi_setup: platform specific DMI setup
*/
struct x86_init_resources {
void (*probe_roms)(void);
void (*reserve_resources)(void);
char *(*memory_setup)(void);
+ void (*dmi_setup)(void);
};
/**
@@ -177,7 +178,7 @@ struct x86_init_ops {
* struct x86_cpuinit_ops - platform specific cpu hotplug setups
* @setup_percpu_clockev: set up the per cpu clock event device
* @early_percpu_clock_init: early init of the per cpu clock event device
- * @fixup_cpu_id: fixup function for cpuinfo_x86::phys_proc_id
+ * @fixup_cpu_id: fixup function for cpuinfo_x86::topo.pkg_id
* @parallel_bringup: Parallel bringup control
*/
struct x86_cpuinit_ops {
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index a90882507..64fbd2dbc 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -62,6 +62,11 @@ void xen_arch_unregister_cpu(int num);
#ifdef CONFIG_PVH
void __init xen_pvh_init(struct boot_params *boot_params);
void __init mem_map_via_hcall(struct boot_params *boot_params_p);
+#ifdef CONFIG_XEN_PVH
+void __init xen_reserve_extra_memory(struct boot_params *bootp);
+#else
+static inline void xen_reserve_extra_memory(struct boot_params *bootp) { }
+#endif
#endif
/* Lazy mode for batching updates / context switch */
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
index c599ec269..c10f279aa 100644
--- a/arch/x86/include/asm/xen/interface_64.h
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -61,7 +61,7 @@
* RING1 -> RING3 kernel mode.
* RING2 -> RING3 kernel mode.
* RING3 -> RING3 user mode.
- * However RING0 indicates that the guest kernel should return to iteself
+ * However RING0 indicates that the guest kernel should return to itself
* directly with
* orb $3,1*8(%rsp)
* iretq
diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h
index 769b93944..e5d182c73 100644
--- a/arch/x86/include/uapi/asm/amd_hsmp.h
+++ b/arch/x86/include/uapi/asm/amd_hsmp.h
@@ -47,6 +47,9 @@ enum hsmp_message_ids {
HSMP_SET_PCI_RATE, /* 20h Control link rate on PCIe devices */
HSMP_SET_POWER_MODE, /* 21h Select power efficiency profile policy */
HSMP_SET_PSTATE_MAX_MIN, /* 22h Set the max and min DF P-State */
+ HSMP_GET_METRIC_TABLE_VER, /* 23h Get metrics table version */
+ HSMP_GET_METRIC_TABLE, /* 24h Get metrics table */
+ HSMP_GET_METRIC_TABLE_DRAM_ADDR,/* 25h Get metrics table dram address */
HSMP_MSG_ID_MAX,
};
@@ -64,6 +67,14 @@ enum hsmp_msg_type {
HSMP_GET = 1,
};
+enum hsmp_proto_versions {
+ HSMP_PROTO_VER2 = 2,
+ HSMP_PROTO_VER3,
+ HSMP_PROTO_VER4,
+ HSMP_PROTO_VER5,
+ HSMP_PROTO_VER6
+};
+
struct hsmp_msg_desc {
int num_args;
int response_sz;
@@ -227,7 +238,7 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = {
/*
* HSMP_GET_DIMM_THERMAL, num_args = 1, response_sz = 1
* input: args[0] = DIMM address[7:0]
- * output: args[0] = temperature in degree celcius[31:21] + update rate in ms[16:8] +
+ * output: args[0] = temperature in degree celsius[31:21] + update rate in ms[16:8] +
* DIMM address[7:0]
*/
{1, 1, HSMP_GET},
@@ -295,6 +306,104 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = {
* input: args[0] = min df pstate[15:8] + max df pstate[7:0]
*/
{1, 0, HSMP_SET},
+
+ /*
+ * HSMP_GET_METRIC_TABLE_VER, num_args = 0, response_sz = 1
+ * output: args[0] = metrics table version
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_METRIC_TABLE, num_args = 0, response_sz = 0
+ */
+ {0, 0, HSMP_GET},
+
+ /*
+ * HSMP_GET_METRIC_TABLE_DRAM_ADDR, num_args = 0, response_sz = 2
+ * output: args[0] = lower 32 bits of the address
+ * output: args[1] = upper 32 bits of the address
+ */
+ {0, 2, HSMP_GET},
+};
+
+/* Metrics table (supported only with proto version 6) */
+struct hsmp_metric_table {
+ __u32 accumulation_counter;
+
+ /* TEMPERATURE */
+ __u32 max_socket_temperature;
+ __u32 max_vr_temperature;
+ __u32 max_hbm_temperature;
+ __u64 max_socket_temperature_acc;
+ __u64 max_vr_temperature_acc;
+ __u64 max_hbm_temperature_acc;
+
+ /* POWER */
+ __u32 socket_power_limit;
+ __u32 max_socket_power_limit;
+ __u32 socket_power;
+
+ /* ENERGY */
+ __u64 timestamp;
+ __u64 socket_energy_acc;
+ __u64 ccd_energy_acc;
+ __u64 xcd_energy_acc;
+ __u64 aid_energy_acc;
+ __u64 hbm_energy_acc;
+
+ /* FREQUENCY */
+ __u32 cclk_frequency_limit;
+ __u32 gfxclk_frequency_limit;
+ __u32 fclk_frequency;
+ __u32 uclk_frequency;
+ __u32 socclk_frequency[4];
+ __u32 vclk_frequency[4];
+ __u32 dclk_frequency[4];
+ __u32 lclk_frequency[4];
+ __u64 gfxclk_frequency_acc[8];
+ __u64 cclk_frequency_acc[96];
+
+ /* FREQUENCY RANGE */
+ __u32 max_cclk_frequency;
+ __u32 min_cclk_frequency;
+ __u32 max_gfxclk_frequency;
+ __u32 min_gfxclk_frequency;
+ __u32 fclk_frequency_table[4];
+ __u32 uclk_frequency_table[4];
+ __u32 socclk_frequency_table[4];
+ __u32 vclk_frequency_table[4];
+ __u32 dclk_frequency_table[4];
+ __u32 lclk_frequency_table[4];
+ __u32 max_lclk_dpm_range;
+ __u32 min_lclk_dpm_range;
+
+ /* XGMI */
+ __u32 xgmi_width;
+ __u32 xgmi_bitrate;
+ __u64 xgmi_read_bandwidth_acc[8];
+ __u64 xgmi_write_bandwidth_acc[8];
+
+ /* ACTIVITY */
+ __u32 socket_c0_residency;
+ __u32 socket_gfx_busy;
+ __u32 dram_bandwidth_utilization;
+ __u64 socket_c0_residency_acc;
+ __u64 socket_gfx_busy_acc;
+ __u64 dram_bandwidth_acc;
+ __u32 max_dram_bandwidth;
+ __u64 dram_bandwidth_utilization_acc;
+ __u64 pcie_bandwidth_acc[4];
+
+ /* THROTTLERS */
+ __u32 prochot_residency_acc;
+ __u32 ppt_residency_acc;
+ __u32 socket_thm_residency_acc;
+ __u32 vr_thm_residency_acc;
+ __u32 hbm_thm_residency_acc;
+ __u32 spare;
+
+ /* New items at the end to maintain driver compatibility */
+ __u32 gfxclk_frequency[8];
};
/* Reset to default packing */
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 01d19fc22..eeea058cf 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -38,6 +38,7 @@
#define XLF_EFI_KEXEC (1<<4)
#define XLF_5LEVEL (1<<5)
#define XLF_5LEVEL_ENABLED (1<<6)
+#define XLF_MEM_ENCRYPTION (1<<7)
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 1a6a1f987..a448d0964 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -562,4 +562,7 @@ struct kvm_pmu_event_filter {
/* x86-specific KVM_EXIT_HYPERCALL flags. */
#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0)
+#define KVM_X86_DEFAULT_VM 0
+#define KVM_X86_SW_PROTECTED_VM 1
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h
index 777c3a0f4..f77734645 100644
--- a/arch/x86/include/uapi/asm/signal.h
+++ b/arch/x86/include/uapi/asm/signal.h
@@ -4,7 +4,6 @@
#ifndef __ASSEMBLY__
#include <linux/types.h>
-#include <linux/time.h>
#include <linux/compiler.h>
/* Avoid too many header ordering problems. */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3269a0e23..0000325ab 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -16,6 +16,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg
+CFLAGS_REMOVE_head32.o = -pg
CFLAGS_REMOVE_sev.o = -pg
CFLAGS_REMOVE_rethook.o = -pg
endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c55c0ef47..85a3ce2a3 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -63,6 +63,7 @@ int acpi_fix_pin2_polarity __initdata;
#ifdef CONFIG_X86_LOCAL_APIC
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+static bool has_lapic_cpus __initdata;
static bool acpi_support_online_capable;
#endif
@@ -233,6 +234,14 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
return 0;
/*
+ * According to https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-local-x2apic-structure
+ * when MADT provides both valid LAPIC and x2APIC entries, the APIC ID
+ * in x2APIC must be equal or greater than 0xff.
+ */
+ if (has_lapic_cpus && apic_id < 0xff)
+ return 0;
+
+ /*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
* cpus_possible_map more accurately, to permit
@@ -284,6 +293,7 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
processor->processor_id, /* ACPI ID */
processor->lapic_flags & ACPI_MADT_ENABLED);
+ has_lapic_cpus = true;
return 0;
}
@@ -362,7 +372,7 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
}
#ifdef CONFIG_X86_64
-static int acpi_wakeup_cpu(int apicid, unsigned long start_ip)
+static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
{
/*
* Remap mailbox memory only for the first call to acpi_wakeup_cpu().
@@ -859,7 +869,7 @@ int acpi_unmap_cpu(int cpu)
set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
#endif
- per_cpu(x86_cpu_to_apicid, cpu) = -1;
+ per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
set_cpu_present(cpu, false);
num_processors--;
@@ -1114,10 +1124,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
static int __init acpi_parse_madt_lapic_entries(void)
{
- int count;
- int x2count = 0;
- int ret;
- struct acpi_subtable_proc madt_proc[2];
+ int count, x2count = 0;
if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
@@ -1126,21 +1133,10 @@ static int __init acpi_parse_madt_lapic_entries(void)
acpi_parse_sapic, MAX_LOCAL_APIC);
if (!count) {
- memset(madt_proc, 0, sizeof(madt_proc));
- madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
- madt_proc[0].handler = acpi_parse_lapic;
- madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
- madt_proc[1].handler = acpi_parse_x2apic;
- ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
- sizeof(struct acpi_table_madt),
- madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
- if (ret < 0) {
- pr_err("Error parsing LAPIC/X2APIC entries\n");
- return ret;
- }
-
- count = madt_proc[0].count;
- x2count = madt_proc[1].count;
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
+ acpi_parse_lapic, MAX_LOCAL_APIC);
+ x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
+ acpi_parse_x2apic, MAX_LOCAL_APIC);
}
if (!count && !x2count) {
pr_err("No LAPIC entries present\n");
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index 8d8752b44..ff8f25fac 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -20,7 +20,7 @@ bool cpc_supported_by_cpu(void)
(boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f)))
return true;
else if (boot_cpu_data.x86 == 0x17 &&
- boot_cpu_data.x86_model >= 0x70 && boot_cpu_data.x86_model <= 0x7f)
+ boot_cpu_data.x86_model >= 0x30 && boot_cpu_data.x86_model <= 0x7f)
return true;
return boot_cpu_has(X86_FEATURE_CPPC);
}
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index aae7456ec..1d85cb707 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -30,6 +30,7 @@
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>
+#include <asm/cfi.h>
int __read_mostly alternatives_patched;
@@ -160,7 +161,6 @@ extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
extern s32 __cfi_sites[], __cfi_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
-extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);
@@ -395,6 +395,63 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
}
}
+/* Low-level backend functions usable from alternative code replacements. */
+DEFINE_ASM_FUNC(nop_func, "", .entry.text);
+EXPORT_SYMBOL_GPL(nop_func);
+
+noinstr void BUG_func(void)
+{
+ BUG();
+}
+EXPORT_SYMBOL(BUG_func);
+
+#define CALL_RIP_REL_OPCODE 0xff
+#define CALL_RIP_REL_MODRM 0x15
+
+/*
+ * Rewrite the "call BUG_func" replacement to point to the target of the
+ * indirect pv_ops call "call *disp(%ip)".
+ */
+static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
+{
+ void *target, *bug = &BUG_func;
+ s32 disp;
+
+ if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
+ BUG();
+ }
+
+ if (a->instrlen != 6 ||
+ instr[0] != CALL_RIP_REL_OPCODE ||
+ instr[1] != CALL_RIP_REL_MODRM) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
+ BUG();
+ }
+
+ /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
+ disp = *(s32 *)(instr + 2);
+#ifdef CONFIG_X86_64
+ /* ff 15 00 00 00 00 call *0x0(%rip) */
+ /* target address is stored at "next instruction + disp". */
+ target = *(void **)(instr + a->instrlen + disp);
+#else
+ /* ff 15 00 00 00 00 call *0x0 */
+ /* target address is stored at disp. */
+ target = *(void **)disp;
+#endif
+ if (!target)
+ target = bug;
+
+ /* (BUG_func - .) + (target - BUG_func) := target - . */
+ *(s32 *)(insn_buff + 1) += target - bug;
+
+ if (target == &nop_func)
+ return 0;
+
+ return 5;
+}
+
/*
* Replace instructions with better alternatives for this CPU type. This runs
* before SMP is initialized to avoid SMP problems with self modifying code.
@@ -452,16 +509,21 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
continue;
}
- DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
- (a->flags & ALT_FLAG_NOT) ? "!" : "",
+ DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
a->cpuid >> 5,
a->cpuid & 0x1f,
instr, instr, a->instrlen,
- replacement, a->replacementlen);
+ replacement, a->replacementlen, a->flags);
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
+ if (a->flags & ALT_FLAG_DIRECT_CALL) {
+ insn_buff_sz = alt_replace_call(instr, insn_buff, a);
+ if (insn_buff_sz < 0)
+ continue;
+ }
+
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
@@ -842,15 +904,82 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
#endif /* CONFIG_X86_KERNEL_IBT */
#ifdef CONFIG_FINEIBT
+#define __CFI_DEFAULT CFI_DEFAULT
+#elif defined(CONFIG_CFI_CLANG)
+#define __CFI_DEFAULT CFI_KCFI
+#else
+#define __CFI_DEFAULT CFI_OFF
+#endif
-enum cfi_mode {
- CFI_DEFAULT,
- CFI_OFF,
- CFI_KCFI,
- CFI_FINEIBT,
-};
+enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+
+#ifdef CONFIG_CFI_CLANG
+struct bpf_insn;
+
+/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
+extern unsigned int __bpf_prog_runX(const void *ctx,
+ const struct bpf_insn *insn);
+
+/*
+ * Force a reference to the external symbol so the compiler generates
+ * __kcfi_typid.
+ */
+__ADDRESSABLE(__bpf_prog_runX);
+
+/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
+asm (
+" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
+" .type cfi_bpf_hash,@object \n"
+" .globl cfi_bpf_hash \n"
+" .p2align 2, 0x0 \n"
+"cfi_bpf_hash: \n"
+" .long __kcfi_typeid___bpf_prog_runX \n"
+" .size cfi_bpf_hash, 4 \n"
+" .popsection \n"
+);
+
+/* Must match bpf_callback_t */
+extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
+
+__ADDRESSABLE(__bpf_callback_fn);
+
+/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
+asm (
+" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
+" .type cfi_bpf_subprog_hash,@object \n"
+" .globl cfi_bpf_subprog_hash \n"
+" .p2align 2, 0x0 \n"
+"cfi_bpf_subprog_hash: \n"
+" .long __kcfi_typeid___bpf_callback_fn \n"
+" .size cfi_bpf_subprog_hash, 4 \n"
+" .popsection \n"
+);
+
+u32 cfi_get_func_hash(void *func)
+{
+ u32 hash;
+
+ func -= cfi_get_offset();
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ func += 7;
+ break;
+ case CFI_KCFI:
+ func += 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if (get_kernel_nofault(hash, func))
+ return 0;
+
+ return hash;
+}
+#endif
+
+#ifdef CONFIG_FINEIBT
-static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
static bool cfi_rand __ro_after_init = true;
static u32 cfi_seed __ro_after_init;
@@ -1159,8 +1288,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
goto err;
if (cfi_rand) {
- if (builtin)
+ if (builtin) {
cfi_seed = get_random_u32();
+ cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
+ cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
+ }
ret = cfi_rand_preamble(start_cfi, end_cfi);
if (ret)
@@ -1421,46 +1553,6 @@ int alternatives_text_reserved(void *start, void *end)
}
#endif /* CONFIG_SMP */
-#ifdef CONFIG_PARAVIRT
-
-/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-static void __init_or_module add_nops(void *insns, unsigned int len)
-{
- while (len > 0) {
- unsigned int noplen = len;
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
- memcpy(insns, x86_nops[noplen], noplen);
- insns += noplen;
- len -= noplen;
- }
-}
-
-void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end)
-{
- struct paravirt_patch_site *p;
- char insn_buff[MAX_PATCH_LEN];
-
- for (p = start; p < end; p++) {
- unsigned int used;
-
- BUG_ON(p->len > MAX_PATCH_LEN);
- /* prep the buffer with the original instructions */
- memcpy(insn_buff, p->instr, p->len);
- used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
-
- BUG_ON(used > p->len);
-
- /* Pad the rest with nops */
- add_nops(insn_buff + used, p->len - used);
- text_poke_early(p->instr, insn_buff, p->len);
- }
-}
-extern struct paravirt_patch_site __start_parainstructions[],
- __stop_parainstructions[];
-#endif /* CONFIG_PARAVIRT */
-
/*
* Self-test for the INT3 based CALL emulation code.
*
@@ -1596,28 +1688,11 @@ void __init alternative_instructions(void)
*/
/*
- * Paravirt patching and alternative patching can be combined to
- * replace a function call with a short direct code sequence (e.g.
- * by setting a constant return value instead of doing that in an
- * external function).
- * In order to make this work the following sequence is required:
- * 1. set (artificial) features depending on used paravirt
- * functions which can later influence alternative patching
- * 2. apply paravirt patching (generally replacing an indirect
- * function call with a direct one)
- * 3. apply alternative patching (e.g. replacing a direct function
- * call with a custom code sequence)
- * Doing paravirt patching after alternative patching would clobber
- * the optimization of the custom code with a function call again.
+ * Make sure to set (artificial) features depending on used paravirt
+ * functions which can later influence alternative patching.
*/
paravirt_set_cap();
- /*
- * First patch paravirt functions, such that we overwrite the indirect
- * call with the direct call.
- */
- apply_paravirt(__parainstructions, __parainstructions_end);
-
__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
__cfi_sites, __cfi_sites_end, true);
@@ -1628,10 +1703,6 @@ void __init alternative_instructions(void)
apply_retpolines(__retpoline_sites, __retpoline_sites_end);
apply_returns(__return_sites, __return_sites_end);
- /*
- * Then patch alternatives, such that those paravirt calls that are in
- * alternatives can be overwritten by their immediate fragments.
- */
apply_alternatives(__alt_instructions, __alt_instructions_end);
/*
@@ -1906,7 +1977,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
* Note that the caller must ensure that if the modified code is part of a
* module, the module would not be removed during poking. This can be achieved
* by registering a module notifier, and ordering module removal and patching
- * trough a mutex.
+ * through a mutex.
*/
void *text_poke(void *addr, const void *opcode, size_t len)
{
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 56a917df4..2ae98f754 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -776,7 +776,7 @@ int __init gart_iommu_init(void)
iommu_size >> PAGE_SHIFT);
/*
* Tricky. The GART table remaps the physical memory range,
- * so the CPU wont notice potential aliases and if the memory
+ * so the CPU won't notice potential aliases and if the memory
* is remapped to UC later on, we might surprise the PCI devices
* with a stray writeout of a cacheline. So play it sure and
* do an explicit, full-scale wbinvd() _after_ having marked all
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index cab4d8b15..053f6dcc6 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -27,6 +27,7 @@
#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a
#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507
#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb
+#define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
@@ -43,6 +44,7 @@
#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc
#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4
#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4
+#define PCI_DEVICE_ID_AMD_MI300_DF_F4 0x152c
/* Protect the PCI config register pairs used for SMN. */
static DEFINE_MUTEX(smn_mutex);
@@ -62,6 +64,7 @@ static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) },
{}
};
@@ -93,6 +96,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) },
{}
};
@@ -118,6 +122,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) },
{}
};
@@ -389,7 +394,7 @@ int amd_get_subcaches(int cpu)
pci_read_config_dword(link, 0x1d4, &mask);
- return (mask >> (4 * cpu_data(cpu).cpu_core_id)) & 0xf;
+ return (mask >> (4 * cpu_data(cpu).topo.core_id)) & 0xf;
}
int amd_set_subcaches(int cpu, unsigned long mask)
@@ -415,7 +420,7 @@ int amd_set_subcaches(int cpu, unsigned long mask)
pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
}
- cuid = cpu_data(cpu).cpu_core_id;
+ cuid = cpu_data(cpu).topo.core_id;
mask <<= 4 * cuid;
mask |= (0xf ^ (1 << cuid)) << 26;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 4feaa670d..89c0c8a3f 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -259,10 +259,9 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
order);
}
- /* No multi-function device? */
type = read_pci_config_byte(bus, slot, func,
PCI_HEADER_TYPE);
- if (!(type & 0x80))
+ if (!(type & PCI_HEADER_TYPE_MFD))
break;
}
}
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 2ee867d79..3bf0487cf 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -4,7 +4,7 @@
#
# Leads to non-deterministic coverage that is not a function of syscall inputs.
-# In particualr, smp_apic_timer_interrupt() is called in random places.
+# In particular, smp_apic_timer_interrupt() is called in random places.
KCOV_INSTRUMENT := n
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 3cdf48493..a04a5163e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -72,7 +72,7 @@ unsigned int num_processors;
unsigned disabled_cpus;
/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid __ro_after_init = -1U;
+u32 boot_cpu_physical_apicid __ro_after_init = BAD_APICID;
EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
u8 boot_cpu_apic_version __ro_after_init;
@@ -87,7 +87,7 @@ physid_mask_t phys_cpu_present_map;
* disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
* avoid undefined behaviour caused by sending INIT from AP to BSP.
*/
-static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID;
+static u32 disabled_cpu_apicid __ro_after_init = BAD_APICID;
/*
* This variable controls which CPUs receive external NMIs. By default,
@@ -111,7 +111,7 @@ static inline bool apic_accessible(void)
/*
* Map cpu index to physical APIC ID
*/
-DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
@@ -782,7 +782,7 @@ bool __init apic_needs_pit(void)
/*
* If interrupt delivery mode is legacy PIC or virtual wire without
- * configuration, the local APIC timer wont be set up. Make sure
+ * configuration, the local APIC timer won't be set up. Make sure
* that the PIT is initialized.
*/
if (apic_intr_mode == APIC_PIC ||
@@ -1724,11 +1724,11 @@ static int x2apic_state;
static bool x2apic_hw_locked(void)
{
- u64 ia32_cap;
+ u64 x86_arch_cap_msr;
u64 msr;
- ia32_cap = x86_read_arch_cap_msr();
- if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) {
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+ if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) {
rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr);
return (msr & LEGACY_XAPIC_DISABLED);
}
@@ -1765,7 +1765,7 @@ static void __x2apic_enable(void)
static int __init setup_nox2apic(char *str)
{
if (x2apic_enabled()) {
- int apicid = native_apic_msr_read(APIC_ID);
+ u32 apicid = native_apic_msr_read(APIC_ID);
if (apicid >= 255) {
pr_warn("Apicid: %08x, cannot enforce nox2apic\n",
@@ -1808,7 +1808,7 @@ void x2apic_setup(void)
__x2apic_enable();
}
-static __init void apic_set_fixmap(void);
+static __init void apic_set_fixmap(bool read_apic);
static __init void x2apic_disable(void)
{
@@ -1830,7 +1830,12 @@ static __init void x2apic_disable(void)
}
__x2apic_disable();
- apic_set_fixmap();
+ /*
+ * Don't reread the APIC ID as it was already done from
+ * check_x2apic() and the APIC driver still is a x2APIC variant,
+ * which fails to do the read after x2APIC was disabled.
+ */
+ apic_set_fixmap(false);
}
static __init void x2apic_enable(void)
@@ -2095,13 +2100,14 @@ void __init init_apic_mappings(void)
}
}
-static __init void apic_set_fixmap(void)
+static __init void apic_set_fixmap(bool read_apic)
{
set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
apic_mmio_base = APIC_BASE;
apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
apic_mmio_base, mp_lapic_addr);
- apic_read_boot_cpu_id(false);
+ if (read_apic)
+ apic_read_boot_cpu_id(false);
}
void __init register_lapic_address(unsigned long address)
@@ -2111,7 +2117,7 @@ void __init register_lapic_address(unsigned long address)
mp_lapic_addr = address;
if (!x2apic_mode)
- apic_set_fixmap();
+ apic_set_fixmap(true);
}
/*
@@ -2318,13 +2324,11 @@ static int nr_logical_cpuids = 1;
/*
* Used to store mapping between logical CPU IDs and APIC IDs.
*/
-int cpuid_to_apicid[] = {
- [0 ... NR_CPUS - 1] = -1,
-};
+u32 cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = BAD_APICID, };
bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
{
- return phys_id == cpuid_to_apicid[cpu];
+ return phys_id == (u64)cpuid_to_apicid[cpu];
}
#ifdef CONFIG_SMP
@@ -2393,7 +2397,7 @@ static int allocate_logical_cpuid(int apicid)
return nr_logical_cpuids++;
}
-static void cpu_update_apic(int cpu, int apicid)
+static void cpu_update_apic(int cpu, u32 apicid)
{
#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
@@ -2546,7 +2550,7 @@ static struct {
*/
int active;
/* r/w apic fields */
- unsigned int apic_id;
+ u32 apic_id;
unsigned int apic_taskpri;
unsigned int apic_ldr;
unsigned int apic_dfr;
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
index 7bc5d9bf5..8a0014107 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -18,7 +18,7 @@ u32 apic_flat_calc_apicid(unsigned int cpu)
return 1U << cpu;
}
-bool default_check_apicid_used(physid_mask_t *map, int apicid)
+bool default_check_apicid_used(physid_mask_t *map, u32 apicid)
{
return physid_isset(apicid, *map);
}
@@ -28,7 +28,7 @@ void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
*retmap = *phys_map;
}
-int default_cpu_present_to_apicid(int mps_cpu)
+u32 default_cpu_present_to_apicid(int mps_cpu)
{
if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
return (int)per_cpu(x86_cpu_to_apicid, mps_cpu);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 032a84e2c..b295a056a 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -56,17 +56,17 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
_flat_send_IPI_mask(mask, vector);
}
-static unsigned int flat_get_apic_id(unsigned long x)
+static u32 flat_get_apic_id(u32 x)
{
return (x >> 24) & 0xFF;
}
-static u32 set_apic_id(unsigned int id)
+static u32 set_apic_id(u32 id)
{
return (id & 0xFF) << 24;
}
-static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
+static u32 flat_phys_pkg_id(u32 initial_apic_id, int index_msb)
{
return initial_apic_id >> index_msb;
}
@@ -82,7 +82,6 @@ static struct apic apic_flat __ro_after_init = {
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
.apic_id_registered = default_apic_id_registered,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
@@ -103,6 +102,7 @@ static struct apic apic_flat __ro_after_init = {
.send_IPI_allbutself = default_send_IPI_allbutself,
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
+ .nmi_to_offline_cpu = true,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
@@ -153,13 +153,10 @@ static struct apic apic_physflat __ro_after_init = {
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
.apic_id_registered = default_apic_id_registered,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
- .check_apicid_used = NULL,
- .ioapic_phys_id_map = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.phys_pkg_id = flat_phys_pkg_id,
@@ -175,6 +172,7 @@ static struct apic apic_physflat __ro_after_init = {
.send_IPI_allbutself = default_send_IPI_allbutself,
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
+ .nmi_to_offline_cpu = true,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 966d7cf10..9f1d553eb 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -18,6 +18,8 @@
#include <asm/apic.h>
+#include "local.h"
+
static void noop_send_IPI(int cpu, int vector) { }
static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
@@ -25,10 +27,10 @@ static void noop_send_IPI_allbutself(int vector) { }
static void noop_send_IPI_all(int vector) { }
static void noop_send_IPI_self(int vector) { }
static void noop_apic_icr_write(u32 low, u32 id) { }
-static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) { return -1; }
+static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { return -1; }
static u64 noop_apic_icr_read(void) { return 0; }
-static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; }
-static unsigned int noop_get_apic_id(unsigned long x) { return 0; }
+static u32 noop_phys_pkg_id(u32 cpuid_apic, int index_msb) { return 0; }
+static u32 noop_get_apic_id(u32 apicid) { return 0; }
static void noop_apic_eoi(void) { }
static u32 noop_apic_read(u32 reg)
@@ -45,7 +47,6 @@ static void noop_apic_write(u32 reg, u32 val)
struct apic apic_noop __ro_after_init = {
.name = "noop",
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 63f3d7be9..7d0c51b9d 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -25,7 +25,7 @@ static const struct apic apic_numachip1;
static const struct apic apic_numachip2;
static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly;
-static unsigned int numachip1_get_apic_id(unsigned long x)
+static u32 numachip1_get_apic_id(u32 x)
{
unsigned long value;
unsigned int id = (x >> 24) & 0xff;
@@ -38,12 +38,12 @@ static unsigned int numachip1_get_apic_id(unsigned long x)
return id;
}
-static u32 numachip1_set_apic_id(unsigned int id)
+static u32 numachip1_set_apic_id(u32 id)
{
return (id & 0xff) << 24;
}
-static unsigned int numachip2_get_apic_id(unsigned long x)
+static u32 numachip2_get_apic_id(u32 x)
{
u64 mcfg;
@@ -51,12 +51,12 @@ static unsigned int numachip2_get_apic_id(unsigned long x)
return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24);
}
-static u32 numachip2_set_apic_id(unsigned int id)
+static u32 numachip2_set_apic_id(u32 id)
{
return id << 24;
}
-static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
+static u32 numachip_phys_pkg_id(u32 initial_apic_id, int index_msb)
{
return initial_apic_id >> index_msb;
}
@@ -71,7 +71,7 @@ static void numachip2_apic_icr_write(int apicid, unsigned int val)
numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val);
}
-static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip)
{
numachip_apic_icr_write(phys_apicid, APIC_DM_INIT);
numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP |
@@ -161,7 +161,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
u64 val;
u32 nodes = 1;
- this_cpu_write(cpu_llc_id, node);
+ c->topo.llc_id = node;
/* Account for nodes per socket in multi-core-module processors */
if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
@@ -169,7 +169,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
nodes = ((val >> 3) & 7) + 1;
}
- c->phys_proc_id = node / nodes;
+ c->topo.pkg_id = node / nodes;
}
static int __init numachip_system_init(void)
@@ -222,7 +222,6 @@ static const struct apic apic_numachip1 __refconst = {
.probe = numachip1_probe,
.acpi_madt_oem_check = numachip1_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
@@ -259,7 +258,6 @@ static const struct apic apic_numachip2 __refconst = {
.probe = numachip2_probe,
.acpi_madt_oem_check = numachip2_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 0e5535add..5a0d60b38 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -13,12 +13,12 @@
#include "local.h"
-static unsigned bigsmp_get_apic_id(unsigned long x)
+static u32 bigsmp_get_apic_id(u32 x)
{
return (x >> 24) & 0xFF;
}
-static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
+static bool bigsmp_check_apicid_used(physid_mask_t *map, u32 apicid)
{
return false;
}
@@ -29,7 +29,7 @@ static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *re
physids_promote(0xFFL, retmap);
}
-static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
+static u32 bigsmp_phys_pkg_id(u32 cpuid_apic, int index_msb)
{
return cpuid_apic >> index_msb;
}
@@ -80,7 +80,6 @@ static struct apic apic_bigsmp __ro_after_init = {
.name = "bigsmp",
.probe = probe_bigsmp,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 1,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 00da6cf6b..40c7cf180 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -997,7 +997,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain,
/*
* Legacy ISA IRQ has already been allocated, just add pin to
* the pin list associated with this IRQ and program the IOAPIC
- * entry. The IOAPIC entry
+ * entry.
*/
if (irq_data && irq_data->parent_data) {
if (!mp_check_pin_attr(irq, info))
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index a44ba7209..5da693d63 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -97,6 +97,14 @@ sendmask:
__apic_send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
}
+void apic_send_nmi_to_offline_cpu(unsigned int cpu)
+{
+ if (WARN_ON_ONCE(!apic->nmi_to_offline_cpu))
+ return;
+ if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, &cpus_booted_once_mask)))
+ return;
+ apic->send_IPI(cpu, NMI_VECTOR);
+}
#endif /* CONFIG_SMP */
static inline int __prepare_ICR2(unsigned int mask)
@@ -281,7 +289,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
}
#ifdef CONFIG_SMP
-static int convert_apicid_to_cpu(int apic_id)
+static int convert_apicid_to_cpu(u32 apic_id)
{
int i;
@@ -294,7 +302,8 @@ static int convert_apicid_to_cpu(int apic_id)
int safe_smp_processor_id(void)
{
- int apicid, cpuid;
+ u32 apicid;
+ int cpuid;
if (!boot_cpu_has(X86_FEATURE_APIC))
return 0;
diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h
index ec219c659..9ea6186ea 100644
--- a/arch/x86/kernel/apic/local.h
+++ b/arch/x86/kernel/apic/local.h
@@ -15,9 +15,9 @@
/* X2APIC */
void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
-unsigned int x2apic_get_apic_id(unsigned long id);
-u32 x2apic_set_apic_id(unsigned int id);
-int x2apic_phys_pkg_id(int initial_apicid, int index_msb);
+u32 x2apic_get_apic_id(u32 id);
+u32 x2apic_set_apic_id(u32 id);
+u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb);
void x2apic_send_IPI_all(int vector);
void x2apic_send_IPI_allbutself(int vector);
@@ -64,6 +64,7 @@ void default_send_IPI_all(int vector);
void default_send_IPI_self(int vector);
bool default_apic_id_registered(void);
+bool default_check_apicid_used(physid_mask_t *map, u32 apicid);
#ifdef CONFIG_X86_32
void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector);
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 9a06df6cd..c0f78059f 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -18,11 +18,21 @@
#include "local.h"
-static int default_phys_pkg_id(int cpuid_apic, int index_msb)
+static u32 default_phys_pkg_id(u32 cpuid_apic, int index_msb)
{
return cpuid_apic >> index_msb;
}
+static u32 default_get_apic_id(u32 x)
+{
+ unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
+
+ if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
+ return (x >> 24) & 0xFF;
+ else
+ return (x >> 24) & 0x0F;
+}
+
/* should be called last. */
static int probe_default(void)
{
@@ -35,7 +45,6 @@ static struct apic apic_default __ro_after_init = {
.probe = probe_default,
.apic_id_registered = default_apic_id_registered,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 319448d87..185738c72 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -738,8 +738,8 @@ int __init arch_probe_nr_irqs(void)
void lapic_assign_legacy_vector(unsigned int irq, bool replace)
{
/*
- * Use assign system here so it wont get accounted as allocated
- * and moveable in the cpu hotplug check and it prevents managed
+ * Use assign system here so it won't get accounted as allocated
+ * and movable in the cpu hotplug check and it prevents managed
* irq reservation from touching it.
*/
irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index affbff65e..28a7d3f23 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -227,7 +227,6 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.probe = x2apic_cluster_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
@@ -251,6 +250,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
+ .nmi_to_offline_cpu = true,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 788cdb4ee..409815a40 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -124,17 +124,17 @@ static int x2apic_phys_probe(void)
return apic == &apic_x2apic_phys;
}
-unsigned int x2apic_get_apic_id(unsigned long id)
+u32 x2apic_get_apic_id(u32 id)
{
return id;
}
-u32 x2apic_set_apic_id(unsigned int id)
+u32 x2apic_set_apic_id(u32 id)
{
return id;
}
-int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb)
{
return initial_apicid >> index_msb;
}
@@ -145,7 +145,6 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
@@ -166,6 +165,7 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
+ .nmi_to_offline_cpu = true,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 205cee567..f1766b18d 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -110,7 +110,7 @@ static void __init early_get_pnodeid(void)
} else if (UVH_RH_GAM_ADDR_MAP_CONFIG) {
union uvh_rh_gam_addr_map_config_u m_n_config;
- m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG);
+ m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG);
uv_cpuid.n_skt = m_n_config.s.n_skt;
if (is_uv(UV3))
uv_cpuid.m_skt = m_n_config.s3.m_skt;
@@ -701,7 +701,7 @@ static __init void build_uv_gr_table(void)
}
}
-static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip)
{
unsigned long val;
int pnode;
@@ -779,7 +779,7 @@ static void uv_send_IPI_all(int vector)
uv_send_IPI_mask(cpu_online_mask, vector);
}
-static u32 set_apic_id(unsigned int id)
+static u32 set_apic_id(u32 id)
{
return id;
}
@@ -789,7 +789,7 @@ static unsigned int uv_read_apic_id(void)
return x2apic_get_apic_id(apic_read(APIC_ID));
}
-static int uv_phys_pkg_id(int initial_apicid, int index_msb)
+static u32 uv_phys_pkg_id(u32 initial_apicid, int index_msb)
{
return uv_read_apic_id() >> index_msb;
}
@@ -805,7 +805,6 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.probe = uv_probe,
.acpi_madt_oem_check = uv_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5934ee5bc..76a5ced27 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -420,7 +420,7 @@ static DEFINE_MUTEX(apm_mutex);
* This is for buggy BIOS's that refer to (real mode) segment 0x40
* even though they are called in protected mode.
*/
-static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092,
+static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(DESC_DATA32_BIOS,
(unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1);
static const char driver_version[] = "1.16ac"; /* no spaces */
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index dc3576303..6913b372c 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -68,26 +68,19 @@ static void __used common(void)
#endif
BLANK();
- OFFSET(TDX_MODULE_rcx, tdx_module_output, rcx);
- OFFSET(TDX_MODULE_rdx, tdx_module_output, rdx);
- OFFSET(TDX_MODULE_r8, tdx_module_output, r8);
- OFFSET(TDX_MODULE_r9, tdx_module_output, r9);
- OFFSET(TDX_MODULE_r10, tdx_module_output, r10);
- OFFSET(TDX_MODULE_r11, tdx_module_output, r11);
-
- BLANK();
- OFFSET(TDX_HYPERCALL_r8, tdx_hypercall_args, r8);
- OFFSET(TDX_HYPERCALL_r9, tdx_hypercall_args, r9);
- OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10);
- OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11);
- OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12);
- OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13);
- OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14);
- OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15);
- OFFSET(TDX_HYPERCALL_rdi, tdx_hypercall_args, rdi);
- OFFSET(TDX_HYPERCALL_rsi, tdx_hypercall_args, rsi);
- OFFSET(TDX_HYPERCALL_rbx, tdx_hypercall_args, rbx);
- OFFSET(TDX_HYPERCALL_rdx, tdx_hypercall_args, rdx);
+ OFFSET(TDX_MODULE_rcx, tdx_module_args, rcx);
+ OFFSET(TDX_MODULE_rdx, tdx_module_args, rdx);
+ OFFSET(TDX_MODULE_r8, tdx_module_args, r8);
+ OFFSET(TDX_MODULE_r9, tdx_module_args, r9);
+ OFFSET(TDX_MODULE_r10, tdx_module_args, r10);
+ OFFSET(TDX_MODULE_r11, tdx_module_args, r11);
+ OFFSET(TDX_MODULE_r12, tdx_module_args, r12);
+ OFFSET(TDX_MODULE_r13, tdx_module_args, r13);
+ OFFSET(TDX_MODULE_r14, tdx_module_args, r14);
+ OFFSET(TDX_MODULE_r15, tdx_module_args, r15);
+ OFFSET(TDX_MODULE_rbx, tdx_module_args, rbx);
+ OFFSET(TDX_MODULE_rdi, tdx_module_args, rdi);
+ OFFSET(TDX_MODULE_rsi, tdx_module_args, rsi);
BLANK();
OFFSET(BP_scratch, boot_params, scratch);
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index faa9f2299..64ad2ddea 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -48,11 +48,6 @@ EXPORT_SYMBOL_GPL(__x86_call_count);
extern s32 __call_sites[], __call_sites_end[];
-struct thunk_desc {
- void *template;
- unsigned int template_size;
-};
-
struct core_text {
unsigned long base;
unsigned long end;
@@ -238,14 +233,13 @@ patch_call_sites(s32 *start, s32 *end, const struct core_text *ct)
}
static __init_or_module void
-patch_paravirt_call_sites(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end,
- const struct core_text *ct)
+patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end,
+ const struct core_text *ct)
{
- struct paravirt_patch_site *p;
+ struct alt_instr *a;
- for (p = start; p < end; p++)
- patch_call(p->instr, ct);
+ for (a = start; a < end; a++)
+ patch_call((void *)&a->instr_offset + a->instr_offset, ct);
}
static __init_or_module void
@@ -253,7 +247,7 @@ callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct)
{
prdbg("Patching call sites %s\n", ct->name);
patch_call_sites(cs->call_start, cs->call_end, ct);
- patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct);
+ patch_alt_call_sites(cs->alt_start, cs->alt_end, ct);
prdbg("Patching call sites done%s\n", ct->name);
}
@@ -262,8 +256,8 @@ void __init callthunks_patch_builtin_calls(void)
struct callthunk_sites cs = {
.call_start = __call_sites,
.call_end = __call_sites_end,
- .pv_start = __parainstructions,
- .pv_end = __parainstructions_end
+ .alt_start = __alt_instructions,
+ .alt_end = __alt_instructions_end
};
if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c
index 8674a5c0c..e6bf78fac 100644
--- a/arch/x86/kernel/cfi.c
+++ b/arch/x86/kernel/cfi.c
@@ -4,10 +4,10 @@
*
* Copyright (C) 2022 Google LLC
*/
-#include <asm/cfi.h>
+#include <linux/string.h>
+#include <linux/cfi.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
-#include <linux/string.h>
/*
* Returns the target address and the expected type when regs->ip points
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4350f6bfc..93eabf544 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -54,6 +54,8 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
obj-$(CONFIG_ACRN_GUEST) += acrn.o
+obj-$(CONFIG_DEBUG_FS) += debugfs.o
+
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 6e4f23f31..0838ea579 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -34,87 +34,6 @@
*/
static u32 nodes_per_socket = 1;
-/*
- * AMD errata checking
- *
- * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
- * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
- * have an OSVW id assigned, which it takes as first argument. Both take a
- * variable number of family-specific model-stepping ranges created by
- * AMD_MODEL_RANGE().
- *
- * Example:
- *
- * const int amd_erratum_319[] =
- * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
- * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
- * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
- */
-
-#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
-#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
-#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
- ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
-#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
-#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
-#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
-
-static const int amd_erratum_400[] =
- AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
- AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
-
-static const int amd_erratum_383[] =
- AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
-
-/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
-static const int amd_erratum_1054[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
-
-static const int amd_zenbleed[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf),
- AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf),
- AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf),
- AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf));
-
-static const int amd_div0[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf),
- AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf));
-
-static const int amd_erratum_1485[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf),
- AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf));
-
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
-{
- int osvw_id = *erratum++;
- u32 range;
- u32 ms;
-
- if (osvw_id >= 0 && osvw_id < 65536 &&
- cpu_has(cpu, X86_FEATURE_OSVW)) {
- u64 osvw_len;
-
- rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
- if (osvw_id < osvw_len) {
- u64 osvw_bits;
-
- rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
- osvw_bits);
- return osvw_bits & (1ULL << (osvw_id & 0x3f));
- }
- }
-
- /* OSVW unavailable or ID unknown, match family-model-stepping range */
- ms = (cpu->x86_model << 4) | cpu->x86_stepping;
- while ((range = *erratum++))
- if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
- (ms >= AMD_MODEL_RANGE_START(range)) &&
- (ms <= AMD_MODEL_RANGE_END(range)))
- return true;
-
- return false;
-}
-
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -382,7 +301,7 @@ static int nearby_node(int apicid)
#endif
/*
- * Fix up cpu_core_id for pre-F17h systems to be in the
+ * Fix up topo::core_id for pre-F17h systems to be in the
* [0 .. cores_per_node - 1] range. Not really needed but
* kept so as not to break existing setups.
*/
@@ -394,7 +313,7 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
return;
cus_per_node = c->x86_max_cores / nodes_per_socket;
- c->cpu_core_id %= cus_per_node;
+ c->topo.core_id %= cus_per_node;
}
/*
@@ -405,8 +324,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
*/
static void amd_get_topology(struct cpuinfo_x86 *c)
{
- int cpu = smp_processor_id();
-
/* get information required for multi-node processors */
if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
int err;
@@ -414,13 +331,13 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- c->cpu_die_id = ecx & 0xff;
+ c->topo.die_id = ecx & 0xff;
if (c->x86 == 0x15)
- c->cu_id = ebx & 0xff;
+ c->topo.cu_id = ebx & 0xff;
if (c->x86 >= 0x17) {
- c->cpu_core_id = ebx & 0xff;
+ c->topo.core_id = ebx & 0xff;
if (smp_num_siblings > 1)
c->x86_max_cores /= smp_num_siblings;
@@ -434,15 +351,14 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
if (!err)
c->x86_coreid_bits = get_count_order(c->x86_max_cores);
- cacheinfo_amd_init_llc_id(c, cpu);
+ cacheinfo_amd_init_llc_id(c);
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- c->cpu_die_id = value & 7;
-
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
+ c->topo.die_id = value & 7;
+ c->topo.llc_id = c->topo.die_id;
} else
return;
@@ -459,15 +375,14 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
static void amd_detect_cmp(struct cpuinfo_x86 *c)
{
unsigned bits;
- int cpu = smp_processor_id();
bits = c->x86_coreid_bits;
/* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+ c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1);
/* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
+ c->topo.pkg_id = c->topo.initial_apicid >> bits;
/* use socket ID also for last level cache */
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
+ c->topo.llc_id = c->topo.die_id = c->topo.pkg_id;
}
u32 amd_get_nodes_per_socket(void)
@@ -481,11 +396,11 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#ifdef CONFIG_NUMA
int cpu = smp_processor_id();
int node;
- unsigned apicid = c->apicid;
+ unsigned apicid = c->topo.apicid;
node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
- node = get_llc_id(cpu);
+ node = per_cpu_llc_id(cpu);
/*
* On multi-fabric platform (e.g. Numascale NumaChip) a
@@ -515,7 +430,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
* through CPU mapping may alter the outcome, directly
* access __apicid_to_node[].
*/
- int ht_nodeid = c->initial_apicid;
+ int ht_nodeid = c->topo.initial_apicid;
if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
@@ -620,6 +535,61 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
}
resctrl_cpu_detect(c);
+
+ /* Figure out Zen generations: */
+ switch (c->x86) {
+ case 0x17:
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x50 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN1);
+ break;
+ case 0x30 ... 0x4f:
+ case 0x60 ... 0x7f:
+ case 0x90 ... 0x91:
+ case 0xa0 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN2);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x19:
+ switch (c->x86_model) {
+ case 0x00 ... 0x0f:
+ case 0x20 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN3);
+ break;
+ case 0x10 ... 0x1f:
+ case 0x60 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN4);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x1a:
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x40 ... 0x4f:
+ case 0x70 ... 0x7f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN5);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ return;
+
+warn:
+ WARN_ONCE(1, "Family 0x%x, model: 0x%x??\n", c->x86, c->x86_model);
}
static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
@@ -743,15 +713,6 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (c->x86 == 0x16 && c->x86_model <= 0xf)
msr_set_bit(MSR_AMD64_LS_CFG, 15);
- /*
- * Check whether the machine is affected by erratum 400. This is
- * used to select the proper idle routine and to enable the check
- * whether the machine is affected in arch_post_acpi_init(), which
- * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
- */
- if (cpu_has_amd_erratum(c, amd_erratum_400))
- set_cpu_bug(c, X86_BUG_AMD_E400);
-
early_detect_mem_encrypt(c);
/* Re-enable TopologyExtensions if switched off by BIOS */
@@ -818,6 +779,16 @@ static void init_amd_k8(struct cpuinfo_x86 *c)
msr_set_bit(MSR_K7_HWCR, 6);
#endif
set_cpu_bug(c, X86_BUG_SWAPGS_FENCE);
+
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x41 ||
+ (c->x86_model == 0x41 && c->x86_stepping >= 0x2))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
}
static void init_amd_gh(struct cpuinfo_x86 *c)
@@ -851,8 +822,17 @@ static void init_amd_gh(struct cpuinfo_x86 *c)
*/
msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
- if (cpu_has_amd_erratum(c, amd_erratum_383))
- set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x2 ||
+ (c->x86_model == 0x2 && c->x86_stepping >= 0x1))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
}
static void init_amd_ln(struct cpuinfo_x86 *c)
@@ -945,6 +925,19 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
clear_rdrand_cpuid_bit(c);
}
+static void fix_erratum_1386(struct cpuinfo_x86 *c)
+{
+ /*
+ * Work around Erratum 1386. The XSAVES instruction malfunctions in
+ * certain circumstances on Zen1/2 uarch, and not all parts have had
+ * updated microcode at the time of writing (March 2023).
+ *
+ * Affected parts all have no supervisor XSAVE states, meaning that
+ * the XSAVEC instruction (which works fine) is equivalent.
+ */
+ clear_cpu_cap(c, X86_FEATURE_XSAVES);
+}
+
void init_spectral_chicken(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_CPU_UNRET_ENTRY
@@ -955,34 +948,28 @@ void init_spectral_chicken(struct cpuinfo_x86 *c)
*
* This suppresses speculation from the middle of a basic block, i.e. it
* suppresses non-branch predictions.
- *
- * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H
*/
- if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) {
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) {
value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT;
wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value);
}
}
#endif
- /*
- * Work around Erratum 1386. The XSAVES instruction malfunctions in
- * certain circumstances on Zen1/2 uarch, and not all parts have had
- * updated microcode at the time of writing (March 2023).
- *
- * Affected parts all have no supervisor XSAVE states, meaning that
- * the XSAVEC instruction (which works fine) is equivalent.
- */
- clear_cpu_cap(c, X86_FEATURE_XSAVES);
}
-static void init_amd_zn(struct cpuinfo_x86 *c)
+static void init_amd_zen_common(void)
{
- set_cpu_cap(c, X86_FEATURE_ZEN);
-
+ setup_force_cpu_cap(X86_FEATURE_ZEN);
#ifdef CONFIG_NUMA
node_reclaim_distance = 32;
#endif
+}
+
+static void init_amd_zen1(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+ fix_erratum_1386(c);
/* Fix up CPUID bits, but only if not virtualised. */
if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
@@ -990,15 +977,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
/* Erratum 1076: CPB feature bit not being set in CPUID. */
if (!cpu_has(c, X86_FEATURE_CPB))
set_cpu_cap(c, X86_FEATURE_CPB);
-
- /*
- * Zen3 (Fam19 model < 0x10) parts are not susceptible to
- * Branch Type Confusion, but predate the allocation of the
- * BTC_NO bit.
- */
- if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO))
- set_cpu_cap(c, X86_FEATURE_BTC_NO);
}
+
+ pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
+ setup_force_cpu_bug(X86_BUG_DIV0);
}
static bool cpu_has_zenbleed_microcode(void)
@@ -1006,15 +988,14 @@ static bool cpu_has_zenbleed_microcode(void)
u32 good_rev = 0;
switch (boot_cpu_data.x86_model) {
- case 0x30 ... 0x3f: good_rev = 0x0830107a; break;
- case 0x60 ... 0x67: good_rev = 0x0860010b; break;
- case 0x68 ... 0x6f: good_rev = 0x08608105; break;
- case 0x70 ... 0x7f: good_rev = 0x08701032; break;
- case 0xa0 ... 0xaf: good_rev = 0x08a00008; break;
+ case 0x30 ... 0x3f: good_rev = 0x0830107b; break;
+ case 0x60 ... 0x67: good_rev = 0x0860010c; break;
+ case 0x68 ... 0x6f: good_rev = 0x08608107; break;
+ case 0x70 ... 0x7f: good_rev = 0x08701033; break;
+ case 0xa0 ... 0xaf: good_rev = 0x08a00009; break;
default:
return false;
- break;
}
if (boot_cpu_data.microcode < good_rev)
@@ -1023,11 +1004,8 @@ static bool cpu_has_zenbleed_microcode(void)
return true;
}
-static void zenbleed_check(struct cpuinfo_x86 *c)
+static void zen2_zenbleed_check(struct cpuinfo_x86 *c)
{
- if (!cpu_has_amd_erratum(c, amd_zenbleed))
- return;
-
if (cpu_has(c, X86_FEATURE_HYPERVISOR))
return;
@@ -1042,8 +1020,46 @@ static void zenbleed_check(struct cpuinfo_x86 *c)
}
}
+static void init_amd_zen2(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+ init_spectral_chicken(c);
+ fix_erratum_1386(c);
+ zen2_zenbleed_check(c);
+}
+
+static void init_amd_zen3(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ /*
+ * Zen3 (Fam19 model < 0x10) parts are not susceptible to
+ * Branch Type Confusion, but predate the allocation of the
+ * BTC_NO bit.
+ */
+ if (!cpu_has(c, X86_FEATURE_BTC_NO))
+ set_cpu_cap(c, X86_FEATURE_BTC_NO);
+ }
+}
+
+static void init_amd_zen4(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR))
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+}
+
+static void init_amd_zen5(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+}
+
static void init_amd(struct cpuinfo_x86 *c)
{
+ u64 vm_cr;
+
early_init_amd(c);
/*
@@ -1060,7 +1076,7 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_FSRS);
/* get apicid instead of initial apic id from cpuid */
- c->apicid = read_apic_id();
+ c->topo.apicid = read_apic_id();
/* K6s reports MCEs but don't actually have all the MSRs */
if (c->x86 < 6)
@@ -1075,11 +1091,19 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
case 0x16: init_amd_jg(c); break;
- case 0x17: init_spectral_chicken(c);
- fallthrough;
- case 0x19: init_amd_zn(c); break;
}
+ if (boot_cpu_has(X86_FEATURE_ZEN1))
+ init_amd_zen1(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN2))
+ init_amd_zen2(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN3))
+ init_amd_zen3(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN4))
+ init_amd_zen4(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN5))
+ init_amd_zen5(c);
+
/*
* Enable workaround for FXSAVE leak on CPUs
* without a XSaveErPtr feature
@@ -1095,6 +1119,14 @@ static void init_amd(struct cpuinfo_x86 *c)
init_amd_cacheinfo(c);
+ if (cpu_has(c, X86_FEATURE_SVM)) {
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+ pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+ clear_cpu_cap(c, X86_FEATURE_SVM);
+ }
+ }
+
if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) {
/*
* Use LFENCE for execution serialization. On families which
@@ -1131,7 +1163,7 @@ static void init_amd(struct cpuinfo_x86 *c)
* Counter May Be Inaccurate".
*/
if (cpu_has(c, X86_FEATURE_IRPERF) &&
- !cpu_has_amd_erratum(c, amd_erratum_1054))
+ (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
check_null_seg_clears_base(c);
@@ -1147,16 +1179,8 @@ static void init_amd(struct cpuinfo_x86 *c)
cpu_has(c, X86_FEATURE_AUTOIBRS))
WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS));
- zenbleed_check(c);
-
- if (cpu_has_amd_erratum(c, amd_div0)) {
- pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
- setup_force_cpu_bug(X86_BUG_DIV0);
- }
-
- if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
- cpu_has_amd_erratum(c, amd_erratum_1485))
- msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+ /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
}
#ifdef CONFIG_X86_32
@@ -1310,7 +1334,7 @@ static void zenbleed_check_cpu(void *unused)
{
struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
- zenbleed_check(c);
+ zen2_zenbleed_check(c);
}
void amd_check_microcode(void)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 0bc55472f..e7ceee008 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -61,9 +61,11 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
EXPORT_SYMBOL_GPL(x86_pred_cmd);
+static u64 __ro_after_init x86_arch_cap_msr;
+
static DEFINE_MUTEX(spec_ctrl_mutex);
-void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
+void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
/* Update SPEC_CTRL MSR and its cached copy unconditionally */
static void update_spec_ctrl(u64 val)
@@ -111,9 +113,6 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
/* Control unconditional IBPB in switch_mm() */
DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
-/* Control MDS CPU buffer clear before returning to user space */
-DEFINE_STATIC_KEY_FALSE(mds_user_clear);
-EXPORT_SYMBOL_GPL(mds_user_clear);
/* Control MDS CPU buffer clear before idling (halt, mwait) */
DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
EXPORT_SYMBOL_GPL(mds_idle_clear);
@@ -147,6 +146,8 @@ void __init cpu_select_mitigations(void)
x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
}
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+
/* Select the proper CPU mitigations before patching alternatives: */
spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
@@ -252,7 +253,7 @@ static void __init mds_select_mitigation(void)
if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
mds_mitigation = MDS_MITIGATION_VMWERV;
- static_branch_enable(&mds_user_clear);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
(mds_nosmt || cpu_mitigations_auto_nosmt()))
@@ -304,8 +305,6 @@ static const char * const taa_strings[] = {
static void __init taa_select_mitigation(void)
{
- u64 ia32_cap;
-
if (!boot_cpu_has_bug(X86_BUG_TAA)) {
taa_mitigation = TAA_MITIGATION_OFF;
return;
@@ -344,9 +343,8 @@ static void __init taa_select_mitigation(void)
* On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
* update is required.
*/
- ia32_cap = x86_read_arch_cap_msr();
- if ( (ia32_cap & ARCH_CAP_MDS_NO) &&
- !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
+ if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) &&
+ !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))
taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
/*
@@ -356,7 +354,7 @@ static void __init taa_select_mitigation(void)
* For guests that can't determine whether the correct microcode is
* present on host, enable the mitigation for UCODE_NEEDED as well.
*/
- static_branch_enable(&mds_user_clear);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
if (taa_nosmt || cpu_mitigations_auto_nosmt())
cpu_smt_disable(false);
@@ -404,8 +402,6 @@ static const char * const mmio_strings[] = {
static void __init mmio_select_mitigation(void)
{
- u64 ia32_cap;
-
if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) ||
boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) ||
cpu_mitigations_off()) {
@@ -416,15 +412,20 @@ static void __init mmio_select_mitigation(void)
if (mmio_mitigation == MMIO_MITIGATION_OFF)
return;
- ia32_cap = x86_read_arch_cap_msr();
-
/*
* Enable CPU buffer clear mitigation for host and VMM, if also affected
* by MDS or TAA. Otherwise, enable mitigation for VMM only.
*/
if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) &&
boot_cpu_has(X86_FEATURE_RTM)))
- static_branch_enable(&mds_user_clear);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+
+ /*
+ * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW based
+ * mitigations, disable KVM-only mitigation in that case.
+ */
+ if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF))
+ static_branch_disable(&mmio_stale_data_clear);
else
static_branch_enable(&mmio_stale_data_clear);
@@ -433,7 +434,7 @@ static void __init mmio_select_mitigation(void)
* be propagated to uncore buffers, clearing the Fill buffers on idle
* is required irrespective of SMT state.
*/
- if (!(ia32_cap & ARCH_CAP_FBSDP_NO))
+ if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
static_branch_enable(&mds_idle_clear);
/*
@@ -443,10 +444,10 @@ static void __init mmio_select_mitigation(void)
* FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
* affected systems.
*/
- if ((ia32_cap & ARCH_CAP_FB_CLEAR) ||
+ if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) ||
(boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
- !(ia32_cap & ARCH_CAP_MDS_NO)))
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)))
mmio_mitigation = MMIO_MITIGATION_VERW;
else
mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
@@ -477,6 +478,57 @@ static int __init mmio_stale_data_parse_cmdline(char *str)
early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "Register File Data Sampling: " fmt
+
+enum rfds_mitigations {
+ RFDS_MITIGATION_OFF,
+ RFDS_MITIGATION_VERW,
+ RFDS_MITIGATION_UCODE_NEEDED,
+};
+
+/* Default mitigation for Register File Data Sampling */
+static enum rfds_mitigations rfds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF;
+
+static const char * const rfds_strings[] = {
+ [RFDS_MITIGATION_OFF] = "Vulnerable",
+ [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File",
+ [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+};
+
+static void __init rfds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) {
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ return;
+ }
+ if (rfds_mitigation == RFDS_MITIGATION_OFF)
+ return;
+
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ else
+ rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED;
+}
+
+static __init int rfds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
+ return 0;
+}
+early_param("reg_file_data_sampling", rfds_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "" fmt
static void __init md_clear_update_mitigation(void)
@@ -484,12 +536,12 @@ static void __init md_clear_update_mitigation(void)
if (cpu_mitigations_off())
return;
- if (!static_key_enabled(&mds_user_clear))
+ if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF))
goto out;
/*
- * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data
- * mitigation, if necessary.
+ * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO
+ * Stale Data mitigation, if necessary.
*/
if (mds_mitigation == MDS_MITIGATION_OFF &&
boot_cpu_has_bug(X86_BUG_MDS)) {
@@ -501,11 +553,19 @@ static void __init md_clear_update_mitigation(void)
taa_mitigation = TAA_MITIGATION_VERW;
taa_select_mitigation();
}
- if (mmio_mitigation == MMIO_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
+ /*
+ * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear
+ * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state.
+ */
+ if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
mmio_mitigation = MMIO_MITIGATION_VERW;
mmio_select_mitigation();
}
+ if (rfds_mitigation == RFDS_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_RFDS)) {
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+ rfds_select_mitigation();
+ }
out:
if (boot_cpu_has_bug(X86_BUG_MDS))
pr_info("MDS: %s\n", mds_strings[mds_mitigation]);
@@ -515,6 +575,8 @@ out:
pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]);
else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN))
pr_info("MMIO Stale Data: Unknown: No mitigations\n");
+ if (boot_cpu_has_bug(X86_BUG_RFDS))
+ pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]);
}
static void __init md_clear_select_mitigation(void)
@@ -522,11 +584,12 @@ static void __init md_clear_select_mitigation(void)
mds_select_mitigation();
taa_select_mitigation();
mmio_select_mitigation();
+ rfds_select_mitigation();
/*
- * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update
- * and print their mitigation after MDS, TAA and MMIO Stale Data
- * mitigation selection is done.
+ * As these mitigations are inter-related and rely on VERW instruction
+ * to clear the microarchitural buffers, update and print their status
+ * after mitigation selection is done for each of these vulnerabilities.
*/
md_clear_update_mitigation();
}
@@ -593,8 +656,6 @@ void update_srbds_msr(void)
static void __init srbds_select_mitigation(void)
{
- u64 ia32_cap;
-
if (!boot_cpu_has_bug(X86_BUG_SRBDS))
return;
@@ -603,8 +664,7 @@ static void __init srbds_select_mitigation(void)
* are only exposed to SRBDS when TSX is enabled or when CPU is affected
* by Processor MMIO Stale Data vulnerability.
*/
- ia32_cap = x86_read_arch_cap_msr();
- if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
+ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
@@ -717,7 +777,7 @@ void update_gds_msr(void)
case GDS_MITIGATION_UCODE_NEEDED:
case GDS_MITIGATION_HYPERVISOR:
return;
- };
+ }
wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
@@ -747,7 +807,7 @@ static void __init gds_select_mitigation(void)
/* Will verify below that mitigation _can_ be disabled */
/* No microcode */
- if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) {
+ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) {
if (gds_mitigation == GDS_MITIGATION_FORCE) {
/*
* This only needs to be done on the boot CPU so do it
@@ -1019,7 +1079,6 @@ static void __init retbleed_select_mitigation(void)
do_cmd_auto:
case RETBLEED_CMD_AUTO:
- default:
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY))
@@ -1042,8 +1101,7 @@ do_cmd_auto:
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
setup_force_cpu_cap(X86_FEATURE_UNRET);
- if (IS_ENABLED(CONFIG_RETHUNK))
- x86_return_thunk = retbleed_return_thunk;
+ x86_return_thunk = retbleed_return_thunk;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
@@ -1061,7 +1119,8 @@ do_cmd_auto:
case RETBLEED_MITIGATION_STUFF:
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
- x86_set_skl_return_thunk();
+
+ x86_return_thunk = call_depth_return_thunk;
break;
default:
@@ -1290,6 +1349,8 @@ spectre_v2_user_select_mitigation(void)
spectre_v2_user_ibpb = mode;
switch (cmd) {
+ case SPECTRE_V2_USER_CMD_NONE:
+ break;
case SPECTRE_V2_USER_CMD_FORCE:
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
@@ -1301,8 +1362,6 @@ spectre_v2_user_select_mitigation(void)
case SPECTRE_V2_USER_CMD_SECCOMP:
static_branch_enable(&switch_mm_cond_ibpb);
break;
- default:
- break;
}
pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
@@ -1478,20 +1537,25 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
return SPECTRE_V2_RETPOLINE;
}
+static bool __ro_after_init rrsba_disabled;
+
/* Disable in-kernel use of non-RSB RET predictors */
static void __init spec_ctrl_disable_kernel_rrsba(void)
{
- u64 ia32_cap;
+ if (rrsba_disabled)
+ return;
- if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
+ if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) {
+ rrsba_disabled = true;
return;
+ }
- ia32_cap = x86_read_arch_cap_msr();
+ if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
+ return;
- if (ia32_cap & ARCH_CAP_RRSBA) {
- x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
- update_spec_ctrl(x86_spec_ctrl_base);
- }
+ x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ rrsba_disabled = true;
}
static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode)
@@ -1541,6 +1605,74 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_
dump_stack();
}
+/*
+ * Set BHI_DIS_S to prevent indirect branches in kernel to be influenced by
+ * branch history in userspace. Not needed if BHI_NO is set.
+ */
+static bool __init spec_ctrl_bhi_dis(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW);
+
+ return true;
+}
+
+enum bhi_mitigations {
+ BHI_MITIGATION_OFF,
+ BHI_MITIGATION_ON,
+};
+
+static enum bhi_mitigations bhi_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF;
+
+static int __init spectre_bhi_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ bhi_mitigation = BHI_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ bhi_mitigation = BHI_MITIGATION_ON;
+ else
+ pr_err("Ignoring unknown spectre_bhi option (%s)", str);
+
+ return 0;
+}
+early_param("spectre_bhi", spectre_bhi_parse_cmdline);
+
+static void __init bhi_select_mitigation(void)
+{
+ if (bhi_mitigation == BHI_MITIGATION_OFF)
+ return;
+
+ /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */
+ if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) {
+ spec_ctrl_disable_kernel_rrsba();
+ if (rrsba_disabled)
+ return;
+ }
+
+ if (spec_ctrl_bhi_dis())
+ return;
+
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Mitigate KVM by default */
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
+ pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n");
+
+ /* Mitigate syscalls when the mitigation is forced =on */
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
+ pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n");
+}
+
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -1652,6 +1784,9 @@ static void __init spectre_v2_select_mitigation(void)
mode == SPECTRE_V2_RETPOLINE)
spec_ctrl_disable_kernel_rrsba();
+ if (boot_cpu_has(X86_BUG_BHI))
+ bhi_select_mitigation();
+
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
@@ -1766,8 +1901,6 @@ static void update_indir_branch_cond(void)
/* Update the static key controlling the MDS CPU buffer clear in idle */
static void update_mds_branch_idle(void)
{
- u64 ia32_cap = x86_read_arch_cap_msr();
-
/*
* Enable the idle clearing if SMT is active on CPUs which are
* affected only by MSBDS and not any other MDS variant.
@@ -1782,7 +1915,7 @@ static void update_mds_branch_idle(void)
if (sched_smt_active()) {
static_branch_enable(&mds_idle_clear);
} else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
- (ia32_cap & ARCH_CAP_FBSDP_NO)) {
+ (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) {
static_branch_disable(&mds_idle_clear);
}
}
@@ -2160,6 +2293,10 @@ static int l1d_flush_prctl_get(struct task_struct *task)
static int ssb_prctl_get(struct task_struct *task)
{
switch (ssb_mode) {
+ case SPEC_STORE_BYPASS_NONE:
+ if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ return PR_SPEC_ENABLE;
+ return PR_SPEC_NOT_AFFECTED;
case SPEC_STORE_BYPASS_DISABLE:
return PR_SPEC_DISABLE;
case SPEC_STORE_BYPASS_SECCOMP:
@@ -2171,11 +2308,8 @@ static int ssb_prctl_get(struct task_struct *task)
if (task_spec_ssb_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
- default:
- if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
- return PR_SPEC_ENABLE;
- return PR_SPEC_NOT_AFFECTED;
}
+ BUG();
}
static int ib_prctl_get(struct task_struct *task)
@@ -2410,13 +2544,21 @@ static void __init srso_select_mitigation(void)
{
bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
- if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
- goto pred_cmd;
+ if (cpu_mitigations_off())
+ return;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRSO)) {
+ if (boot_cpu_has(X86_FEATURE_SBPB))
+ x86_pred_cmd = PRED_CMD_SBPB;
+ return;
+ }
if (has_microcode) {
/*
* Zen1/2 with SMT off aren't vulnerable after the right
* IBPB microcode has been applied.
+ *
+ * Zen1/2 don't have SBPB, no need to try to enable it here.
*/
if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
@@ -2437,7 +2579,9 @@ static void __init srso_select_mitigation(void)
switch (srso_cmd) {
case SRSO_CMD_OFF:
- goto pred_cmd;
+ if (boot_cpu_has(X86_FEATURE_SBPB))
+ x86_pred_cmd = PRED_CMD_SBPB;
+ return;
case SRSO_CMD_MICROCODE:
if (has_microcode) {
@@ -2468,7 +2612,6 @@ static void __init srso_select_mitigation(void)
srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
} else {
pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
- goto pred_cmd;
}
break;
@@ -2480,7 +2623,6 @@ static void __init srso_select_mitigation(void)
}
} else {
pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
- goto pred_cmd;
}
break;
@@ -2492,21 +2634,12 @@ static void __init srso_select_mitigation(void)
}
} else {
pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
- goto pred_cmd;
}
break;
-
- default:
- break;
}
out:
pr_info("%s\n", srso_strings[srso_mitigation]);
-
-pred_cmd:
- if ((!boot_cpu_has_bug(X86_BUG_SRSO) || srso_cmd == SRSO_CMD_OFF) &&
- boot_cpu_has(X86_FEATURE_SBPB))
- x86_pred_cmd = PRED_CMD_SBPB;
}
#undef pr_fmt
@@ -2616,6 +2749,11 @@ static ssize_t mmio_stale_data_show_state(char *buf)
sched_smt_active() ? "vulnerable" : "disabled");
}
+static ssize_t rfds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]);
+}
+
static char *stibp_state(void)
{
if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
@@ -2624,15 +2762,15 @@ static char *stibp_state(void)
switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
- return ", STIBP: disabled";
+ return "; STIBP: disabled";
case SPECTRE_V2_USER_STRICT:
- return ", STIBP: forced";
+ return "; STIBP: forced";
case SPECTRE_V2_USER_STRICT_PREFERRED:
- return ", STIBP: always-on";
+ return "; STIBP: always-on";
case SPECTRE_V2_USER_PRCTL:
case SPECTRE_V2_USER_SECCOMP:
if (static_key_enabled(&switch_to_cond_stibp))
- return ", STIBP: conditional";
+ return "; STIBP: conditional";
}
return "";
}
@@ -2641,10 +2779,10 @@ static char *ibpb_state(void)
{
if (boot_cpu_has(X86_FEATURE_IBPB)) {
if (static_key_enabled(&switch_mm_always_ibpb))
- return ", IBPB: always-on";
+ return "; IBPB: always-on";
if (static_key_enabled(&switch_mm_cond_ibpb))
- return ", IBPB: conditional";
- return ", IBPB: disabled";
+ return "; IBPB: conditional";
+ return "; IBPB: disabled";
}
return "";
}
@@ -2654,14 +2792,32 @@ static char *pbrsb_eibrs_state(void)
if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) ||
boot_cpu_has(X86_FEATURE_RSB_VMEXIT))
- return ", PBRSB-eIBRS: SW sequence";
+ return "; PBRSB-eIBRS: SW sequence";
else
- return ", PBRSB-eIBRS: Vulnerable";
+ return "; PBRSB-eIBRS: Vulnerable";
} else {
- return ", PBRSB-eIBRS: Not affected";
+ return "; PBRSB-eIBRS: Not affected";
}
}
+static const char *spectre_bhi_state(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_BHI))
+ return "; BHI: Not affected";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW))
+ return "; BHI: BHI_DIS_S";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP))
+ return "; BHI: SW loop, KVM: SW loop";
+ else if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) &&
+ rrsba_disabled)
+ return "; BHI: Retpoline";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT))
+ return "; BHI: Vulnerable, KVM: SW loop";
+
+ return "; BHI: Vulnerable";
+}
+
static ssize_t spectre_v2_show_state(char *buf)
{
if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
@@ -2674,13 +2830,15 @@ static ssize_t spectre_v2_show_state(char *buf)
spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
- return sysfs_emit(buf, "%s%s%s%s%s%s%s\n",
+ return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n",
spectre_v2_strings[spectre_v2_enabled],
ibpb_state(),
- boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "",
stibp_state(),
- boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "",
pbrsb_eibrs_state(),
+ spectre_bhi_state(),
+ /* this should always be at the end */
spectre_v2_module_string());
}
@@ -2775,6 +2933,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_GDS:
return gds_show_state(buf);
+ case X86_BUG_RFDS:
+ return rfds_show_state(buf);
+
default:
break;
}
@@ -2849,4 +3010,9 @@ ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *bu
{
return cpu_show_common(dev, attr, buf, X86_BUG_GDS);
}
+
+ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_RFDS);
+}
#endif
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 8f86eacf6..c131c412d 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -661,7 +661,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
return i;
}
-void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c)
{
/*
* We may have multiple LLCs if L3 caches exist, so check if we
@@ -672,13 +672,13 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
if (c->x86 < 0x17) {
/* LLC is at the node level. */
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
+ c->topo.llc_id = c->topo.die_id;
} else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
/*
* LLC is at the core complex level.
* Core complex ID is ApicId[3] for these processors.
*/
- per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+ c->topo.llc_id = c->topo.apicid >> 3;
} else {
/*
* LLC ID is calculated from the number of threads sharing the
@@ -694,12 +694,12 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
if (num_sharing_cache) {
int bits = get_count_order(num_sharing_cache);
- per_cpu(cpu_llc_id, cpu) = c->apicid >> bits;
+ c->topo.llc_id = c->topo.apicid >> bits;
}
}
}
-void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu)
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c)
{
/*
* We may have multiple LLCs if L3 caches exist, so check if we
@@ -712,7 +712,7 @@ void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu)
* LLC is at the core complex level.
* Core complex ID is ApicId[3] for these processors.
*/
- per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+ c->topo.llc_id = c->topo.apicid >> 3;
}
void init_amd_cacheinfo(struct cpuinfo_x86 *c)
@@ -740,9 +740,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_SMP
- unsigned int cpu = c->cpu_index;
-#endif
if (c->cpuid_level > 3) {
static int is_initialized;
@@ -776,13 +773,13 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
new_l2 = this_leaf.size/1024;
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
- l2_id = c->apicid & ~((1 << index_msb) - 1);
+ l2_id = c->topo.apicid & ~((1 << index_msb) - 1);
break;
case 3:
new_l3 = this_leaf.size/1024;
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
- l3_id = c->apicid & ~((1 << index_msb) - 1);
+ l3_id = c->topo.apicid & ~((1 << index_msb) - 1);
break;
default:
break;
@@ -856,30 +853,24 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
if (new_l2) {
l2 = new_l2;
-#ifdef CONFIG_SMP
- per_cpu(cpu_llc_id, cpu) = l2_id;
- per_cpu(cpu_l2c_id, cpu) = l2_id;
-#endif
+ c->topo.llc_id = l2_id;
+ c->topo.l2c_id = l2_id;
}
if (new_l3) {
l3 = new_l3;
-#ifdef CONFIG_SMP
- per_cpu(cpu_llc_id, cpu) = l3_id;
-#endif
+ c->topo.llc_id = l3_id;
}
-#ifdef CONFIG_SMP
/*
- * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
+ * If llc_id is not yet set, this means cpuid_level < 4 which in
* turns means that the only possibility is SMT (as indicated in
* cpuid1). Since cpuid2 doesn't specify shared caches, and we know
* that SMT shares all caches, we can unconditionally set cpu_llc_id to
- * c->phys_proc_id.
+ * c->topo.pkg_id.
*/
- if (per_cpu(cpu_llc_id, cpu) == BAD_APICID)
- per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
-#endif
+ if (c->topo.llc_id == BAD_APICID)
+ c->topo.llc_id = c->topo.pkg_id;
c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
@@ -915,7 +906,7 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
unsigned int apicid, nshared, first, last;
nshared = base->eax.split.num_threads_sharing + 1;
- apicid = cpu_data(cpu).apicid;
+ apicid = cpu_data(cpu).topo.apicid;
first = apicid - (apicid % nshared);
last = first + nshared - 1;
@@ -924,14 +915,14 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
if (!this_cpu_ci->info_list)
continue;
- apicid = cpu_data(i).apicid;
+ apicid = cpu_data(i).topo.apicid;
if ((apicid < first) || (apicid > last))
continue;
this_leaf = this_cpu_ci->info_list + index;
for_each_online_cpu(sibling) {
- apicid = cpu_data(sibling).apicid;
+ apicid = cpu_data(sibling).topo.apicid;
if ((apicid < first) || (apicid > last))
continue;
cpumask_set_cpu(sibling,
@@ -969,7 +960,7 @@ static void __cache_cpumap_setup(unsigned int cpu, int index,
index_msb = get_count_order(num_threads_sharing);
for_each_online_cpu(i)
- if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) {
+ if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) {
struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
if (i == cpu || !sib_cpu_ci->info_list)
@@ -1024,7 +1015,7 @@ static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
- id4_regs->id = c->apicid >> index_msb;
+ id4_regs->id = c->topo.apicid >> index_msb;
}
int populate_cache_leaves(unsigned int cpu)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4e5ffc8b0..46603c6e4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -62,9 +62,11 @@
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#include <asm/uv/uv.h>
+#include <asm/ia32.h>
#include <asm/set_memory.h>
#include <asm/traps.h>
#include <asm/sev.h>
+#include <asm/tdx.h>
#include "cpu.h"
@@ -74,18 +76,6 @@ u32 elf_hwcap2 __read_mostly;
int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
-/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
-
-u16 get_llc_id(unsigned int cpu)
-{
- return per_cpu(cpu_llc_id, cpu);
-}
-EXPORT_SYMBOL_GPL(get_llc_id);
-
-/* L2 cache ID of each logical CPU */
-DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID;
-
static struct ppin_info {
int feature;
int msr_ppin_ctl;
@@ -199,45 +189,37 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
* TLS descriptors are currently at a different place compared to i386.
* Hopefully nobody expects them at a fixed place (Wine?)
*/
- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA64 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER, 0, 0xfffff),
#else
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA32 | DESC_USER, 0, 0xfffff),
/*
* Segments used for calling PnP BIOS have byte granularity.
* They code segments and data segments have fixed 64k limits,
* the transfer segment sizes are set at run time.
*/
- /* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0),
+ [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
+ [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
- /* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
-
- [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+ [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(DESC_DATA32_BIOS, 0, 0xffff),
+
+ [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
#endif
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
@@ -914,7 +896,7 @@ void detect_ht(struct cpuinfo_x86 *c)
return;
index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
+ c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb);
smp_num_siblings = smp_num_siblings / c->x86_max_cores;
@@ -922,8 +904,8 @@ void detect_ht(struct cpuinfo_x86 *c)
core_bits = get_count_order(c->x86_max_cores);
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
+ c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb) &
+ ((1 << core_bits) - 1);
#endif
}
@@ -1114,18 +1096,34 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
void get_cpu_address_sizes(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
+ bool vp_bits_from_cpuid = true;
- if (c->extended_cpuid_level >= 0x80000008) {
+ if (!cpu_has(c, X86_FEATURE_CPUID) ||
+ (c->extended_cpuid_level < 0x80000008))
+ vp_bits_from_cpuid = false;
+
+ if (vp_bits_from_cpuid) {
cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
+ } else {
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
+ } else {
+ c->x86_clflush_size = 32;
+ c->x86_virt_bits = 32;
+ c->x86_phys_bits = 32;
+
+ if (cpu_has(c, X86_FEATURE_PAE) ||
+ cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
+ }
}
-#ifdef CONFIG_X86_32
- else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
- c->x86_phys_bits = 36;
-#endif
c->x86_cache_bits = c->x86_phys_bits;
+ c->x86_cache_alignment = c->x86_clflush_size;
}
static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -1165,6 +1163,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#define NO_SPECTRE_V2 BIT(8)
#define NO_MMIO BIT(9)
#define NO_EIBRS_PBRSB BIT(10)
+#define NO_BHI BIT(11)
#define VULNWL(vendor, family, model, whitelist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist)
@@ -1227,18 +1226,18 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
/* AMD Family 0xf - 0x12 */
- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
- VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
+ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
/* Zhaoxin Family 7 */
- VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO),
- VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO),
+ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
+ VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
{}
};
@@ -1269,6 +1268,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define SRSO BIT(5)
/* CPU is affected by GDS */
#define GDS BIT(6)
+/* CPU is affected by Register File Data Sampling */
+#define RFDS BIT(7)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
@@ -1296,9 +1297,18 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS),
VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
@@ -1317,28 +1327,46 @@ static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long whi
u64 x86_read_arch_cap_msr(void)
{
- u64 ia32_cap = 0;
+ u64 x86_arch_cap_msr = 0;
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr);
+
+ return x86_arch_cap_msr;
+}
- return ia32_cap;
+static bool arch_cap_mmio_immune(u64 x86_arch_cap_msr)
+{
+ return (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_PSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_SBDR_SSDP_NO);
}
-static bool arch_cap_mmio_immune(u64 ia32_cap)
+static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr)
{
- return (ia32_cap & ARCH_CAP_FBSDP_NO &&
- ia32_cap & ARCH_CAP_PSDP_NO &&
- ia32_cap & ARCH_CAP_SBDR_SSDP_NO);
+ /* The "immunity" bit trumps everything else: */
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_NO)
+ return false;
+
+ /*
+ * VMMs set ARCH_CAP_RFDS_CLEAR for processors not in the blacklist to
+ * indicate that mitigation is needed because guest is running on a
+ * vulnerable hardware or may migrate to such hardware:
+ */
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
+ return true;
+
+ /* Only consult the blacklist when there is no enumeration: */
+ return cpu_matches(cpu_vuln_blacklist, RFDS);
}
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
- u64 ia32_cap = x86_read_arch_cap_msr();
+ u64 x86_arch_cap_msr = x86_read_arch_cap_msr();
/* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) &&
- !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+ !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO))
setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION))
@@ -1350,7 +1378,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
- !(ia32_cap & ARCH_CAP_SSB_NO) &&
+ !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
@@ -1358,15 +1386,15 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature
* flag and protect from vendor-specific bugs via the whitelist.
*/
- if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) {
+ if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) {
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
- !(ia32_cap & ARCH_CAP_PBRSB_NO))
+ !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO))
setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB);
}
if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
- !(ia32_cap & ARCH_CAP_MDS_NO)) {
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)) {
setup_force_cpu_bug(X86_BUG_MDS);
if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY))
setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
@@ -1385,9 +1413,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* TSX_CTRL check alone is not sufficient for cases when the microcode
* update is not present or running as guest that don't get TSX_CTRL.
*/
- if (!(ia32_cap & ARCH_CAP_TAA_NO) &&
+ if (!(x86_arch_cap_msr & ARCH_CAP_TAA_NO) &&
(cpu_has(c, X86_FEATURE_RTM) ||
- (ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
+ (x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)))
setup_force_cpu_bug(X86_BUG_TAA);
/*
@@ -1413,7 +1441,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist,
* nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits.
*/
- if (!arch_cap_mmio_immune(ia32_cap)) {
+ if (!arch_cap_mmio_immune(x86_arch_cap_msr)) {
if (cpu_matches(cpu_vuln_blacklist, MMIO))
setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO))
@@ -1421,7 +1449,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
}
if (!cpu_has(c, X86_FEATURE_BTC_NO)) {
- if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA))
+ if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (x86_arch_cap_msr & ARCH_CAP_RSBA))
setup_force_cpu_bug(X86_BUG_RETBLEED);
}
@@ -1439,15 +1467,25 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* disabling AVX2. The only way to do this in HW is to clear XCR0[2],
* which means that AVX will be disabled.
*/
- if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) &&
+ if (cpu_matches(cpu_vuln_blacklist, GDS) && !(x86_arch_cap_msr & ARCH_CAP_GDS_NO) &&
boot_cpu_has(X86_FEATURE_AVX))
setup_force_cpu_bug(X86_BUG_GDS);
+ if (vulnerable_to_rfds(x86_arch_cap_msr))
+ setup_force_cpu_bug(X86_BUG_RFDS);
+
+ /* When virtualized, eIBRS could be hidden, assume vulnerable */
+ if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) &&
+ !cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
+ (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) ||
+ boot_cpu_has(X86_FEATURE_HYPERVISOR)))
+ setup_force_cpu_bug(X86_BUG_BHI);
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
/* Rogue Data Cache Load? No! */
- if (ia32_cap & ARCH_CAP_RDCL_NO)
+ if (x86_arch_cap_msr & ARCH_CAP_RDCL_NO)
return;
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
@@ -1579,17 +1617,6 @@ static void __init cpu_parse_early_param(void)
*/
static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_64
- c->x86_clflush_size = 64;
- c->x86_phys_bits = 36;
- c->x86_virt_bits = 48;
-#else
- c->x86_clflush_size = 32;
- c->x86_phys_bits = 32;
- c->x86_virt_bits = 32;
-#endif
- c->x86_cache_alignment = c->x86_clflush_size;
-
memset(&c->x86_capability, 0, sizeof(c->x86_capability));
c->extended_cpuid_level = 0;
@@ -1601,8 +1628,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
cpu_detect(c);
get_cpu_vendor(c);
get_cpu_cap(c);
- get_cpu_address_sizes(c);
setup_force_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
cpu_parse_early_param();
if (this_cpu->c_early_init)
@@ -1615,6 +1642,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
this_cpu->c_bsp_init(c);
} else {
setup_clear_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
}
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
@@ -1761,15 +1789,15 @@ static void generic_identify(struct cpuinfo_x86 *c)
get_cpu_address_sizes(c);
if (c->cpuid_level >= 0x00000001) {
- c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
+ c->topo.initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
#ifdef CONFIG_X86_32
# ifdef CONFIG_SMP
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+ c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
# else
- c->apicid = c->initial_apicid;
+ c->topo.apicid = c->topo.initial_apicid;
# endif
#endif
- c->phys_proc_id = c->initial_apicid;
+ c->topo.pkg_id = c->topo.initial_apicid;
}
get_model_name(c); /* Default name */
@@ -1799,18 +1827,19 @@ static void generic_identify(struct cpuinfo_x86 *c)
static void validate_apic_and_package_id(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
- unsigned int apicid, cpu = smp_processor_id();
+ unsigned int cpu = smp_processor_id();
+ u32 apicid;
apicid = apic->cpu_present_to_apicid(cpu);
- if (apicid != c->apicid) {
+ if (apicid != c->topo.apicid) {
pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n",
- cpu, apicid, c->initial_apicid);
+ cpu, apicid, c->topo.initial_apicid);
}
- BUG_ON(topology_update_package_map(c->phys_proc_id, cpu));
- BUG_ON(topology_update_die_map(c->cpu_die_id, cpu));
+ BUG_ON(topology_update_package_map(c->topo.pkg_id, cpu));
+ BUG_ON(topology_update_die_map(c->topo.die_id, cpu));
#else
- c->logical_proc_id = 0;
+ c->topo.logical_pkg_id = 0;
#endif
}
@@ -1829,7 +1858,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
c->x86_model_id[0] = '\0'; /* Unset */
c->x86_max_cores = 1;
c->x86_coreid_bits = 0;
- c->cu_id = 0xff;
+ c->topo.cu_id = 0xff;
+ c->topo.llc_id = BAD_APICID;
+ c->topo.l2c_id = BAD_APICID;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -1855,9 +1886,16 @@ static void identify_cpu(struct cpuinfo_x86 *c)
apply_forced_caps(c);
#ifdef CONFIG_X86_64
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+ c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
#endif
+
+ /*
+ * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and
+ * Hygon will clear it in ->c_init() below.
+ */
+ set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
/*
* Vendor-specific initialization. In this section we
* canonicalize the feature flags, meaning if there are
@@ -1989,6 +2027,7 @@ static __init void identify_boot_cpu(void)
setup_cr_pinning();
tsx_init();
+ tdx_init();
lkgs_init();
}
@@ -2074,24 +2113,24 @@ void syscall_init(void)
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
-#ifdef CONFIG_IA32_EMULATION
- wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
- /*
- * This only works on Intel CPUs.
- * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
- * This does not cause SYSENTER to jump to the wrong location, because
- * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
- */
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
- (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
-#else
- wrmsrl_cstar((unsigned long)ignore_sysret);
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
-#endif
+ if (ia32_enabled()) {
+ wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
+ /*
+ * This only works on Intel CPUs.
+ * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong location, because
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+ } else {
+ wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore);
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+ }
/*
* Flags to clear on syscall; clear as much as possible
@@ -2166,8 +2205,6 @@ static inline void setup_getcpu(int cpu)
}
#ifdef CONFIG_X86_64
-static inline void ucode_cpu_init(int cpu) { }
-
static inline void tss_setup_ist(struct tss_struct *tss)
{
/* Set up the per-CPU TSS IST stacks */
@@ -2178,16 +2215,8 @@ static inline void tss_setup_ist(struct tss_struct *tss)
/* Only mapped when SEV-ES is active */
tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);
}
-
#else /* CONFIG_X86_64 */
-
-static inline void ucode_cpu_init(int cpu)
-{
- show_ucode_info_early();
-}
-
static inline void tss_setup_ist(struct tss_struct *tss) { }
-
#endif /* !CONFIG_X86_64 */
static inline void tss_setup_io_bitmap(struct tss_struct *tss)
@@ -2243,8 +2272,6 @@ void cpu_init(void)
struct task_struct *cur = current;
int cpu = raw_smp_processor_id();
- ucode_cpu_init(cpu);
-
#ifdef CONFIG_NUMA
if (this_cpu_read(numa_node) == 0 &&
early_cpu_to_node(cpu) != NUMA_NO_NODE)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 1dcd7d4e3..885281ae7 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -78,6 +78,9 @@ extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c);
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c);
+
unsigned int aperfmperf_get_khz(int cpu);
void cpu_select_mitigations(void);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index e462c1d38..6fb6d8a57 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -44,7 +44,10 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_F16C, X86_FEATURE_XMM2, },
{ X86_FEATURE_AES, X86_FEATURE_XMM2 },
{ X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_GFNI, X86_FEATURE_XMM2 },
{ X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX },
{ X86_FEATURE_AVX2, X86_FEATURE_AVX, },
{ X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
{ X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
@@ -56,9 +59,6 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
- { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
- { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
- { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c
new file mode 100644
index 000000000..0c179d684
--- /dev/null
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/debugfs.h>
+
+#include <asm/apic.h>
+#include <asm/processor.h>
+
+static int cpu_debug_show(struct seq_file *m, void *p)
+{
+ unsigned long cpu = (unsigned long)m->private;
+ struct cpuinfo_x86 *c = per_cpu_ptr(&cpu_info, cpu);
+
+ seq_printf(m, "online: %d\n", cpu_online(cpu));
+ if (!c->initialized)
+ return 0;
+
+ seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid);
+ seq_printf(m, "apicid: %x\n", c->topo.apicid);
+ seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id);
+ seq_printf(m, "die_id: %u\n", c->topo.die_id);
+ seq_printf(m, "cu_id: %u\n", c->topo.cu_id);
+ seq_printf(m, "core_id: %u\n", c->topo.core_id);
+ seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
+ seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
+ seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
+ seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
+ seq_printf(m, "max_cores: %u\n", c->x86_max_cores);
+ seq_printf(m, "max_die_per_pkg: %u\n", __max_die_per_package);
+ seq_printf(m, "smp_num_siblings: %u\n", smp_num_siblings);
+ return 0;
+}
+
+static int cpu_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cpu_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_cpu_ops = {
+ .open = cpu_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static __init int cpu_init_debugfs(void)
+{
+ struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir);
+ unsigned long id;
+ char name[24];
+
+ dir = debugfs_create_dir("cpus", base);
+ for_each_possible_cpu(id) {
+ sprintf(name, "%lu", id);
+ debugfs_create_file(name, 0444, dir, (void *)id, &dfs_cpu_ops);
+ }
+ return 0;
+}
+late_initcall(cpu_init_debugfs);
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index a7b3ef4c4..f0cd95502 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -63,8 +63,6 @@ static void hygon_get_topology_early(struct cpuinfo_x86 *c)
*/
static void hygon_get_topology(struct cpuinfo_x86 *c)
{
- int cpu = smp_processor_id();
-
/* get information required for multi-node processors */
if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
int err;
@@ -72,9 +70,9 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- c->cpu_die_id = ecx & 0xff;
+ c->topo.die_id = ecx & 0xff;
- c->cpu_core_id = ebx & 0xff;
+ c->topo.core_id = ebx & 0xff;
if (smp_num_siblings > 1)
c->x86_max_cores /= smp_num_siblings;
@@ -92,16 +90,15 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
* when running on host.
*/
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3)
- c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT;
+ c->topo.pkg_id = c->topo.apicid >> APICID_SOCKET_ID_BIT;
- cacheinfo_hygon_init_llc_id(c, cpu);
+ cacheinfo_hygon_init_llc_id(c);
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- c->cpu_die_id = value & 7;
-
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
+ c->topo.die_id = value & 7;
+ c->topo.llc_id = c->topo.die_id;
} else
return;
@@ -116,15 +113,14 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
static void hygon_detect_cmp(struct cpuinfo_x86 *c)
{
unsigned int bits;
- int cpu = smp_processor_id();
bits = c->x86_coreid_bits;
/* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+ c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1);
/* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
- /* use socket ID also for last level cache */
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
+ c->topo.pkg_id = c->topo.initial_apicid >> bits;
+ /* Use package ID also for last level cache */
+ c->topo.llc_id = c->topo.die_id = c->topo.pkg_id;
}
static void srat_detect_node(struct cpuinfo_x86 *c)
@@ -132,11 +128,11 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#ifdef CONFIG_NUMA
int cpu = smp_processor_id();
int node;
- unsigned int apicid = c->apicid;
+ unsigned int apicid = c->topo.apicid;
node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
- node = per_cpu(cpu_llc_id, cpu);
+ node = c->topo.llc_id;
/*
* On multi-fabric platform (e.g. Numascale NumaChip) a
@@ -165,7 +161,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
* through CPU mapping may alter the outcome, directly
* access __apicid_to_node[].
*/
- int ht_nodeid = c->initial_apicid;
+ int ht_nodeid = c->topo.initial_apicid;
if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
@@ -294,6 +290,8 @@ static void early_init_hygon(struct cpuinfo_x86 *c)
static void init_hygon(struct cpuinfo_x86 *c)
{
+ u64 vm_cr;
+
early_init_hygon(c);
/*
@@ -305,7 +303,7 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/* get apicid instead of initial apic id from cpuid */
- c->apicid = read_apic_id();
+ c->topo.apicid = read_apic_id();
/*
* XXX someone from Hygon needs to confirm this DTRT
@@ -324,6 +322,14 @@ static void init_hygon(struct cpuinfo_x86 *c)
init_hygon_cacheinfo(c);
+ if (cpu_has(c, X86_FEATURE_SVM)) {
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+ pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+ clear_cpu_cap(c, X86_FEATURE_SVM);
+ }
+ }
+
if (cpu_has(c, X86_FEATURE_XMM2)) {
/*
* Use LFENCE for execution serialization. On families which
@@ -348,6 +354,9 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
check_null_seg_clears_base(c);
+
+ /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index be4045628..40dec9b56 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -184,6 +184,90 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
return false;
}
+#define MSR_IA32_TME_ACTIVATE 0x982
+
+/* Helpers to access TME_ACTIVATE MSR */
+#define TME_ACTIVATE_LOCKED(x) (x & 0x1)
+#define TME_ACTIVATE_ENABLED(x) (x & 0x2)
+
+#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */
+#define TME_ACTIVATE_POLICY_AES_XTS_128 0
+
+#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */
+
+#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */
+#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1
+
+/* Values for mktme_status (SW only construct) */
+#define MKTME_ENABLED 0
+#define MKTME_DISABLED 1
+#define MKTME_UNINITIALIZED 2
+static int mktme_status = MKTME_UNINITIALIZED;
+
+static void detect_tme_early(struct cpuinfo_x86 *c)
+{
+ u64 tme_activate, tme_policy, tme_crypto_algs;
+ int keyid_bits = 0, nr_keyids = 0;
+ static u64 tme_activate_cpu0 = 0;
+
+ rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
+
+ if (mktme_status != MKTME_UNINITIALIZED) {
+ if (tme_activate != tme_activate_cpu0) {
+ /* Broken BIOS? */
+ pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
+ pr_err_once("x86/tme: MKTME is not usable\n");
+ mktme_status = MKTME_DISABLED;
+
+ /* Proceed. We may need to exclude bits from x86_phys_bits. */
+ }
+ } else {
+ tme_activate_cpu0 = tme_activate;
+ }
+
+ if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
+ pr_info_once("x86/tme: not enabled by BIOS\n");
+ mktme_status = MKTME_DISABLED;
+ return;
+ }
+
+ if (mktme_status != MKTME_UNINITIALIZED)
+ goto detect_keyid_bits;
+
+ pr_info("x86/tme: enabled by BIOS\n");
+
+ tme_policy = TME_ACTIVATE_POLICY(tme_activate);
+ if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128)
+ pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy);
+
+ tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate);
+ if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) {
+ pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n",
+ tme_crypto_algs);
+ mktme_status = MKTME_DISABLED;
+ }
+detect_keyid_bits:
+ keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
+ nr_keyids = (1UL << keyid_bits) - 1;
+ if (nr_keyids) {
+ pr_info_once("x86/mktme: enabled by BIOS\n");
+ pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids);
+ } else {
+ pr_info_once("x86/mktme: disabled by BIOS\n");
+ }
+
+ if (mktme_status == MKTME_UNINITIALIZED) {
+ /* MKTME is usable */
+ mktme_status = MKTME_ENABLED;
+ }
+
+ /*
+ * KeyID bits effectively lower the number of physical address
+ * bits. Update cpuinfo_x86::x86_phys_bits accordingly.
+ */
+ c->x86_phys_bits -= keyid_bits;
+}
+
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
@@ -314,19 +398,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_PGE);
}
- if (c->cpuid_level >= 0x00000001) {
- u32 eax, ebx, ecx, edx;
-
- cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
- /*
- * If HTT (EDX[28]) is set EBX[16:23] contain the number of
- * apicids which are reserved per package. Store the resulting
- * shift value for the package management code.
- */
- if (edx & (1U << 28))
- c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
- }
-
check_memory_type_self_snoop_errata(c);
/*
@@ -335,6 +406,13 @@ static void early_init_intel(struct cpuinfo_x86 *c)
*/
if (detect_extended_topology_early(c) < 0)
detect_ht_early(c);
+
+ /*
+ * Adjust the number of physical bits early because it affects the
+ * valid bits of the MTRR mask registers.
+ */
+ if (cpu_has(c, X86_FEATURE_TME))
+ detect_tme_early(c);
}
static void bsp_init_intel(struct cpuinfo_x86 *c)
@@ -495,90 +573,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-#define MSR_IA32_TME_ACTIVATE 0x982
-
-/* Helpers to access TME_ACTIVATE MSR */
-#define TME_ACTIVATE_LOCKED(x) (x & 0x1)
-#define TME_ACTIVATE_ENABLED(x) (x & 0x2)
-
-#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */
-#define TME_ACTIVATE_POLICY_AES_XTS_128 0
-
-#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */
-
-#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */
-#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1
-
-/* Values for mktme_status (SW only construct) */
-#define MKTME_ENABLED 0
-#define MKTME_DISABLED 1
-#define MKTME_UNINITIALIZED 2
-static int mktme_status = MKTME_UNINITIALIZED;
-
-static void detect_tme(struct cpuinfo_x86 *c)
-{
- u64 tme_activate, tme_policy, tme_crypto_algs;
- int keyid_bits = 0, nr_keyids = 0;
- static u64 tme_activate_cpu0 = 0;
-
- rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
-
- if (mktme_status != MKTME_UNINITIALIZED) {
- if (tme_activate != tme_activate_cpu0) {
- /* Broken BIOS? */
- pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
- pr_err_once("x86/tme: MKTME is not usable\n");
- mktme_status = MKTME_DISABLED;
-
- /* Proceed. We may need to exclude bits from x86_phys_bits. */
- }
- } else {
- tme_activate_cpu0 = tme_activate;
- }
-
- if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
- pr_info_once("x86/tme: not enabled by BIOS\n");
- mktme_status = MKTME_DISABLED;
- return;
- }
-
- if (mktme_status != MKTME_UNINITIALIZED)
- goto detect_keyid_bits;
-
- pr_info("x86/tme: enabled by BIOS\n");
-
- tme_policy = TME_ACTIVATE_POLICY(tme_activate);
- if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128)
- pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy);
-
- tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate);
- if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) {
- pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n",
- tme_crypto_algs);
- mktme_status = MKTME_DISABLED;
- }
-detect_keyid_bits:
- keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
- nr_keyids = (1UL << keyid_bits) - 1;
- if (nr_keyids) {
- pr_info_once("x86/mktme: enabled by BIOS\n");
- pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids);
- } else {
- pr_info_once("x86/mktme: disabled by BIOS\n");
- }
-
- if (mktme_status == MKTME_UNINITIALIZED) {
- /* MKTME is usable */
- mktme_status = MKTME_ENABLED;
- }
-
- /*
- * KeyID bits effectively lower the number of physical address
- * bits. Update cpuinfo_x86::x86_phys_bits accordingly.
- */
- c->x86_phys_bits -= keyid_bits;
-}
-
static void init_cpuid_fault(struct cpuinfo_x86 *c)
{
u64 msr;
@@ -715,9 +709,6 @@ static void init_intel(struct cpuinfo_x86 *c)
init_ia32_feat_ctl(c);
- if (cpu_has(c, X86_FEATURE_TME))
- detect_tme(c);
-
init_intel_misc_features(c);
split_lock_init();
@@ -1016,7 +1007,6 @@ static struct ctl_table sld_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- {}
};
static int __init sld_mitigate_sysctl_init(void)
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index e4c3ba913..f18d35fe2 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -237,4 +237,4 @@ err_out_online:
cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE);
return ret;
}
-subsys_initcall(intel_epb_init);
+late_initcall(intel_epb_init);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index c267f43de..2b46eb0fd 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -87,42 +87,40 @@ struct smca_bank {
static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
-struct smca_bank_name {
- const char *name; /* Short name for sysfs */
- const char *long_name; /* Long name for pretty-printing */
-};
-
-static struct smca_bank_name smca_names[] = {
- [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" },
- [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
- [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
- [SMCA_DE] = { "decode_unit", "Decode Unit" },
- [SMCA_RESERVED] = { "reserved", "Reserved" },
- [SMCA_EX] = { "execution_unit", "Execution Unit" },
- [SMCA_FP] = { "floating_point", "Floating Point Unit" },
- [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
- [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
- [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+static const char * const smca_names[] = {
+ [SMCA_LS ... SMCA_LS_V2] = "load_store",
+ [SMCA_IF] = "insn_fetch",
+ [SMCA_L2_CACHE] = "l2_cache",
+ [SMCA_DE] = "decode_unit",
+ [SMCA_RESERVED] = "reserved",
+ [SMCA_EX] = "execution_unit",
+ [SMCA_FP] = "floating_point",
+ [SMCA_L3_CACHE] = "l3_cache",
+ [SMCA_CS ... SMCA_CS_V2] = "coherent_slave",
+ [SMCA_PIE] = "pie",
/* UMC v2 is separate because both of them can exist in a single system. */
- [SMCA_UMC] = { "umc", "Unified Memory Controller" },
- [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" },
- [SMCA_PB] = { "param_block", "Parameter Block" },
- [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
- [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" },
- [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
- [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" },
- [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
- [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" },
- [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" },
- [SMCA_NBIF] = { "nbif", "NBIF Unit" },
- [SMCA_SHUB] = { "shub", "System Hub Unit" },
- [SMCA_SATA] = { "sata", "SATA Unit" },
- [SMCA_USB] = { "usb", "USB Unit" },
- [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" },
- [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" },
- [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" },
- [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" },
+ [SMCA_UMC] = "umc",
+ [SMCA_UMC_V2] = "umc_v2",
+ [SMCA_MA_LLC] = "ma_llc",
+ [SMCA_PB] = "param_block",
+ [SMCA_PSP ... SMCA_PSP_V2] = "psp",
+ [SMCA_SMU ... SMCA_SMU_V2] = "smu",
+ [SMCA_MP5] = "mp5",
+ [SMCA_MPDMA] = "mpdma",
+ [SMCA_NBIO] = "nbio",
+ [SMCA_PCIE ... SMCA_PCIE_V2] = "pcie",
+ [SMCA_XGMI_PCS] = "xgmi_pcs",
+ [SMCA_NBIF] = "nbif",
+ [SMCA_SHUB] = "shub",
+ [SMCA_SATA] = "sata",
+ [SMCA_USB] = "usb",
+ [SMCA_USR_DP] = "usr_dp",
+ [SMCA_USR_CP] = "usr_cp",
+ [SMCA_GMI_PCS] = "gmi_pcs",
+ [SMCA_XGMI_PHY] = "xgmi_phy",
+ [SMCA_WAFL_PHY] = "wafl_phy",
+ [SMCA_GMI_PHY] = "gmi_phy",
};
static const char *smca_get_name(enum smca_bank_types t)
@@ -130,17 +128,8 @@ static const char *smca_get_name(enum smca_bank_types t)
if (t >= N_SMCA_BANK_TYPES)
return NULL;
- return smca_names[t].name;
-}
-
-const char *smca_get_long_name(enum smca_bank_types t)
-{
- if (t >= N_SMCA_BANK_TYPES)
- return NULL;
-
- return smca_names[t].long_name;
+ return smca_names[t];
}
-EXPORT_SYMBOL_GPL(smca_get_long_name);
enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
{
@@ -178,6 +167,7 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
{ SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
{ SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
/* Unified Memory Controller MCA type */
{ SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
@@ -212,6 +202,8 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
{ SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
{ SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
{ SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
@@ -713,17 +705,75 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
deferred_error_interrupt_enable(c);
}
-bool amd_mce_is_memory_error(struct mce *m)
+/*
+ * DRAM ECC errors are reported in the Northbridge (bank 4) with
+ * Extended Error Code 8.
+ */
+static bool legacy_mce_is_memory_error(struct mce *m)
+{
+ return m->bank == 4 && XEC(m->status, 0x1f) == 8;
+}
+
+/*
+ * DRAM ECC errors are reported in Unified Memory Controllers with
+ * Extended Error Code 0.
+ */
+static bool smca_mce_is_memory_error(struct mce *m)
{
enum smca_bank_types bank_type;
- /* ErrCodeExt[20:16] */
- u8 xec = (m->status >> 16) & 0x1f;
+
+ if (XEC(m->status, 0x3f))
+ return false;
bank_type = smca_get_bank_type(m->extcpu, m->bank);
+
+ return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2;
+}
+
+bool amd_mce_is_memory_error(struct mce *m)
+{
if (mce_flags.smca)
- return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0;
+ return smca_mce_is_memory_error(m);
+ else
+ return legacy_mce_is_memory_error(m);
+}
+
+/*
+ * AMD systems do not have an explicit indicator that the value in MCA_ADDR is
+ * a system physical address. Therefore, individual cases need to be detected.
+ * Future cases and checks will be added as needed.
+ *
+ * 1) General case
+ * a) Assume address is not usable.
+ * 2) Poison errors
+ * a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy
+ * northbridge (bank 4).
+ * b) Refers to poison consumption in the core. Does not include "no action",
+ * "action optional", or "deferred" error severities.
+ * c) Will include a usable address so that immediate action can be taken.
+ * 3) Northbridge DRAM ECC errors
+ * a) Reported in legacy bank 4 with extended error code (XEC) 8.
+ * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore,
+ * this bit should not be checked.
+ *
+ * NOTE: SMCA UMC memory errors fall into case #1.
+ */
+bool amd_mce_usable_address(struct mce *m)
+{
+ /* Check special northbridge case 3) first. */
+ if (!mce_flags.smca) {
+ if (legacy_mce_is_memory_error(m))
+ return true;
+ else if (m->bank == 4)
+ return false;
+ }
+
+ /* Check poison bit for all other bank types. */
+ if (m->status & MCI_STATUS_POISON)
+ return true;
- return m->bank == 4 && xec == 0x8;
+ /* Assume address is not usable for all others. */
+ return false;
}
static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 8ed341714..7f7309ff6 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -103,9 +103,9 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
m.socketid = -1;
for_each_possible_cpu(cpu) {
- if (cpu_data(cpu).initial_apicid == lapic_id) {
+ if (cpu_data(cpu).topo.initial_apicid == lapic_id) {
m.extcpu = cpu;
- m.socketid = cpu_data(m.extcpu).phys_proc_id;
+ m.socketid = cpu_data(m.extcpu).topo.pkg_id;
break;
}
}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 6f35f724c..c17f9403c 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -44,6 +44,7 @@
#include <linux/sync_core.h>
#include <linux/task_work.h>
#include <linux/hardirq.h>
+#include <linux/kexec.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
@@ -52,6 +53,7 @@
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>
+#include <asm/tdx.h>
#include "internal.h"
@@ -123,8 +125,8 @@ void mce_setup(struct mce *m)
m->time = __ktime_get_real_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
m->cpuid = cpuid_eax(1);
- m->socketid = cpu_data(m->extcpu).phys_proc_id;
- m->apicid = cpu_data(m->extcpu).initial_apicid;
+ m->socketid = cpu_data(m->extcpu).topo.pkg_id;
+ m->apicid = cpu_data(m->extcpu).topo.initial_apicid;
m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
m->ppin = cpu_data(m->extcpu).ppin;
m->microcode = boot_cpu_data.microcode;
@@ -228,11 +230,20 @@ static void wait_for_panic(void)
panic("Panicing machine check CPU died");
}
+static const char *mce_dump_aux_info(struct mce *m)
+{
+ if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+ return tdx_dump_mce_info(m);
+
+ return NULL;
+}
+
static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
{
struct llist_node *pending;
struct mce_evt_llist *l;
int apei_err = 0;
+ const char *memmsg;
/*
* Allow instrumentation around external facilities usage. Not that it
@@ -283,9 +294,29 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
}
if (exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
+
+ memmsg = mce_dump_aux_info(final);
+ if (memmsg)
+ pr_emerg(HW_ERR "Machine check: %s\n", memmsg);
+
if (!fake_panic) {
if (panic_timeout == 0)
panic_timeout = mca_cfg.panic_timeout;
+
+ /*
+ * Kdump skips the poisoned page in order to avoid
+ * touching the error bits again. Poison the page even
+ * if the error is fatal and the machine is about to
+ * panic.
+ */
+ if (kexec_crash_loaded()) {
+ if (final && (final->status & MCI_STATUS_ADDRV)) {
+ struct page *p;
+ p = pfn_to_online_page(final->addr >> PAGE_SHIFT);
+ if (p)
+ SetPageHWPoison(p);
+ }
+ }
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
@@ -453,32 +484,22 @@ static void mce_irq_work_cb(struct irq_work *entry)
mce_schedule_work();
}
-/*
- * Check if the address reported by the CPU is in a format we can parse.
- * It would be possible to add code for most other cases, but all would
- * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granularity for now.
- */
-int mce_usable_address(struct mce *m)
+bool mce_usable_address(struct mce *m)
{
if (!(m->status & MCI_STATUS_ADDRV))
- return 0;
-
- /* Checks after this one are Intel/Zhaoxin-specific: */
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
- boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
- return 1;
-
- if (!(m->status & MCI_STATUS_MISCV))
- return 0;
+ return false;
- if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
- return 0;
+ switch (m->cpuvendor) {
+ case X86_VENDOR_AMD:
+ return amd_mce_usable_address(m);
- if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
- return 0;
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_ZHAOXIN:
+ return intel_mce_usable_address(m);
- return 1;
+ default:
+ return true;
+ }
}
EXPORT_SYMBOL_GPL(mce_usable_address);
@@ -680,6 +701,16 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
barrier();
m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
+ /*
+ * Update storm tracking here, before checking for the
+ * MCI_STATUS_VAL bit. Valid corrected errors count
+ * towards declaring, or maintaining, storm status. No
+ * error in a bank counts towards avoiding, or ending,
+ * storm status.
+ */
+ if (!mca_cfg.cmci_disabled)
+ mce_track_storm(&m);
+
/* If this entry is not valid, ignore it */
if (!(m.status & MCI_STATUS_VAL))
continue;
@@ -1611,13 +1642,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static unsigned long mce_adjust_timer_default(unsigned long interval)
-{
- return interval;
-}
-
-static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
-
static void __start_timer(struct timer_list *t, unsigned long interval)
{
unsigned long when = jiffies + interval;
@@ -1647,15 +1671,9 @@ static void mce_timer_fn(struct timer_list *t)
iv = __this_cpu_read(mce_next_interval);
- if (mce_available(this_cpu_ptr(&cpu_info))) {
+ if (mce_available(this_cpu_ptr(&cpu_info)))
mc_poll_banks();
- if (mce_intel_cmci_poll()) {
- iv = mce_adjust_timer(iv);
- goto done;
- }
- }
-
/*
* Alert userspace if needed. If we logged an MCE, reduce the polling
* interval, otherwise increase the polling interval.
@@ -1665,23 +1683,29 @@ static void mce_timer_fn(struct timer_list *t)
else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
-done:
- __this_cpu_write(mce_next_interval, iv);
- __start_timer(t, iv);
+ if (mce_get_storm_mode()) {
+ __start_timer(t, HZ);
+ } else {
+ __this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
}
/*
- * Ensure that the timer is firing in @interval from now.
+ * When a storm starts on any bank on this CPU, switch to polling
+ * once per second. When the storm ends, revert to the default
+ * polling interval.
*/
-void mce_timer_kick(unsigned long interval)
+void mce_timer_kick(bool storm)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- unsigned long iv = __this_cpu_read(mce_next_interval);
- __start_timer(t, interval);
+ mce_set_storm_mode(storm);
- if (interval < iv)
- __this_cpu_write(mce_next_interval, interval);
+ if (storm)
+ __start_timer(t, HZ);
+ else
+ __this_cpu_write(mce_next_interval, check_interval * HZ);
}
/* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@ -2005,7 +2029,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
intel_init_cmci();
intel_init_lmce();
- mce_adjust_timer = cmci_intel_adjust_timer;
}
static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
@@ -2018,7 +2041,6 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
- mce_adjust_timer = cmci_intel_adjust_timer;
break;
case X86_VENDOR_AMD: {
@@ -2452,12 +2474,14 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
return -EINVAL;
b = &per_cpu(mce_banks_array, s->id)[bank];
-
if (!b->init)
return -ENODEV;
b->ctl = new;
+
+ mutex_lock(&mce_sysfs_mutex);
mce_restart();
+ mutex_unlock(&mce_sysfs_mutex);
return size;
}
@@ -2578,9 +2602,6 @@ static int mce_device_create(unsigned int cpu)
int err;
int i, j;
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
dev = per_cpu(mce_device, cpu);
if (dev)
return 0;
@@ -2675,8 +2696,6 @@ static void mce_reenable_cpu(void)
static int mce_cpu_dead(unsigned int cpu)
{
- mce_intel_hcpu_update(cpu);
-
/* intentionally ignoring frozen here */
if (!cpuhp_tasks_frozen)
cmci_rediscover();
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index f5323551c..399b62e22 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -42,15 +42,6 @@
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
- * CMCI storm detection backoff counter
- *
- * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
- * encountered an error. If not, we decrement it by one. We signal the end of
- * the CMCI storm when it reaches 0.
- */
-static DEFINE_PER_CPU(int, cmci_backoff_cnt);
-
-/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
@@ -63,22 +54,26 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
*/
static DEFINE_SPINLOCK(cmci_poll_lock);
+/* Linux non-storm CMCI threshold (may be overridden by BIOS) */
#define CMCI_THRESHOLD 1
-#define CMCI_POLL_INTERVAL (30 * HZ)
-#define CMCI_STORM_INTERVAL (HZ)
-#define CMCI_STORM_THRESHOLD 15
-
-static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
-enum {
- CMCI_STORM_NONE,
- CMCI_STORM_ACTIVE,
- CMCI_STORM_SUBSIDED,
-};
+/*
+ * MCi_CTL2 threshold for each bank when there is no storm.
+ * Default value for each bank may have been set by BIOS.
+ */
+static u16 cmci_threshold[MAX_NR_BANKS];
-static atomic_t cmci_storm_on_cpus;
+/*
+ * High threshold to limit CMCI rate during storms. Max supported is
+ * 0x7FFF. Use this slightly smaller value so it has a distinctive
+ * signature when some asks "Why am I not seeing all corrected errors?"
+ * A high threshold is used instead of just disabling CMCI for a
+ * bank because both corrected and uncorrected errors may be logged
+ * in the same bank and signalled with CMCI. The threshold only applies
+ * to corrected errors, so keeping CMCI enabled means that uncorrected
+ * errors will still be processed in a timely fashion.
+ */
+#define CMCI_STORM_THRESHOLD 32749
static int cmci_supported(int *banks)
{
@@ -134,204 +129,166 @@ static bool lmce_supported(void)
return tmp & FEAT_CTL_LMCE_ENABLED;
}
-bool mce_intel_cmci_poll(void)
+/*
+ * Set a new CMCI threshold value. Preserve the state of the
+ * MCI_CTL2_CMCI_EN bit in case this happens during a
+ * cmci_rediscover() operation.
+ */
+static void cmci_set_threshold(int bank, int thresh)
{
- if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
- return false;
-
- /*
- * Reset the counter if we've logged an error in the last poll
- * during the storm.
- */
- if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
- this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
- else
- this_cpu_dec(cmci_backoff_cnt);
+ unsigned long flags;
+ u64 val;
- return true;
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
-void mce_intel_hcpu_update(unsigned long cpu)
+void mce_intel_handle_storm(int bank, bool on)
{
- if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
- atomic_dec(&cmci_storm_on_cpus);
+ if (on)
+ cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
+ else
+ cmci_set_threshold(bank, cmci_threshold[bank]);
+}
- per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
}
-static void cmci_toggle_interrupt_mode(bool on)
+/*
+ * Check all the reasons why current CPU cannot claim
+ * ownership of a bank.
+ * 1: CPU already owns this bank
+ * 2: BIOS owns this bank
+ * 3: Some other CPU owns this bank
+ */
+static bool cmci_skip_bank(int bank, u64 *val)
{
- unsigned long flags, *owned;
- int bank;
- u64 val;
+ unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- owned = this_cpu_ptr(mce_banks_owned);
- for_each_set_bit(bank, owned, MAX_NR_BANKS) {
- rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ if (test_bit(bank, owned))
+ return true;
- if (on)
- val |= MCI_CTL2_CMCI_EN;
- else
- val &= ~MCI_CTL2_CMCI_EN;
+ /* Skip banks in firmware first mode */
+ if (test_bit(bank, mce_banks_ce_disabled))
+ return true;
- wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
- }
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), *val);
-unsigned long cmci_intel_adjust_timer(unsigned long interval)
-{
- if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
- (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
- mce_notify_irq();
- return CMCI_STORM_INTERVAL;
+ /* Already owned by someone else? */
+ if (*val & MCI_CTL2_CMCI_EN) {
+ clear_bit(bank, owned);
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ return true;
}
- switch (__this_cpu_read(cmci_storm_state)) {
- case CMCI_STORM_ACTIVE:
-
- /*
- * We switch back to interrupt mode once the poll timer has
- * silenced itself. That means no events recorded and the timer
- * interval is back to our poll interval.
- */
- __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
- if (!atomic_sub_return(1, &cmci_storm_on_cpus))
- pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+ return false;
+}
- fallthrough;
+/*
+ * Decide which CMCI interrupt threshold to use:
+ * 1: If this bank is in storm mode from whichever CPU was
+ * the previous owner, stay in storm mode.
+ * 2: If ignoring any threshold set by BIOS, set Linux default
+ * 3: Try to honor BIOS threshold (unless buggy BIOS set it at zero).
+ */
+static u64 cmci_pick_threshold(u64 val, int *bios_zero_thresh)
+{
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ return val;
- case CMCI_STORM_SUBSIDED:
+ if (!mca_cfg.bios_cmci_threshold) {
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= CMCI_THRESHOLD;
+ } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
/*
- * We wait for all CPUs to go back to SUBSIDED state. When that
- * happens we switch back to interrupt mode.
+ * If bios_cmci_threshold boot option was specified
+ * but the threshold is zero, we'll try to initialize
+ * it to 1.
*/
- if (!atomic_read(&cmci_storm_on_cpus)) {
- __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
- cmci_toggle_interrupt_mode(true);
- cmci_recheck();
- }
- return CMCI_POLL_INTERVAL;
- default:
-
- /* We have shiny weather. Let the poll do whatever it thinks. */
- return interval;
+ *bios_zero_thresh = 1;
+ val |= CMCI_THRESHOLD;
}
+
+ return val;
}
-static bool cmci_storm_detect(void)
+/*
+ * Try to claim ownership of a bank.
+ */
+static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_wrong_thresh)
{
- unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
- unsigned long ts = __this_cpu_read(cmci_time_stamp);
- unsigned long now = jiffies;
- int r;
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
- if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
- return true;
+ val |= MCI_CTL2_CMCI_EN;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
- if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
- cnt++;
- } else {
- cnt = 1;
- __this_cpu_write(cmci_time_stamp, now);
+ /* If the enable bit did not stick, this bank should be polled. */
+ if (!(val & MCI_CTL2_CMCI_EN)) {
+ WARN_ON(!test_bit(bank, this_cpu_ptr(mce_poll_banks)));
+ storm->banks[bank].poll_only = true;
+ return;
}
- __this_cpu_write(cmci_storm_cnt, cnt);
-
- if (cnt <= CMCI_STORM_THRESHOLD)
- return false;
- cmci_toggle_interrupt_mode(false);
- __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
- r = atomic_add_return(1, &cmci_storm_on_cpus);
- mce_timer_kick(CMCI_STORM_INTERVAL);
- this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+ /* This CPU successfully set the enable bit. */
+ set_bit(bank, (void *)this_cpu_ptr(&mce_banks_owned));
- if (r == 1)
- pr_notice("CMCI storm detected: switching to poll mode\n");
- return true;
-}
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) {
+ pr_notice("CPU%d BANK%d CMCI inherited storm\n", smp_processor_id(), bank);
+ mce_inherit_storm(bank);
+ cmci_storm_begin(bank);
+ } else {
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ }
-/*
- * The interrupt handler. This is called on every event.
- * Just call the poller directly to log any events.
- * This could in theory increase the threshold under high load,
- * but doesn't for now.
- */
-static void intel_threshold_interrupt(void)
-{
- if (cmci_storm_detect())
- return;
+ /*
+ * We are able to set thresholds for some banks that
+ * had a threshold of 0. This means the BIOS has not
+ * set the thresholds properly or does not work with
+ * this boot option. Note down now and report later.
+ */
+ if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
+ (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+ *bios_wrong_thresh = 1;
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+ /* Save default threshold for each bank */
+ if (cmci_threshold[bank] == 0)
+ cmci_threshold[bank] = val & MCI_CTL2_CMCI_THRESHOLD_MASK;
}
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
- * banks.
+ * banks. Called during initial bootstrap, and also for hotplug CPU operations
+ * to rediscover/reassign machine check banks.
*/
static void cmci_discover(int banks)
{
- unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
+ int bios_wrong_thresh = 0;
unsigned long flags;
int i;
- int bios_wrong_thresh = 0;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
int bios_zero_thresh = 0;
- if (test_bit(i, owned))
- continue;
-
- /* Skip banks in firmware first mode */
- if (test_bit(i, mce_banks_ce_disabled))
+ if (cmci_skip_bank(i, &val))
continue;
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Already owned by someone else? */
- if (val & MCI_CTL2_CMCI_EN) {
- clear_bit(i, owned);
- __clear_bit(i, this_cpu_ptr(mce_poll_banks));
- continue;
- }
-
- if (!mca_cfg.bios_cmci_threshold) {
- val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
- val |= CMCI_THRESHOLD;
- } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
- /*
- * If bios_cmci_threshold boot option was specified
- * but the threshold is zero, we'll try to initialize
- * it to 1.
- */
- bios_zero_thresh = 1;
- val |= CMCI_THRESHOLD;
- }
-
- val |= MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(i), val);
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Did the enable bit stick? -- the bank supports CMCI */
- if (val & MCI_CTL2_CMCI_EN) {
- set_bit(i, owned);
- __clear_bit(i, this_cpu_ptr(mce_poll_banks));
- /*
- * We are able to set thresholds for some banks that
- * had a threshold of 0. This means the BIOS has not
- * set the thresholds properly or does not work with
- * this boot option. Note down now and report later.
- */
- if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
- (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
- bios_wrong_thresh = 1;
- } else {
- WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
- }
+ val = cmci_pick_threshold(val, &bios_zero_thresh);
+ cmci_claim_bank(i, val, bios_zero_thresh, &bios_wrong_thresh);
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
@@ -370,6 +327,9 @@ static void __cmci_disable_bank(int bank)
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
__clear_bit(bank, this_cpu_ptr(mce_banks_owned));
+
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ cmci_storm_end(bank);
}
/*
@@ -536,3 +496,23 @@ bool intel_filter_mce(struct mce *m)
return false;
}
+
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granularity for now.
+ */
+bool intel_mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_MISCV))
+ return false;
+
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
+ return false;
+
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
+ return false;
+
+ return true;
+}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index bcf1b3c66..01f8f0396 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -41,26 +41,80 @@ struct dentry *mce_get_debugfs_dir(void);
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
-unsigned long cmci_intel_adjust_timer(unsigned long interval);
-bool mce_intel_cmci_poll(void);
-void mce_intel_hcpu_update(unsigned long cpu);
+void mce_intel_handle_storm(int bank, bool on);
void cmci_disable_bank(int bank);
void intel_init_cmci(void);
void intel_init_lmce(void);
void intel_clear_lmce(void);
bool intel_filter_mce(struct mce *m);
+bool intel_mce_usable_address(struct mce *m);
#else
-# define cmci_intel_adjust_timer mce_adjust_timer_default
-static inline bool mce_intel_cmci_poll(void) { return false; }
-static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+static inline void mce_intel_handle_storm(int bank, bool on) { }
static inline void cmci_disable_bank(int bank) { }
static inline void intel_init_cmci(void) { }
static inline void intel_init_lmce(void) { }
static inline void intel_clear_lmce(void) { }
static inline bool intel_filter_mce(struct mce *m) { return false; }
+static inline bool intel_mce_usable_address(struct mce *m) { return false; }
#endif
-void mce_timer_kick(unsigned long interval);
+void mce_timer_kick(bool storm);
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+void cmci_storm_begin(unsigned int bank);
+void cmci_storm_end(unsigned int bank);
+void mce_track_storm(struct mce *mce);
+void mce_inherit_storm(unsigned int bank);
+bool mce_get_storm_mode(void);
+void mce_set_storm_mode(bool storm);
+#else
+static inline void cmci_storm_begin(unsigned int bank) {}
+static inline void cmci_storm_end(unsigned int bank) {}
+static inline void mce_track_storm(struct mce *mce) {}
+static inline void mce_inherit_storm(unsigned int bank) {}
+static inline bool mce_get_storm_mode(void) { return false; }
+static inline void mce_set_storm_mode(bool storm) {}
+#endif
+
+/*
+ * history: Bitmask tracking errors occurrence. Each set bit
+ * represents an error seen.
+ *
+ * timestamp: Last time (in jiffies) that the bank was polled.
+ * in_storm_mode: Is this bank in storm mode?
+ * poll_only: Bank does not support CMCI, skip storm tracking.
+ */
+struct storm_bank {
+ u64 history;
+ u64 timestamp;
+ bool in_storm_mode;
+ bool poll_only;
+};
+
+#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE)
+
+/* How many errors within the history buffer mark the start of a storm. */
+#define STORM_BEGIN_THRESHOLD 5
+
+/*
+ * How many polls of machine check bank without an error before declaring
+ * the storm is over. Since it is tracked by the bitmasks in the history
+ * field of struct storm_bank the mask is 30 bits [0 ... 29].
+ */
+#define STORM_END_POLL_THRESHOLD 29
+
+/*
+ * banks: per-cpu, per-bank details
+ * stormy_bank_count: count of MC banks in storm state
+ * poll_mode: CPU is in poll mode
+ */
+struct mca_storm_desc {
+ struct storm_bank banks[MAX_NR_BANKS];
+ u8 stormy_bank_count;
+ bool poll_mode;
+};
+
+DECLARE_PER_CPU(struct mca_storm_desc, storm_desc);
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
@@ -210,6 +264,7 @@ extern bool filter_mce(struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
+bool amd_mce_usable_address(struct mce *m);
/*
* If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
@@ -237,6 +292,7 @@ static __always_inline void smca_extract_err_addr(struct mce *m)
#else
static inline bool amd_filter_mce(struct mce *m) { return false; }
+static inline bool amd_mce_usable_address(struct mce *m) { return false; }
static inline void smca_extract_err_addr(struct mce *m) { }
#endif
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index ef4e7bb5f..89e31e1e5 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -29,3 +29,118 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
apic_eoi();
}
+
+DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);
+
+void mce_inherit_storm(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ /*
+ * Previous CPU owning this bank had put it into storm mode,
+ * but the precise history of that storm is unknown. Assume
+ * the worst (all recent polls of the bank found a valid error
+ * logged). This will avoid the new owner prematurely declaring
+ * the storm has ended.
+ */
+ storm->banks[bank].history = ~0ull;
+ storm->banks[bank].timestamp = jiffies;
+}
+
+bool mce_get_storm_mode(void)
+{
+ return __this_cpu_read(storm_desc.poll_mode);
+}
+
+void mce_set_storm_mode(bool storm)
+{
+ __this_cpu_write(storm_desc.poll_mode, storm);
+}
+
+static void mce_handle_storm(unsigned int bank, bool on)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_handle_storm(bank, on);
+ break;
+ }
+}
+
+void cmci_storm_begin(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ __set_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].in_storm_mode = true;
+
+ /*
+ * If this is the first bank on this CPU to enter storm mode
+ * start polling.
+ */
+ if (++storm->stormy_bank_count == 1)
+ mce_timer_kick(true);
+}
+
+void cmci_storm_end(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].history = 0;
+ storm->banks[bank].in_storm_mode = false;
+
+ /* If no banks left in storm mode, stop polling. */
+ if (!this_cpu_dec_return(storm_desc.stormy_bank_count))
+ mce_timer_kick(false);
+}
+
+void mce_track_storm(struct mce *mce)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+ unsigned long now = jiffies, delta;
+ unsigned int shift = 1;
+ u64 history = 0;
+
+ /* No tracking needed for banks that do not support CMCI */
+ if (storm->banks[mce->bank].poll_only)
+ return;
+
+ /*
+ * When a bank is in storm mode it is polled once per second and
+ * the history mask will record about the last minute of poll results.
+ * If it is not in storm mode, then the bank is only checked when
+ * there is a CMCI interrupt. Check how long it has been since
+ * this bank was last checked, and adjust the amount of "shift"
+ * to apply to history.
+ */
+ if (!storm->banks[mce->bank].in_storm_mode) {
+ delta = now - storm->banks[mce->bank].timestamp;
+ shift = (delta + HZ) / HZ;
+ }
+
+ /* If it has been a long time since the last poll, clear history. */
+ if (shift < NUM_HISTORY_BITS)
+ history = storm->banks[mce->bank].history << shift;
+
+ storm->banks[mce->bank].timestamp = now;
+
+ /* History keeps track of corrected errors. VAL=1 && UC=0 */
+ if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
+ history |= 1;
+
+ storm->banks[mce->bank].history = history;
+
+ if (storm->banks[mce->bank].in_storm_mode) {
+ if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, false);
+ cmci_storm_end(mce->bank);
+ } else {
+ if (hweight64(history) < STORM_BEGIN_THRESHOLD)
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, true);
+ cmci_storm_begin(mce->bank);
+ }
+}
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index bbd1dc38e..620f0af71 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -37,6 +37,16 @@
#include "internal.h"
+struct ucode_patch {
+ struct list_head plist;
+ void *data;
+ unsigned int size;
+ u32 patch_id;
+ u16 equiv_cpu;
+};
+
+static LIST_HEAD(microcode_cache);
+
#define UCODE_MAGIC 0x00414d44
#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
#define UCODE_UCODE_TYPE 0x00000001
@@ -94,8 +104,6 @@ struct cont_desc {
size_t size;
};
-static u32 ucode_new_rev;
-
/*
* Microcode patch container file is prepended to the initrd in cpio
* format. See Documentation/arch/x86/microcode.rst
@@ -121,24 +129,20 @@ static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig)
/*
* Check whether there is a valid microcode container file at the beginning
- * of @buf of size @buf_size. Set @early to use this function in the early path.
+ * of @buf of size @buf_size.
*/
-static bool verify_container(const u8 *buf, size_t buf_size, bool early)
+static bool verify_container(const u8 *buf, size_t buf_size)
{
u32 cont_magic;
if (buf_size <= CONTAINER_HDR_SZ) {
- if (!early)
- pr_debug("Truncated microcode container header.\n");
-
+ pr_debug("Truncated microcode container header.\n");
return false;
}
cont_magic = *(const u32 *)buf;
if (cont_magic != UCODE_MAGIC) {
- if (!early)
- pr_debug("Invalid magic value (0x%08x).\n", cont_magic);
-
+ pr_debug("Invalid magic value (0x%08x).\n", cont_magic);
return false;
}
@@ -147,23 +151,20 @@ static bool verify_container(const u8 *buf, size_t buf_size, bool early)
/*
* Check whether there is a valid, non-truncated CPU equivalence table at the
- * beginning of @buf of size @buf_size. Set @early to use this function in the
- * early path.
+ * beginning of @buf of size @buf_size.
*/
-static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early)
+static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
{
const u32 *hdr = (const u32 *)buf;
u32 cont_type, equiv_tbl_len;
- if (!verify_container(buf, buf_size, early))
+ if (!verify_container(buf, buf_size))
return false;
cont_type = hdr[1];
if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) {
- if (!early)
- pr_debug("Wrong microcode container equivalence table type: %u.\n",
- cont_type);
-
+ pr_debug("Wrong microcode container equivalence table type: %u.\n",
+ cont_type);
return false;
}
@@ -172,9 +173,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early)
equiv_tbl_len = hdr[2];
if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) ||
buf_size < equiv_tbl_len) {
- if (!early)
- pr_debug("Truncated equivalence table.\n");
-
+ pr_debug("Truncated equivalence table.\n");
return false;
}
@@ -183,22 +182,19 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early)
/*
* Check whether there is a valid, non-truncated microcode patch section at the
- * beginning of @buf of size @buf_size. Set @early to use this function in the
- * early path.
+ * beginning of @buf of size @buf_size.
*
* On success, @sh_psize returns the patch size according to the section header,
* to the caller.
*/
static bool
-__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize, bool early)
+__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
{
u32 p_type, p_size;
const u32 *hdr;
if (buf_size < SECTION_HDR_SIZE) {
- if (!early)
- pr_debug("Truncated patch section.\n");
-
+ pr_debug("Truncated patch section.\n");
return false;
}
@@ -207,17 +203,13 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize, bool early
p_size = hdr[1];
if (p_type != UCODE_UCODE_TYPE) {
- if (!early)
- pr_debug("Invalid type field (0x%x) in container file section header.\n",
- p_type);
-
+ pr_debug("Invalid type field (0x%x) in container file section header.\n",
+ p_type);
return false;
}
if (p_size < sizeof(struct microcode_header_amd)) {
- if (!early)
- pr_debug("Patch of size %u too short.\n", p_size);
-
+ pr_debug("Patch of size %u too short.\n", p_size);
return false;
}
@@ -269,7 +261,7 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size
* 0: success
*/
static int
-verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool early)
+verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
{
struct microcode_header_amd *mc_hdr;
unsigned int ret;
@@ -277,7 +269,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool ea
u16 proc_id;
u8 patch_fam;
- if (!__verify_patch_section(buf, buf_size, &sh_psize, early))
+ if (!__verify_patch_section(buf, buf_size, &sh_psize))
return -1;
/*
@@ -292,16 +284,13 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool ea
* size sh_psize, as the section claims.
*/
if (buf_size < sh_psize) {
- if (!early)
- pr_debug("Patch of size %u truncated.\n", sh_psize);
-
+ pr_debug("Patch of size %u truncated.\n", sh_psize);
return -1;
}
ret = __verify_patch_size(family, sh_psize, buf_size);
if (!ret) {
- if (!early)
- pr_debug("Per-family patch size mismatch.\n");
+ pr_debug("Per-family patch size mismatch.\n");
return -1;
}
@@ -309,8 +298,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool ea
mc_hdr = (struct microcode_header_amd *)(buf + SECTION_HDR_SIZE);
if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
- if (!early)
- pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", mc_hdr->patch_id);
+ pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", mc_hdr->patch_id);
return -1;
}
@@ -337,7 +325,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
u16 eq_id;
u8 *buf;
- if (!verify_equivalence_table(ucode, size, true))
+ if (!verify_equivalence_table(ucode, size))
return 0;
buf = ucode;
@@ -364,7 +352,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
u32 patch_size;
int ret;
- ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size, true);
+ ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size);
if (ret < 0) {
/*
* Patch verification failed, skip to the next container, if
@@ -452,19 +440,12 @@ static int __apply_microcode_amd(struct microcode_amd *mc)
*
* Returns true if container found (sets @desc), false otherwise.
*/
-static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size)
+static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size)
{
struct cont_desc desc = { 0 };
struct microcode_amd *mc;
- u32 rev, dummy, *new_rev;
bool ret = false;
-#ifdef CONFIG_X86_32
- new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
-#else
- new_rev = &ucode_new_rev;
-#endif
-
desc.cpuid_1_eax = cpuid_1_eax;
scan_containers(ucode, size, &desc);
@@ -473,25 +454,18 @@ static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size)
if (!mc)
return ret;
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
/*
* Allow application of the same revision to pick up SMT-specific
* changes even if the revision of the other SMT thread is already
* up-to-date.
*/
- if (rev > mc->hdr.patch_id)
+ if (old_rev > mc->hdr.patch_id)
return ret;
- if (!__apply_microcode_amd(mc)) {
- *new_rev = mc->hdr.patch_id;
- ret = true;
- }
-
- return ret;
+ return !__apply_microcode_amd(mc);
}
-static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
+static bool get_builtin_microcode(struct cpio_data *cp, u8 family)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct firmware fw;
@@ -501,7 +475,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
if (family >= 0x15)
snprintf(fw_name, sizeof(fw_name),
- "amd-ucode/microcode_amd_fam%.2xh.bin", family);
+ "amd-ucode/microcode_amd_fam%02hhxh.bin", family);
if (firmware_request_builtin(&fw, fw_name)) {
cp->size = fw.size;
@@ -512,57 +486,48 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
return false;
}
-static void find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret)
+static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret)
{
- struct ucode_cpu_info *uci;
struct cpio_data cp;
- const char *path;
- bool use_pa;
-
- if (IS_ENABLED(CONFIG_X86_32)) {
- uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info);
- path = (const char *)__pa_nodebug(ucode_path);
- use_pa = true;
- } else {
- uci = ucode_cpu_info;
- path = ucode_path;
- use_pa = false;
- }
if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax)))
- cp = find_microcode_in_initrd(path, use_pa);
-
- /* Needed in load_microcode_amd() */
- uci->cpu_sig.sig = cpuid_1_eax;
+ cp = find_microcode_in_initrd(ucode_path);
*ret = cp;
}
-static void apply_ucode_from_containers(unsigned int cpuid_1_eax)
+void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax)
{
struct cpio_data cp = { };
+ u32 dummy;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy);
+
+ /* Needed in load_microcode_amd() */
+ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax;
find_blobs_in_containers(cpuid_1_eax, &cp);
if (!(cp.data && cp.size))
return;
- early_apply_microcode(cpuid_1_eax, cp.data, cp.size);
-}
-
-void load_ucode_amd_early(unsigned int cpuid_1_eax)
-{
- return apply_ucode_from_containers(cpuid_1_eax);
+ if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size))
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy);
}
static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
-int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
+static int __init save_microcode_in_initrd(void)
{
+ unsigned int cpuid_1_eax = native_cpuid_eax(1);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
struct cont_desc desc = { 0 };
enum ucode_state ret;
struct cpio_data cp;
- cp = find_microcode_in_initrd(ucode_path, false);
+ if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
+ return 0;
+
+ find_blobs_in_containers(cpuid_1_eax, &cp);
if (!(cp.data && cp.size))
return -EINVAL;
@@ -578,6 +543,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
return 0;
}
+early_initcall(save_microcode_in_initrd);
/*
* a small, trivial cache of per-family ucode patches
@@ -631,7 +597,6 @@ static struct ucode_patch *find_patch(unsigned int cpu)
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
u16 equiv_id;
-
equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
if (!equiv_id)
return NULL;
@@ -654,10 +619,8 @@ void reload_ucode_amd(unsigned int cpu)
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
if (rev < mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc)) {
- ucode_new_rev = mc->hdr.patch_id;
- pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
- }
+ if (!__apply_microcode_amd(mc))
+ pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id);
}
}
@@ -678,8 +641,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
if (p && (p->patch_id == csig->rev))
uci->mc = p->data;
- pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
-
return 0;
}
@@ -720,8 +681,6 @@ static enum ucode_state apply_microcode_amd(int cpu)
rev = mc_amd->hdr.patch_id;
ret = UCODE_UPDATED;
- pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
-
out:
uci->cpu_sig.rev = rev;
c->microcode = rev;
@@ -733,12 +692,20 @@ out:
return ret;
}
+void load_ucode_amd_ap(unsigned int cpuid_1_eax)
+{
+ unsigned int cpu = smp_processor_id();
+
+ ucode_cpu_info[cpu].cpu_sig.sig = cpuid_1_eax;
+ apply_microcode_amd(cpu);
+}
+
static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
{
u32 equiv_tbl_len;
const u32 *hdr;
- if (!verify_equivalence_table(buf, buf_size, false))
+ if (!verify_equivalence_table(buf, buf_size))
return 0;
hdr = (const u32 *)buf;
@@ -784,7 +751,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
u16 proc_id;
int ret;
- ret = verify_patch(family, fw, leftover, patch_size, false);
+ ret = verify_patch(family, fw, leftover, patch_size);
if (ret)
return ret;
@@ -909,6 +876,9 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device)
enum ucode_state ret = UCODE_NFOUND;
const struct firmware *fw;
+ if (force_minrev)
+ return UCODE_NFOUND;
+
if (c->x86 >= 0x15)
snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
@@ -918,7 +888,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device)
}
ret = UCODE_ERROR;
- if (!verify_container(fw->data, fw->size, false))
+ if (!verify_container(fw->data, fw->size))
goto fw_release;
ret = load_microcode_amd(c->x86, fw->data, fw->size);
@@ -938,10 +908,11 @@ static void microcode_fini_cpu_amd(int cpu)
}
static struct microcode_ops microcode_amd_ops = {
- .request_microcode_fw = request_microcode_amd,
- .collect_cpu_info = collect_cpu_info_amd,
- .apply_microcode = apply_microcode_amd,
- .microcode_fini_cpu = microcode_fini_cpu_amd,
+ .request_microcode_fw = request_microcode_amd,
+ .collect_cpu_info = collect_cpu_info_amd,
+ .apply_microcode = apply_microcode_amd,
+ .microcode_fini_cpu = microcode_fini_cpu_amd,
+ .nmi_safe = true,
};
struct microcode_ops * __init init_amd_microcode(void)
@@ -952,11 +923,6 @@ struct microcode_ops * __init init_amd_microcode(void)
pr_warn("AMD CPU family 0x%x not supported\n", c->x86);
return NULL;
}
-
- if (ucode_new_rev)
- pr_info_once("microcode updated early to new patch_level=0x%08x\n",
- ucode_new_rev);
-
return &microcode_amd_ops;
}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index a4ebd5e0a..232026a23 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -23,6 +23,7 @@
#include <linux/miscdevice.h>
#include <linux/capability.h>
#include <linux/firmware.h>
+#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/mutex.h>
@@ -31,6 +32,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
+#include <asm/apic.h>
#include <asm/cpu_device_id.h>
#include <asm/perf_event.h>
#include <asm/processor.h>
@@ -39,14 +41,11 @@
#include "internal.h"
-#define DRIVER_VERSION "2.2"
-
static struct microcode_ops *microcode_ops;
-static bool dis_ucode_ldr = true;
-
-bool initrd_gone;
+bool dis_ucode_ldr = true;
-LIST_HEAD(microcode_cache);
+bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV);
+module_param(force_minrev, bool, S_IRUSR | S_IWUSR);
/*
* Synchronization.
@@ -76,6 +75,8 @@ static u32 final_levels[] = {
0, /* T-101 terminator */
};
+struct early_load_data early_data;
+
/*
* Check the current patch level on this CPU.
*
@@ -90,10 +91,7 @@ static bool amd_check_current_patch_level(void)
native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
- if (IS_ENABLED(CONFIG_X86_32))
- levels = (u32 *)__pa_nodebug(&final_levels);
- else
- levels = final_levels;
+ levels = final_levels;
for (i = 0; levels[i]; i++) {
if (lvl == levels[i])
@@ -105,17 +103,8 @@ static bool amd_check_current_patch_level(void)
static bool __init check_loader_disabled_bsp(void)
{
static const char *__dis_opt_str = "dis_ucode_ldr";
-
-#ifdef CONFIG_X86_32
- const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
- const char *option = (const char *)__pa_nodebug(__dis_opt_str);
- bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
-
-#else /* CONFIG_X86_64 */
const char *cmdline = boot_command_line;
const char *option = __dis_opt_str;
- bool *res = &dis_ucode_ldr;
-#endif
/*
* CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
@@ -123,17 +112,17 @@ static bool __init check_loader_disabled_bsp(void)
* that's good enough as they don't land on the BSP path anyway.
*/
if (native_cpuid_ecx(1) & BIT(31))
- return *res;
+ return true;
if (x86_cpuid_vendor() == X86_VENDOR_AMD) {
if (amd_check_current_patch_level())
- return *res;
+ return true;
}
if (cmdline_find_option_bool(cmdline, option) <= 0)
- *res = false;
+ dis_ucode_ldr = false;
- return *res;
+ return dis_ucode_ldr;
}
void __init load_ucode_bsp(void)
@@ -166,25 +155,16 @@ void __init load_ucode_bsp(void)
return;
if (intel)
- load_ucode_intel_bsp();
+ load_ucode_intel_bsp(&early_data);
else
- load_ucode_amd_early(cpuid_1_eax);
-}
-
-static bool check_loader_disabled_ap(void)
-{
-#ifdef CONFIG_X86_32
- return *((bool *)__pa_nodebug(&dis_ucode_ldr));
-#else
- return dis_ucode_ldr;
-#endif
+ load_ucode_amd_bsp(&early_data, cpuid_1_eax);
}
void load_ucode_ap(void)
{
unsigned int cpuid_1_eax;
- if (check_loader_disabled_ap())
+ if (dis_ucode_ldr)
return;
cpuid_1_eax = native_cpuid_eax(1);
@@ -196,103 +176,44 @@ void load_ucode_ap(void)
break;
case X86_VENDOR_AMD:
if (x86_family(cpuid_1_eax) >= 0x10)
- load_ucode_amd_early(cpuid_1_eax);
+ load_ucode_amd_ap(cpuid_1_eax);
break;
default:
break;
}
}
-static int __init save_microcode_in_initrd(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
- int ret = -EINVAL;
-
- if (dis_ucode_ldr) {
- ret = 0;
- goto out;
- }
-
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- if (c->x86 >= 6)
- ret = save_microcode_in_initrd_intel();
- break;
- case X86_VENDOR_AMD:
- if (c->x86 >= 0x10)
- ret = save_microcode_in_initrd_amd(cpuid_eax(1));
- break;
- default:
- break;
- }
-
-out:
- initrd_gone = true;
-
- return ret;
-}
-
-struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
+struct cpio_data __init find_microcode_in_initrd(const char *path)
{
#ifdef CONFIG_BLK_DEV_INITRD
unsigned long start = 0;
size_t size;
#ifdef CONFIG_X86_32
- struct boot_params *params;
-
- if (use_pa)
- params = (struct boot_params *)__pa_nodebug(&boot_params);
- else
- params = &boot_params;
-
- size = params->hdr.ramdisk_size;
-
- /*
- * Set start only if we have an initrd image. We cannot use initrd_start
- * because it is not set that early yet.
- */
+ size = boot_params.hdr.ramdisk_size;
+ /* Early load on BSP has a temporary mapping. */
if (size)
- start = params->hdr.ramdisk_image;
+ start = initrd_start_early;
-# else /* CONFIG_X86_64 */
+#else /* CONFIG_X86_64 */
size = (unsigned long)boot_params.ext_ramdisk_size << 32;
size |= boot_params.hdr.ramdisk_size;
if (size) {
start = (unsigned long)boot_params.ext_ramdisk_image << 32;
start |= boot_params.hdr.ramdisk_image;
-
start += PAGE_OFFSET;
}
-# endif
+#endif
/*
* Fixup the start address: after reserve_initrd() runs, initrd_start
* has the virtual address of the beginning of the initrd. It also
* possibly relocates the ramdisk. In either case, initrd_start contains
* the updated address so use that instead.
- *
- * initrd_gone is for the hotplug case where we've thrown out initrd
- * already.
*/
- if (!use_pa) {
- if (initrd_gone)
- return (struct cpio_data){ NULL, 0, "" };
- if (initrd_start)
- start = initrd_start;
- } else {
- /*
- * The picture with physical addresses is a bit different: we
- * need to get the *physical* address to which the ramdisk was
- * relocated, i.e., relocated_ramdisk (not initrd_start) and
- * since we're running from physical addresses, we need to access
- * relocated_ramdisk through its *physical* address too.
- */
- u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk);
- if (*rr)
- start = *rr;
- }
+ if (initrd_start)
+ start = initrd_start;
return find_cpio_data(path, (void *)start, size, NULL);
#else /* !CONFIG_BLK_DEV_INITRD */
@@ -336,117 +257,298 @@ static struct platform_device *microcode_pdev;
* requirement can be relaxed in the future. Right now, this is conservative
* and good.
*/
-#define SPINUNIT 100 /* 100 nsec */
+enum sibling_ctrl {
+ /* Spinwait with timeout */
+ SCTRL_WAIT,
+ /* Invoke the microcode_apply() callback */
+ SCTRL_APPLY,
+ /* Proceed without invoking the microcode_apply() callback */
+ SCTRL_DONE,
+};
+
+struct microcode_ctrl {
+ enum sibling_ctrl ctrl;
+ enum ucode_state result;
+ unsigned int ctrl_cpu;
+ bool nmi_enabled;
+};
-static int check_online_cpus(void)
+DEFINE_STATIC_KEY_FALSE(microcode_nmi_handler_enable);
+static DEFINE_PER_CPU(struct microcode_ctrl, ucode_ctrl);
+static atomic_t late_cpus_in, offline_in_nmi;
+static unsigned int loops_per_usec;
+static cpumask_t cpu_offline_mask;
+
+static noinstr bool wait_for_cpus(atomic_t *cnt)
{
- unsigned int cpu;
+ unsigned int timeout, loops;
- /*
- * Make sure all CPUs are online. It's fine for SMT to be disabled if
- * all the primary threads are still online.
- */
- for_each_present_cpu(cpu) {
- if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) {
- pr_err("Not all CPUs online, aborting microcode update.\n");
- return -EINVAL;
+ WARN_ON_ONCE(raw_atomic_dec_return(cnt) < 0);
+
+ for (timeout = 0; timeout < USEC_PER_SEC; timeout++) {
+ if (!raw_atomic_read(cnt))
+ return true;
+
+ for (loops = 0; loops < loops_per_usec; loops++)
+ cpu_relax();
+
+ /* If invoked directly, tickle the NMI watchdog */
+ if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) {
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
}
}
-
- return 0;
+ /* Prevent the late comers from making progress and let them time out */
+ raw_atomic_inc(cnt);
+ return false;
}
-static atomic_t late_cpus_in;
-static atomic_t late_cpus_out;
-
-static int __wait_for_cpus(atomic_t *t, long long timeout)
+static noinstr bool wait_for_ctrl(void)
{
- int all_cpus = num_online_cpus();
+ unsigned int timeout, loops;
- atomic_inc(t);
-
- while (atomic_read(t) < all_cpus) {
- if (timeout < SPINUNIT) {
- pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
- all_cpus - atomic_read(t));
- return 1;
- }
+ for (timeout = 0; timeout < USEC_PER_SEC; timeout++) {
+ if (raw_cpu_read(ucode_ctrl.ctrl) != SCTRL_WAIT)
+ return true;
- ndelay(SPINUNIT);
- timeout -= SPINUNIT;
+ for (loops = 0; loops < loops_per_usec; loops++)
+ cpu_relax();
- touch_nmi_watchdog();
+ /* If invoked directly, tickle the NMI watchdog */
+ if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) {
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+ }
}
- return 0;
+ return false;
}
/*
- * Returns:
- * < 0 - on error
- * 0 - success (no update done or microcode was updated)
+ * Protected against instrumentation up to the point where the primary
+ * thread completed the update. See microcode_nmi_handler() for details.
*/
-static int __reload_late(void *info)
+static noinstr bool load_secondary_wait(unsigned int ctrl_cpu)
{
- int cpu = smp_processor_id();
- enum ucode_state err;
- int ret = 0;
+ /* Initial rendezvous to ensure that all CPUs have arrived */
+ if (!wait_for_cpus(&late_cpus_in)) {
+ raw_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT);
+ return false;
+ }
/*
- * Wait for all CPUs to arrive. A load will not be attempted unless all
- * CPUs show up.
- * */
- if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
- return -1;
+ * Wait for primary threads to complete. If one of them hangs due
+ * to the update, there is no way out. This is non-recoverable
+ * because the CPU might hold locks or resources and confuse the
+ * scheduler, watchdogs etc. There is no way to safely evacuate the
+ * machine.
+ */
+ if (wait_for_ctrl())
+ return true;
+
+ instrumentation_begin();
+ panic("Microcode load: Primary CPU %d timed out\n", ctrl_cpu);
+ instrumentation_end();
+}
+
+/*
+ * Protected against instrumentation up to the point where the primary
+ * thread completed the update. See microcode_nmi_handler() for details.
+ */
+static noinstr void load_secondary(unsigned int cpu)
+{
+ unsigned int ctrl_cpu = raw_cpu_read(ucode_ctrl.ctrl_cpu);
+ enum ucode_state ret;
+
+ if (!load_secondary_wait(ctrl_cpu)) {
+ instrumentation_begin();
+ pr_err_once("load: %d CPUs timed out\n",
+ atomic_read(&late_cpus_in) - 1);
+ instrumentation_end();
+ return;
+ }
+ /* Primary thread completed. Allow to invoke instrumentable code */
+ instrumentation_begin();
/*
- * On an SMT system, it suffices to load the microcode on one sibling of
- * the core because the microcode engine is shared between the threads.
- * Synchronization still needs to take place so that no concurrent
- * loading attempts happen on multiple threads of an SMT core. See
- * below.
+ * If the primary succeeded then invoke the apply() callback,
+ * otherwise copy the state from the primary thread.
*/
- if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu)
- err = microcode_ops->apply_microcode(cpu);
+ if (this_cpu_read(ucode_ctrl.ctrl) == SCTRL_APPLY)
+ ret = microcode_ops->apply_microcode(cpu);
else
- goto wait_for_siblings;
+ ret = per_cpu(ucode_ctrl.result, ctrl_cpu);
- if (err >= UCODE_NFOUND) {
- if (err == UCODE_ERROR) {
- pr_warn("Error reloading microcode on CPU %d\n", cpu);
- ret = -1;
- }
+ this_cpu_write(ucode_ctrl.result, ret);
+ this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE);
+ instrumentation_end();
+}
+
+static void __load_primary(unsigned int cpu)
+{
+ struct cpumask *secondaries = topology_sibling_cpumask(cpu);
+ enum sibling_ctrl ctrl;
+ enum ucode_state ret;
+ unsigned int sibling;
+
+ /* Initial rendezvous to ensure that all CPUs have arrived */
+ if (!wait_for_cpus(&late_cpus_in)) {
+ this_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT);
+ pr_err_once("load: %d CPUs timed out\n", atomic_read(&late_cpus_in) - 1);
+ return;
}
-wait_for_siblings:
- if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC))
- panic("Timeout during microcode update!\n");
+ ret = microcode_ops->apply_microcode(cpu);
+ this_cpu_write(ucode_ctrl.result, ret);
+ this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE);
/*
- * At least one thread has completed update on each core.
- * For others, simply call the update to make sure the
- * per-cpu cpuinfo can be updated with right microcode
- * revision.
+ * If the update was successful, let the siblings run the apply()
+ * callback. If not, tell them it's done. This also covers the
+ * case where the CPU has uniform loading at package or system
+ * scope implemented but does not advertise it.
*/
- if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu)
- err = microcode_ops->apply_microcode(cpu);
+ if (ret == UCODE_UPDATED || ret == UCODE_OK)
+ ctrl = SCTRL_APPLY;
+ else
+ ctrl = SCTRL_DONE;
+
+ for_each_cpu(sibling, secondaries) {
+ if (sibling != cpu)
+ per_cpu(ucode_ctrl.ctrl, sibling) = ctrl;
+ }
+}
+
+static bool kick_offline_cpus(unsigned int nr_offl)
+{
+ unsigned int cpu, timeout;
+
+ for_each_cpu(cpu, &cpu_offline_mask) {
+ /* Enable the rendezvous handler and send NMI */
+ per_cpu(ucode_ctrl.nmi_enabled, cpu) = true;
+ apic_send_nmi_to_offline_cpu(cpu);
+ }
+
+ /* Wait for them to arrive */
+ for (timeout = 0; timeout < (USEC_PER_SEC / 2); timeout++) {
+ if (atomic_read(&offline_in_nmi) == nr_offl)
+ return true;
+ udelay(1);
+ }
+ /* Let the others time out */
+ return false;
+}
+
+static void release_offline_cpus(void)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, &cpu_offline_mask)
+ per_cpu(ucode_ctrl.ctrl, cpu) = SCTRL_DONE;
+}
+
+static void load_primary(unsigned int cpu)
+{
+ unsigned int nr_offl = cpumask_weight(&cpu_offline_mask);
+ bool proceed = true;
+
+ /* Kick soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ proceed = kick_offline_cpus(nr_offl);
- return ret;
+ /* If the soft-offlined CPUs did not respond, abort */
+ if (proceed)
+ __load_primary(cpu);
+
+ /* Unconditionally release soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ release_offline_cpus();
}
/*
- * Reload microcode late on all CPUs. Wait for a sec until they
- * all gather together.
+ * Minimal stub rendezvous handler for soft-offlined CPUs which participate
+ * in the NMI rendezvous to protect against a concurrent NMI on affected
+ * CPUs.
*/
-static int microcode_reload_late(void)
+void noinstr microcode_offline_nmi_handler(void)
{
- int old = boot_cpu_data.microcode, ret;
+ if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
+ return;
+ raw_cpu_write(ucode_ctrl.nmi_enabled, false);
+ raw_cpu_write(ucode_ctrl.result, UCODE_OFFLINE);
+ raw_atomic_inc(&offline_in_nmi);
+ wait_for_ctrl();
+}
+
+static noinstr bool microcode_update_handler(void)
+{
+ unsigned int cpu = raw_smp_processor_id();
+
+ if (raw_cpu_read(ucode_ctrl.ctrl_cpu) == cpu) {
+ instrumentation_begin();
+ load_primary(cpu);
+ instrumentation_end();
+ } else {
+ load_secondary(cpu);
+ }
+
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+
+ return true;
+}
+
+/*
+ * Protection against instrumentation is required for CPUs which are not
+ * safe against an NMI which is delivered to the secondary SMT sibling
+ * while the primary thread updates the microcode. Instrumentation can end
+ * up in #INT3, #DB and #PF. The IRET from those exceptions reenables NMI
+ * which is the opposite of what the NMI rendezvous is trying to achieve.
+ *
+ * The primary thread is safe versus instrumentation as the actual
+ * microcode update handles this correctly. It's only the sibling code
+ * path which must be NMI safe until the primary thread completed the
+ * update.
+ */
+bool noinstr microcode_nmi_handler(void)
+{
+ if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
+ return false;
+
+ raw_cpu_write(ucode_ctrl.nmi_enabled, false);
+ return microcode_update_handler();
+}
+
+static int load_cpus_stopped(void *unused)
+{
+ if (microcode_ops->use_nmi) {
+ /* Enable the NMI handler and raise NMI */
+ this_cpu_write(ucode_ctrl.nmi_enabled, true);
+ apic->send_IPI(smp_processor_id(), NMI_VECTOR);
+ } else {
+ /* Just invoke the handler directly */
+ microcode_update_handler();
+ }
+ return 0;
+}
+
+static int load_late_stop_cpus(bool is_safe)
+{
+ unsigned int cpu, updated = 0, failed = 0, timedout = 0, siblings = 0;
+ unsigned int nr_offl, offline = 0;
+ int old_rev = boot_cpu_data.microcode;
struct cpuinfo_x86 prev_info;
- pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n");
- pr_err("You should switch to early loading, if possible.\n");
+ if (!is_safe) {
+ pr_err("Late microcode loading without minimal revision check.\n");
+ pr_err("You should switch to early loading, if possible.\n");
+ }
- atomic_set(&late_cpus_in, 0);
- atomic_set(&late_cpus_out, 0);
+ atomic_set(&late_cpus_in, num_online_cpus());
+ atomic_set(&offline_in_nmi, 0);
+ loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000);
/*
* Take a snapshot before the microcode update in order to compare and
@@ -454,52 +556,162 @@ static int microcode_reload_late(void)
*/
store_cpu_caps(&prev_info);
- ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
- if (!ret) {
- pr_info("Reload succeeded, microcode revision: 0x%x -> 0x%x\n",
- old, boot_cpu_data.microcode);
- microcode_check(&prev_info);
- } else {
- pr_info("Reload failed, current microcode revision: 0x%x\n",
- boot_cpu_data.microcode);
+ if (microcode_ops->use_nmi)
+ static_branch_enable_cpuslocked(&microcode_nmi_handler_enable);
+
+ stop_machine_cpuslocked(load_cpus_stopped, NULL, cpu_online_mask);
+
+ if (microcode_ops->use_nmi)
+ static_branch_disable_cpuslocked(&microcode_nmi_handler_enable);
+
+ /* Analyze the results */
+ for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
+ switch (per_cpu(ucode_ctrl.result, cpu)) {
+ case UCODE_UPDATED: updated++; break;
+ case UCODE_TIMEOUT: timedout++; break;
+ case UCODE_OK: siblings++; break;
+ case UCODE_OFFLINE: offline++; break;
+ default: failed++; break;
+ }
+ }
+
+ if (microcode_ops->finalize_late_load)
+ microcode_ops->finalize_late_load(!updated);
+
+ if (!updated) {
+ /* Nothing changed. */
+ if (!failed && !timedout)
+ return 0;
+
+ nr_offl = cpumask_weight(&cpu_offline_mask);
+ if (offline < nr_offl) {
+ pr_warn("%u offline siblings did not respond.\n",
+ nr_offl - atomic_read(&offline_in_nmi));
+ return -EIO;
+ }
+ pr_err("update failed: %u CPUs failed %u CPUs timed out\n",
+ failed, timedout);
+ return -EIO;
+ }
+
+ if (!is_safe || failed || timedout)
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
+ pr_info("load: updated on %u primary CPUs with %u siblings\n", updated, siblings);
+ if (failed || timedout) {
+ pr_err("load incomplete. %u CPUs timed out or failed\n",
+ num_online_cpus() - (updated + siblings));
+ }
+ pr_info("revision: 0x%x -> 0x%x\n", old_rev, boot_cpu_data.microcode);
+ microcode_check(&prev_info);
+
+ return updated + siblings == num_online_cpus() ? 0 : -EIO;
+}
+
+/*
+ * This function does two things:
+ *
+ * 1) Ensure that all required CPUs which are present and have been booted
+ * once are online.
+ *
+ * To pass this check, all primary threads must be online.
+ *
+ * If the microcode load is not safe against NMI then all SMT threads
+ * must be online as well because they still react to NMIs when they are
+ * soft-offlined and parked in one of the play_dead() variants. So if a
+ * NMI hits while the primary thread updates the microcode the resulting
+ * behaviour is undefined. The default play_dead() implementation on
+ * modern CPUs uses MWAIT, which is also not guaranteed to be safe
+ * against a microcode update which affects MWAIT.
+ *
+ * As soft-offlined CPUs still react on NMIs, the SMT sibling
+ * restriction can be lifted when the vendor driver signals to use NMI
+ * for rendezvous and the APIC provides a mechanism to send an NMI to a
+ * soft-offlined CPU. The soft-offlined CPUs are then able to
+ * participate in the rendezvous in a trivial stub handler.
+ *
+ * 2) Initialize the per CPU control structure and create a cpumask
+ * which contains "offline"; secondary threads, so they can be handled
+ * correctly by a control CPU.
+ */
+static bool setup_cpus(void)
+{
+ struct microcode_ctrl ctrl = { .ctrl = SCTRL_WAIT, .result = -1, };
+ bool allow_smt_offline;
+ unsigned int cpu;
+
+ allow_smt_offline = microcode_ops->nmi_safe ||
+ (microcode_ops->use_nmi && apic->nmi_to_offline_cpu);
+
+ cpumask_clear(&cpu_offline_mask);
+
+ for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
+ /*
+ * Offline CPUs sit in one of the play_dead() functions
+ * with interrupts disabled, but they still react on NMIs
+ * and execute arbitrary code. Also MWAIT being updated
+ * while the offline CPU sits there is not necessarily safe
+ * on all CPU variants.
+ *
+ * Mark them in the offline_cpus mask which will be handled
+ * by CPU0 later in the update process.
+ *
+ * Ensure that the primary thread is online so that it is
+ * guaranteed that all cores are updated.
+ */
+ if (!cpu_online(cpu)) {
+ if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
+ pr_err("CPU %u not online, loading aborted\n", cpu);
+ return false;
+ }
+ cpumask_set_cpu(cpu, &cpu_offline_mask);
+ per_cpu(ucode_ctrl, cpu) = ctrl;
+ continue;
+ }
+
+ /*
+ * Initialize the per CPU state. This is core scope for now,
+ * but prepared to take package or system scope into account.
+ */
+ ctrl.ctrl_cpu = cpumask_first(topology_sibling_cpumask(cpu));
+ per_cpu(ucode_ctrl, cpu) = ctrl;
}
+ return true;
+}
- return ret;
+static int load_late_locked(void)
+{
+ if (!setup_cpus())
+ return -EBUSY;
+
+ switch (microcode_ops->request_microcode_fw(0, &microcode_pdev->dev)) {
+ case UCODE_NEW:
+ return load_late_stop_cpus(false);
+ case UCODE_NEW_SAFE:
+ return load_late_stop_cpus(true);
+ case UCODE_NFOUND:
+ return -ENOENT;
+ default:
+ return -EBADFD;
+ }
}
static ssize_t reload_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t size)
{
- enum ucode_state tmp_ret = UCODE_OK;
- int bsp = boot_cpu_data.cpu_index;
unsigned long val;
- ssize_t ret = 0;
+ ssize_t ret;
ret = kstrtoul(buf, 0, &val);
if (ret || val != 1)
return -EINVAL;
cpus_read_lock();
-
- ret = check_online_cpus();
- if (ret)
- goto put;
-
- tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev);
- if (tmp_ret != UCODE_NEW)
- goto put;
-
- ret = microcode_reload_late();
-put:
+ ret = load_late_locked();
cpus_read_unlock();
- if (ret == 0)
- ret = size;
-
- add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
-
- return ret;
+ return ret ? : size;
}
static DEVICE_ATTR_WO(reload);
@@ -541,17 +753,6 @@ static void microcode_fini_cpu(int cpu)
microcode_ops->microcode_fini_cpu(cpu);
}
-static enum ucode_state microcode_init_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- memset(uci, 0, sizeof(*uci));
-
- microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
-
- return microcode_ops->apply_microcode(cpu);
-}
-
/**
* microcode_bsp_resume - Update boot CPU microcode during resume.
*/
@@ -570,19 +771,18 @@ static struct syscore_ops mc_syscore_ops = {
.resume = microcode_bsp_resume,
};
-static int mc_cpu_starting(unsigned int cpu)
-{
- enum ucode_state err = microcode_ops->apply_microcode(cpu);
-
- pr_debug("%s: CPU%d, err: %d\n", __func__, cpu, err);
-
- return err == UCODE_ERROR;
-}
-
static int mc_cpu_online(unsigned int cpu)
{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct device *dev = get_cpu_device(cpu);
+ memset(uci, 0, sizeof(*uci));
+
+ microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
+ cpu_data(cpu).microcode = uci->cpu_sig.rev;
+ if (!cpu)
+ boot_cpu_data.microcode = uci->cpu_sig.rev;
+
if (sysfs_create_group(&dev->kobj, &mc_attr_group))
pr_err("Failed to create group for CPU%d\n", cpu);
return 0;
@@ -590,33 +790,13 @@ static int mc_cpu_online(unsigned int cpu)
static int mc_cpu_down_prep(unsigned int cpu)
{
- struct device *dev;
-
- dev = get_cpu_device(cpu);
+ struct device *dev = get_cpu_device(cpu);
microcode_fini_cpu(cpu);
-
- /* Suspend is in progress, only remove the interface */
sysfs_remove_group(&dev->kobj, &mc_attr_group);
- pr_debug("%s: CPU%d\n", __func__, cpu);
-
return 0;
}
-static void setup_online_cpu(struct work_struct *work)
-{
- int cpu = smp_processor_id();
- enum ucode_state err;
-
- err = microcode_init_cpu(cpu);
- if (err == UCODE_ERROR) {
- pr_err("Error applying microcode on CPU%d\n", cpu);
- return;
- }
-
- mc_cpu_online(cpu);
-}
-
static struct attribute *cpu_root_microcode_attrs[] = {
#ifdef CONFIG_MICROCODE_LATE_LOADING
&dev_attr_reload.attr,
@@ -648,6 +828,11 @@ static int __init microcode_init(void)
if (!microcode_ops)
return -ENODEV;
+ pr_info_once("Current revision: 0x%08x\n", (early_data.new_rev ?: early_data.old_rev));
+
+ if (early_data.new_rev)
+ pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev);
+
microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0);
if (IS_ERR(microcode_pdev))
return PTR_ERR(microcode_pdev);
@@ -662,16 +847,9 @@ static int __init microcode_init(void)
}
}
- /* Do per-CPU setup */
- schedule_on_each_cpu(setup_online_cpu);
-
register_syscore_ops(&mc_syscore_ops);
- cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting",
- mc_cpu_starting, NULL);
- cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
- mc_cpu_online, mc_cpu_down_prep);
-
- pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION);
+ cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
+ mc_cpu_online, mc_cpu_down_prep);
return 0;
@@ -680,5 +858,4 @@ static int __init microcode_init(void)
return error;
}
-fs_initcall(save_microcode_in_initrd);
late_initcall(microcode_init);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 94dd6af9c..857e608af 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -14,7 +14,6 @@
#include <linux/earlycpio.h>
#include <linux/firmware.h>
#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include <linux/initrd.h>
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -32,11 +31,14 @@
static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
+#define UCODE_BSP_LOADED ((struct microcode_intel *)0x1UL)
+
/* Current microcode patch used in early patching on the APs. */
-static struct microcode_intel *intel_ucode_patch;
+static struct microcode_intel *ucode_patch_va __read_mostly;
+static struct microcode_intel *ucode_patch_late __read_mostly;
/* last level cache size per core */
-static int llc_size_per_core;
+static unsigned int llc_size_per_core __ro_after_init;
/* microcode format is extended from prescott processors */
struct extended_signature {
@@ -66,60 +68,52 @@ static inline unsigned int exttable_size(struct extended_sigtable *et)
return et->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE;
}
-int intel_cpu_collect_info(struct ucode_cpu_info *uci)
+void intel_collect_cpu_info(struct cpu_signature *sig)
{
- unsigned int val[2];
- unsigned int family, model;
- struct cpu_signature csig = { 0 };
- unsigned int eax, ebx, ecx, edx;
-
- memset(uci, 0, sizeof(*uci));
-
- eax = 0x00000001;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- csig.sig = eax;
+ sig->sig = cpuid_eax(1);
+ sig->pf = 0;
+ sig->rev = intel_get_microcode_revision();
- family = x86_family(eax);
- model = x86_model(eax);
+ if (x86_model(sig->sig) >= 5 || x86_family(sig->sig) > 6) {
+ unsigned int val[2];
- if (model >= 5 || family > 6) {
/* get processor flags from MSR 0x17 */
native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig.pf = 1 << ((val[1] >> 18) & 7);
+ sig->pf = 1 << ((val[1] >> 18) & 7);
}
+}
+EXPORT_SYMBOL_GPL(intel_collect_cpu_info);
- csig.rev = intel_get_microcode_revision();
-
- uci->cpu_sig = csig;
+static inline bool cpu_signatures_match(struct cpu_signature *s1, unsigned int sig2,
+ unsigned int pf2)
+{
+ if (s1->sig != sig2)
+ return false;
- return 0;
+ /* Processor flags are either both 0 or they intersect. */
+ return ((!s1->pf && !pf2) || (s1->pf & pf2));
}
-EXPORT_SYMBOL_GPL(intel_cpu_collect_info);
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-int intel_find_matching_signature(void *mc, unsigned int csig, int cpf)
+bool intel_find_matching_signature(void *mc, struct cpu_signature *sig)
{
struct microcode_header_intel *mc_hdr = mc;
- struct extended_sigtable *ext_hdr;
struct extended_signature *ext_sig;
+ struct extended_sigtable *ext_hdr;
int i;
- if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
- return 1;
+ if (cpu_signatures_match(sig, mc_hdr->sig, mc_hdr->pf))
+ return true;
/* Look for ext. headers: */
if (get_totalsize(mc_hdr) <= intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE)
- return 0;
+ return false;
ext_hdr = mc + intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE;
ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
for (i = 0; i < ext_hdr->count; i++) {
- if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
- return 1;
+ if (cpu_signatures_match(sig, ext_sig->sig, ext_sig->pf))
+ return true;
ext_sig++;
}
return 0;
@@ -240,264 +234,91 @@ int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type)
}
EXPORT_SYMBOL_GPL(intel_microcode_sanity_check);
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
+static void update_ucode_pointer(struct microcode_intel *mc)
{
- struct microcode_header_intel *mc_hdr = mc;
-
- if (mc_hdr->rev <= new_rev)
- return 0;
-
- return intel_find_matching_signature(mc, csig, cpf);
-}
-
-static struct ucode_patch *memdup_patch(void *data, unsigned int size)
-{
- struct ucode_patch *p;
-
- p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL);
- if (!p)
- return NULL;
-
- p->data = kmemdup(data, size, GFP_KERNEL);
- if (!p->data) {
- kfree(p);
- return NULL;
- }
-
- return p;
-}
-
-static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size)
-{
- struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
- struct ucode_patch *iter, *tmp, *p = NULL;
- bool prev_found = false;
- unsigned int sig, pf;
-
- mc_hdr = (struct microcode_header_intel *)data;
-
- list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
- mc_saved_hdr = (struct microcode_header_intel *)iter->data;
- sig = mc_saved_hdr->sig;
- pf = mc_saved_hdr->pf;
-
- if (intel_find_matching_signature(data, sig, pf)) {
- prev_found = true;
-
- if (mc_hdr->rev <= mc_saved_hdr->rev)
- continue;
-
- p = memdup_patch(data, size);
- if (!p)
- pr_err("Error allocating buffer %p\n", data);
- else {
- list_replace(&iter->plist, &p->plist);
- kfree(iter->data);
- kfree(iter);
- }
- }
- }
+ kvfree(ucode_patch_va);
/*
- * There weren't any previous patches found in the list cache; save the
- * newly found.
+ * Save the virtual address for early loading and for eventual free
+ * on late loading.
*/
- if (!prev_found) {
- p = memdup_patch(data, size);
- if (!p)
- pr_err("Error allocating buffer for %p\n", data);
- else
- list_add_tail(&p->plist, &microcode_cache);
- }
-
- if (!p)
- return;
+ ucode_patch_va = mc;
+}
- if (!intel_find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf))
- return;
+static void save_microcode_patch(struct microcode_intel *patch)
+{
+ unsigned int size = get_totalsize(&patch->hdr);
+ struct microcode_intel *mc;
- /*
- * Save for early loading. On 32-bit, that needs to be a physical
- * address as the APs are running from physical addresses, before
- * paging has been enabled.
- */
- if (IS_ENABLED(CONFIG_X86_32))
- intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
+ mc = kvmemdup(patch, size, GFP_KERNEL);
+ if (mc)
+ update_ucode_pointer(mc);
else
- intel_ucode_patch = p->data;
+ pr_err("Unable to allocate microcode memory size: %u\n", size);
}
-/*
- * Get microcode matching with BSP's model. Only CPUs with the same model as
- * BSP can stay in the platform.
- */
-static struct microcode_intel *
-scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
+/* Scan blob for microcode matching the boot CPUs family, model, stepping */
+static __init struct microcode_intel *scan_microcode(void *data, size_t size,
+ struct ucode_cpu_info *uci,
+ bool save)
{
struct microcode_header_intel *mc_header;
struct microcode_intel *patch = NULL;
+ u32 cur_rev = uci->cpu_sig.rev;
unsigned int mc_size;
- while (size) {
- if (size < sizeof(struct microcode_header_intel))
- break;
-
+ for (; size >= sizeof(struct microcode_header_intel); size -= mc_size, data += mc_size) {
mc_header = (struct microcode_header_intel *)data;
mc_size = get_totalsize(mc_header);
- if (!mc_size ||
- mc_size > size ||
+ if (!mc_size || mc_size > size ||
intel_microcode_sanity_check(data, false, MC_HEADER_TYPE_MICROCODE) < 0)
break;
- size -= mc_size;
-
- if (!intel_find_matching_signature(data, uci->cpu_sig.sig,
- uci->cpu_sig.pf)) {
- data += mc_size;
+ if (!intel_find_matching_signature(data, &uci->cpu_sig))
continue;
- }
+ /*
+ * For saving the early microcode, find the matching revision which
+ * was loaded on the BSP.
+ *
+ * On the BSP during early boot, find a newer revision than
+ * actually loaded in the CPU.
+ */
if (save) {
- save_microcode_patch(uci, data, mc_size);
- goto next;
- }
-
-
- if (!patch) {
- if (!has_newer_microcode(data,
- uci->cpu_sig.sig,
- uci->cpu_sig.pf,
- uci->cpu_sig.rev))
- goto next;
-
- } else {
- struct microcode_header_intel *phdr = &patch->hdr;
-
- if (!has_newer_microcode(data,
- phdr->sig,
- phdr->pf,
- phdr->rev))
- goto next;
+ if (cur_rev != mc_header->rev)
+ continue;
+ } else if (cur_rev >= mc_header->rev) {
+ continue;
}
- /* We have a newer patch, save it. */
patch = data;
-
-next:
- data += mc_size;
- }
-
- if (size)
- return NULL;
-
- return patch;
-}
-
-static bool load_builtin_intel_microcode(struct cpio_data *cp)
-{
- unsigned int eax = 1, ebx, ecx = 0, edx;
- struct firmware fw;
- char name[30];
-
- if (IS_ENABLED(CONFIG_X86_32))
- return false;
-
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- sprintf(name, "intel-ucode/%02x-%02x-%02x",
- x86_family(eax), x86_model(eax), x86_stepping(eax));
-
- if (firmware_request_builtin(&fw, name)) {
- cp->size = fw.size;
- cp->data = (void *)fw.data;
- return true;
+ cur_rev = mc_header->rev;
}
- return false;
+ return size ? NULL : patch;
}
-static void print_ucode_info(int old_rev, int new_rev, unsigned int date)
+static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci,
+ struct microcode_intel *mc,
+ u32 *cur_rev)
{
- pr_info_once("updated early: 0x%x -> 0x%x, date = %04x-%02x-%02x\n",
- old_rev,
- new_rev,
- date & 0xffff,
- date >> 24,
- (date >> 16) & 0xff);
-}
-
-#ifdef CONFIG_X86_32
-
-static int delay_ucode_info;
-static int current_mc_date;
-static int early_old_rev;
-
-/*
- * Print early updated ucode info after printk works. This is delayed info dump.
- */
-void show_ucode_info_early(void)
-{
- struct ucode_cpu_info uci;
-
- if (delay_ucode_info) {
- intel_cpu_collect_info(&uci);
- print_ucode_info(early_old_rev, uci.cpu_sig.rev, current_mc_date);
- delay_ucode_info = 0;
- }
-}
-
-/*
- * At this point, we can not call printk() yet. Delay printing microcode info in
- * show_ucode_info_early() until printk() works.
- */
-static void print_ucode(int old_rev, int new_rev, int date)
-{
- int *delay_ucode_info_p;
- int *current_mc_date_p;
- int *early_old_rev_p;
-
- delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
- current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
- early_old_rev_p = (int *)__pa_nodebug(&early_old_rev);
-
- *delay_ucode_info_p = 1;
- *current_mc_date_p = date;
- *early_old_rev_p = old_rev;
-}
-#else
-
-static inline void print_ucode(int old_rev, int new_rev, int date)
-{
- print_ucode_info(old_rev, new_rev, date);
-}
-#endif
-
-static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
-{
- struct microcode_intel *mc;
- u32 rev, old_rev;
+ u32 rev;
- mc = uci->mc;
if (!mc)
- return 0;
+ return UCODE_NFOUND;
/*
* Save us the MSR write below - which is a particular expensive
* operation - when the other hyperthread has updated the microcode
* already.
*/
- rev = intel_get_microcode_revision();
- if (rev >= mc->hdr.rev) {
- uci->cpu_sig.rev = rev;
+ *cur_rev = intel_get_microcode_revision();
+ if (*cur_rev >= mc->hdr.rev) {
+ uci->cpu_sig.rev = *cur_rev;
return UCODE_OK;
}
- old_rev = rev;
-
/*
* Writeback and invalidate caches before updating microcode to avoid
* internal issues depending on what the microcode is updating.
@@ -509,247 +330,173 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
rev = intel_get_microcode_revision();
if (rev != mc->hdr.rev)
- return -1;
+ return UCODE_ERROR;
uci->cpu_sig.rev = rev;
+ return UCODE_UPDATED;
+}
- if (early)
- print_ucode(old_rev, uci->cpu_sig.rev, mc->hdr.date);
- else
- print_ucode_info(old_rev, uci->cpu_sig.rev, mc->hdr.date);
+static enum ucode_state apply_microcode_early(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc = uci->mc;
+ u32 cur_rev;
- return 0;
+ return __apply_microcode(uci, mc, &cur_rev);
}
-int __init save_microcode_in_initrd_intel(void)
+static __init bool load_builtin_intel_microcode(struct cpio_data *cp)
{
- struct ucode_cpu_info uci;
- struct cpio_data cp;
-
- /*
- * initrd is going away, clear patch ptr. We will scan the microcode one
- * last time before jettisoning and save a patch, if found. Then we will
- * update that pointer too, with a stable patch address to use when
- * resuming the cores.
- */
- intel_ucode_patch = NULL;
+ unsigned int eax = 1, ebx, ecx = 0, edx;
+ struct firmware fw;
+ char name[30];
- if (!load_builtin_intel_microcode(&cp))
- cp = find_microcode_in_initrd(ucode_path, false);
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
- if (!(cp.data && cp.size))
- return 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
- intel_cpu_collect_info(&uci);
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ x86_family(eax), x86_model(eax), x86_stepping(eax));
- scan_microcode(cp.data, cp.size, &uci, true);
- return 0;
+ if (firmware_request_builtin(&fw, name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+ return false;
}
-/*
- * @res_patch, output: a pointer to the patch we found.
- */
-static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci)
+static __init struct microcode_intel *get_microcode_blob(struct ucode_cpu_info *uci, bool save)
{
- static const char *path;
struct cpio_data cp;
- bool use_pa;
-
- if (IS_ENABLED(CONFIG_X86_32)) {
- path = (const char *)__pa_nodebug(ucode_path);
- use_pa = true;
- } else {
- path = ucode_path;
- use_pa = false;
- }
- /* try built-in microcode first */
+ intel_collect_cpu_info(&uci->cpu_sig);
+
if (!load_builtin_intel_microcode(&cp))
- cp = find_microcode_in_initrd(path, use_pa);
+ cp = find_microcode_in_initrd(ucode_path);
if (!(cp.data && cp.size))
return NULL;
- intel_cpu_collect_info(uci);
-
- return scan_microcode(cp.data, cp.size, uci, false);
+ return scan_microcode(cp.data, cp.size, uci, save);
}
-void __init load_ucode_intel_bsp(void)
+/*
+ * Invoked from an early init call to save the microcode blob which was
+ * selected during early boot when mm was not usable. The microcode must be
+ * saved because initrd is going away. It's an early init call so the APs
+ * just can use the pointer and do not have to scan initrd/builtin firmware
+ * again.
+ */
+static int __init save_builtin_microcode(void)
{
- struct microcode_intel *patch;
struct ucode_cpu_info uci;
- patch = __load_ucode_intel(&uci);
- if (!patch)
- return;
+ if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED)
+ return 0;
- uci.mc = patch;
+ if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
- apply_microcode_early(&uci, true);
+ uci.mc = get_microcode_blob(&uci, true);
+ if (uci.mc)
+ save_microcode_patch(uci.mc);
+ return 0;
}
+early_initcall(save_builtin_microcode);
-void load_ucode_intel_ap(void)
+/* Load microcode on BSP from initrd or builtin blobs */
+void __init load_ucode_intel_bsp(struct early_load_data *ed)
{
- struct microcode_intel *patch, **iup;
struct ucode_cpu_info uci;
- if (IS_ENABLED(CONFIG_X86_32))
- iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch);
- else
- iup = &intel_ucode_patch;
+ uci.mc = get_microcode_blob(&uci, false);
+ ed->old_rev = uci.cpu_sig.rev;
- if (!*iup) {
- patch = __load_ucode_intel(&uci);
- if (!patch)
- return;
-
- *iup = patch;
+ if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) {
+ ucode_patch_va = UCODE_BSP_LOADED;
+ ed->new_rev = uci.cpu_sig.rev;
}
-
- uci.mc = *iup;
-
- apply_microcode_early(&uci, true);
}
-static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
+void load_ucode_intel_ap(void)
{
- struct microcode_header_intel *phdr;
- struct ucode_patch *iter, *tmp;
-
- list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
-
- phdr = (struct microcode_header_intel *)iter->data;
-
- if (phdr->rev <= uci->cpu_sig.rev)
- continue;
-
- if (!intel_find_matching_signature(phdr,
- uci->cpu_sig.sig,
- uci->cpu_sig.pf))
- continue;
+ struct ucode_cpu_info uci;
- return iter->data;
- }
- return NULL;
+ uci.mc = ucode_patch_va;
+ if (uci.mc)
+ apply_microcode_early(&uci);
}
+/* Reload microcode on resume */
void reload_ucode_intel(void)
{
- struct microcode_intel *p;
- struct ucode_cpu_info uci;
-
- intel_cpu_collect_info(&uci);
-
- p = find_patch(&uci);
- if (!p)
- return;
-
- uci.mc = p;
+ struct ucode_cpu_info uci = { .mc = ucode_patch_va, };
- apply_microcode_early(&uci, false);
+ if (uci.mc)
+ apply_microcode_early(&uci);
}
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu_num);
- unsigned int val[2];
-
- memset(csig, 0, sizeof(*csig));
-
- csig->sig = cpuid_eax(0x00000001);
-
- if ((c->x86_model >= 5) || (c->x86 > 6)) {
- /* get processor flags from MSR 0x17 */
- rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig->pf = 1 << ((val[1] >> 18) & 7);
- }
-
- csig->rev = c->microcode;
-
+ intel_collect_cpu_info(csig);
return 0;
}
-static enum ucode_state apply_microcode_intel(int cpu)
+static enum ucode_state apply_microcode_late(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
- struct microcode_intel *mc;
+ struct microcode_intel *mc = ucode_patch_late;
enum ucode_state ret;
- static int prev_rev;
- u32 rev;
+ u32 cur_rev;
- /* We should bind the task to the CPU */
- if (WARN_ON(raw_smp_processor_id() != cpu))
+ if (WARN_ON_ONCE(smp_processor_id() != cpu))
return UCODE_ERROR;
- /* Look for a newer patch in our cache: */
- mc = find_patch(uci);
- if (!mc) {
- mc = uci->mc;
- if (!mc)
- return UCODE_NFOUND;
- }
+ ret = __apply_microcode(uci, mc, &cur_rev);
+ if (ret != UCODE_UPDATED && ret != UCODE_OK)
+ return ret;
+
+ cpu_data(cpu).microcode = uci->cpu_sig.rev;
+ if (!cpu)
+ boot_cpu_data.microcode = uci->cpu_sig.rev;
+
+ return ret;
+}
+
+static bool ucode_validate_minrev(struct microcode_header_intel *mc_header)
+{
+ int cur_rev = boot_cpu_data.microcode;
/*
- * Save us the MSR write below - which is a particular expensive
- * operation - when the other hyperthread has updated the microcode
- * already.
+ * When late-loading, ensure the header declares a minimum revision
+ * required to perform a late-load. The previously reserved field
+ * is 0 in older microcode blobs.
*/
- rev = intel_get_microcode_revision();
- if (rev >= mc->hdr.rev) {
- ret = UCODE_OK;
- goto out;
+ if (!mc_header->min_req_ver) {
+ pr_info("Unsafe microcode update: Microcode header does not specify a required min version\n");
+ return false;
}
/*
- * Writeback and invalidate caches before updating microcode to avoid
- * internal issues depending on what the microcode is updating.
+ * Check whether the current revision is either greater or equal to
+ * to the minimum revision specified in the header.
*/
- native_wbinvd();
-
- /* write microcode via MSR 0x79 */
- wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
-
- rev = intel_get_microcode_revision();
-
- if (rev != mc->hdr.rev) {
- pr_err("CPU%d update to revision 0x%x failed\n",
- cpu, mc->hdr.rev);
- return UCODE_ERROR;
- }
-
- if (bsp && rev != prev_rev) {
- pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
- rev,
- mc->hdr.date & 0xffff,
- mc->hdr.date >> 24,
- (mc->hdr.date >> 16) & 0xff);
- prev_rev = rev;
+ if (cur_rev < mc_header->min_req_ver) {
+ pr_info("Unsafe microcode update: Current revision 0x%x too old\n", cur_rev);
+ pr_info("Current should be at 0x%x or higher. Use early loading instead\n", mc_header->min_req_ver);
+ return false;
}
-
- ret = UCODE_UPDATED;
-
-out:
- uci->cpu_sig.rev = rev;
- c->microcode = rev;
-
- /* Update boot_cpu_data's revision too, if we're on the BSP: */
- if (bsp)
- boot_cpu_data.microcode = rev;
-
- return ret;
+ return true;
}
-static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)
+static enum ucode_state parse_microcode_blobs(int cpu, struct iov_iter *iter)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- unsigned int curr_mc_size = 0, new_mc_size = 0;
- enum ucode_state ret = UCODE_OK;
- int new_rev = uci->cpu_sig.rev;
+ bool is_safe, new_is_safe = false;
+ int cur_rev = uci->cpu_sig.rev;
+ unsigned int curr_mc_size = 0;
u8 *new_mc = NULL, *mc = NULL;
- unsigned int csig, cpf;
while (iov_iter_count(iter)) {
struct microcode_header_intel mc_header;
@@ -758,68 +505,66 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)
if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) {
pr_err("error! Truncated or inaccessible header in microcode data file\n");
- break;
+ goto fail;
}
mc_size = get_totalsize(&mc_header);
if (mc_size < sizeof(mc_header)) {
pr_err("error! Bad data in microcode data file (totalsize too small)\n");
- break;
+ goto fail;
}
data_size = mc_size - sizeof(mc_header);
if (data_size > iov_iter_count(iter)) {
pr_err("error! Bad data in microcode data file (truncated file?)\n");
- break;
+ goto fail;
}
/* For performance reasons, reuse mc area when possible */
if (!mc || mc_size > curr_mc_size) {
- vfree(mc);
- mc = vmalloc(mc_size);
+ kvfree(mc);
+ mc = kvmalloc(mc_size, GFP_KERNEL);
if (!mc)
- break;
+ goto fail;
curr_mc_size = mc_size;
}
memcpy(mc, &mc_header, sizeof(mc_header));
data = mc + sizeof(mc_header);
if (!copy_from_iter_full(data, data_size, iter) ||
- intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0) {
- break;
- }
+ intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0)
+ goto fail;
- csig = uci->cpu_sig.sig;
- cpf = uci->cpu_sig.pf;
- if (has_newer_microcode(mc, csig, cpf, new_rev)) {
- vfree(new_mc);
- new_rev = mc_header.rev;
- new_mc = mc;
- new_mc_size = mc_size;
- mc = NULL; /* trigger new vmalloc */
- ret = UCODE_NEW;
- }
- }
+ if (cur_rev >= mc_header.rev)
+ continue;
- vfree(mc);
+ if (!intel_find_matching_signature(mc, &uci->cpu_sig))
+ continue;
- if (iov_iter_count(iter)) {
- vfree(new_mc);
- return UCODE_ERROR;
+ is_safe = ucode_validate_minrev(&mc_header);
+ if (force_minrev && !is_safe)
+ continue;
+
+ kvfree(new_mc);
+ cur_rev = mc_header.rev;
+ new_mc = mc;
+ new_is_safe = is_safe;
+ mc = NULL;
}
+ if (iov_iter_count(iter))
+ goto fail;
+
+ kvfree(mc);
if (!new_mc)
return UCODE_NFOUND;
- vfree(uci->mc);
- uci->mc = (struct microcode_intel *)new_mc;
-
- /* Save for CPU hotplug */
- save_microcode_patch(uci, new_mc, new_mc_size);
+ ucode_patch_late = (struct microcode_intel *)new_mc;
+ return new_is_safe ? UCODE_NEW_SAFE : UCODE_NEW;
- pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
- cpu, new_rev, uci->cpu_sig.rev);
-
- return ret;
+fail:
+ kvfree(mc);
+ kvfree(new_mc);
+ return UCODE_ERROR;
}
static bool is_blacklisted(unsigned int cpu)
@@ -868,26 +613,36 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
kvec.iov_base = (void *)firmware->data;
kvec.iov_len = firmware->size;
iov_iter_kvec(&iter, ITER_SOURCE, &kvec, 1, firmware->size);
- ret = generic_load_microcode(cpu, &iter);
+ ret = parse_microcode_blobs(cpu, &iter);
release_firmware(firmware);
return ret;
}
+static void finalize_late_load(int result)
+{
+ if (!result)
+ update_ucode_pointer(ucode_patch_late);
+ else
+ kvfree(ucode_patch_late);
+ ucode_patch_late = NULL;
+}
+
static struct microcode_ops microcode_intel_ops = {
- .request_microcode_fw = request_microcode_fw,
- .collect_cpu_info = collect_cpu_info,
- .apply_microcode = apply_microcode_intel,
+ .request_microcode_fw = request_microcode_fw,
+ .collect_cpu_info = collect_cpu_info,
+ .apply_microcode = apply_microcode_late,
+ .finalize_late_load = finalize_late_load,
+ .use_nmi = IS_ENABLED(CONFIG_X86_64),
};
-static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c)
+static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c)
{
u64 llc_size = c->x86_cache_size * 1024ULL;
do_div(llc_size, c->x86_max_cores);
-
- return (int)llc_size;
+ llc_size_per_core = (unsigned int)llc_size;
}
struct microcode_ops * __init init_intel_microcode(void)
@@ -900,7 +655,7 @@ struct microcode_ops * __init init_intel_microcode(void)
return NULL;
}
- llc_size_per_core = calc_llc_size_per_core(c);
+ calc_llc_size_per_core(c);
return &microcode_intel_ops;
}
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
index bf883aa71..21776c529 100644
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -8,43 +8,43 @@
#include <asm/cpu.h>
#include <asm/microcode.h>
-struct ucode_patch {
- struct list_head plist;
- void *data; /* Intel uses only this one */
- unsigned int size;
- u32 patch_id;
- u16 equiv_cpu;
-};
-
-extern struct list_head microcode_cache;
-
struct device;
enum ucode_state {
UCODE_OK = 0,
UCODE_NEW,
+ UCODE_NEW_SAFE,
UCODE_UPDATED,
UCODE_NFOUND,
UCODE_ERROR,
+ UCODE_TIMEOUT,
+ UCODE_OFFLINE,
};
struct microcode_ops {
enum ucode_state (*request_microcode_fw)(int cpu, struct device *dev);
-
void (*microcode_fini_cpu)(int cpu);
/*
- * The generic 'microcode_core' part guarantees that
- * the callbacks below run on a target cpu when they
- * are being called.
+ * The generic 'microcode_core' part guarantees that the callbacks
+ * below run on a target CPU when they are being called.
* See also the "Synchronization" section in microcode_core.c.
*/
- enum ucode_state (*apply_microcode)(int cpu);
- int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
+ enum ucode_state (*apply_microcode)(int cpu);
+ int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
+ void (*finalize_late_load)(int result);
+ unsigned int nmi_safe : 1,
+ use_nmi : 1;
+};
+
+struct early_load_data {
+ u32 old_rev;
+ u32 new_rev;
};
+extern struct early_load_data early_data;
extern struct ucode_cpu_info ucode_cpu_info[];
-struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa);
+struct cpio_data find_microcode_in_initrd(const char *path);
#define MAX_UCODE_COUNT 128
@@ -94,20 +94,19 @@ static inline unsigned int x86_cpuid_family(void)
return x86_family(eax);
}
-extern bool initrd_gone;
+extern bool dis_ucode_ldr;
+extern bool force_minrev;
#ifdef CONFIG_CPU_SUP_AMD
-void load_ucode_amd_bsp(unsigned int family);
+void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family);
void load_ucode_amd_ap(unsigned int family);
-void load_ucode_amd_early(unsigned int cpuid_1_eax);
int save_microcode_in_initrd_amd(unsigned int family);
void reload_ucode_amd(unsigned int cpu);
struct microcode_ops *init_amd_microcode(void);
void exit_amd_microcode(void);
#else /* CONFIG_CPU_SUP_AMD */
-static inline void load_ucode_amd_bsp(unsigned int family) { }
+static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { }
static inline void load_ucode_amd_ap(unsigned int family) { }
-static inline void load_ucode_amd_early(unsigned int family) { }
static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
static inline void reload_ucode_amd(unsigned int cpu) { }
static inline struct microcode_ops *init_amd_microcode(void) { return NULL; }
@@ -115,15 +114,13 @@ static inline void exit_amd_microcode(void) { }
#endif /* !CONFIG_CPU_SUP_AMD */
#ifdef CONFIG_CPU_SUP_INTEL
-void load_ucode_intel_bsp(void);
+void load_ucode_intel_bsp(struct early_load_data *ed);
void load_ucode_intel_ap(void);
-int save_microcode_in_initrd_intel(void);
void reload_ucode_intel(void);
struct microcode_ops *init_intel_microcode(void);
#else /* CONFIG_CPU_SUP_INTEL */
-static inline void load_ucode_intel_bsp(void) { }
+static inline void load_ucode_intel_bsp(struct early_load_data *ed) { }
static inline void load_ucode_intel_ap(void) { }
-static inline int save_microcode_in_initrd_intel(void) { return -EINVAL; }
static inline void reload_ucode_intel(void) { }
static inline struct microcode_ops *init_intel_microcode(void) { return NULL; }
#endif /* !CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e6bba12c7..01fa06dd0 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -262,11 +262,14 @@ static uint32_t __init ms_hyperv_platform(void)
static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
{
static atomic_t nmi_cpu = ATOMIC_INIT(-1);
+ unsigned int old_cpu, this_cpu;
if (!unknown_nmi_panic)
return NMI_DONE;
- if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1)
+ old_cpu = -1;
+ this_cpu = raw_smp_processor_id();
+ if (!atomic_try_cmpxchg(&nmi_cpu, &old_cpu, this_cpu))
return NMI_HANDLED;
return NMI_DONE;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 2d6aa5d2e..d3524778a 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -428,6 +428,10 @@ void __init mtrr_copy_map(void)
* from the x86_init.hyper.init_platform() hook. It can be called only once.
* The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR
* is cleared.
+ *
+ * @var: MTRR variable range array to use
+ * @num_var: length of the @var array
+ * @def_type: default caching type
*/
void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var,
mtrr_type def_type)
@@ -492,13 +496,15 @@ static u8 type_merge(u8 type, u8 new_type, u8 *uniform)
/**
* mtrr_type_lookup - look up memory type in MTRR
*
+ * @start: Begin of the physical address range
+ * @end: End of the physical address range
+ * @uniform: output argument:
+ * - 1: the returned MTRR type is valid for the whole region
+ * - 0: otherwise
+ *
* Return Values:
* MTRR_TYPE_(type) - The effective MTRR type for the region
* MTRR_TYPE_INVALID - MTRR is disabled
- *
- * Output Argument:
- * uniform - Set to 1 when the returned MTRR type is valid for the whole
- * region, set to 0 else.
*/
u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
{
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 31c0e68f6..e65fae636 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -20,13 +20,13 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
unsigned int cpu)
{
#ifdef CONFIG_SMP
- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+ seq_printf(m, "physical id\t: %d\n", c->topo.pkg_id);
seq_printf(m, "siblings\t: %d\n",
cpumask_weight(topology_core_cpumask(cpu)));
- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+ seq_printf(m, "core id\t\t: %d\n", c->topo.core_id);
seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
- seq_printf(m, "apicid\t\t: %d\n", c->apicid);
- seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
+ seq_printf(m, "apicid\t\t: %d\n", c->topo.apicid);
+ seq_printf(m, "initial apicid\t: %d\n", c->topo.initial_apicid);
#endif
}
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 030d3b409..d04371e85 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -152,6 +152,7 @@ static inline void cache_alloc_hsw_probe(void)
r->cache.cbm_len = 20;
r->cache.shareable_bits = 0xc0000;
r->cache.min_cbm_bits = 2;
+ r->cache.arch_has_sparse_bitmasks = false;
r->alloc_capable = true;
rdt_alloc_capable = true;
@@ -230,9 +231,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r)
static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- union cpuid_0x10_3_eax eax;
- union cpuid_0x10_x_edx edx;
- u32 ebx, ecx, subleaf;
+ u32 eax, ebx, ecx, edx, subleaf;
/*
* Query CPUID_Fn80000020_EDX_x01 for MBA and
@@ -240,9 +239,9 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
*/
subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1;
- cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full);
- hw_res->num_closid = edx.split.cos_max + 1;
- r->default_ctrl = MAX_MBA_BW_AMD;
+ cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
+ hw_res->num_closid = edx + 1;
+ r->default_ctrl = 1 << eax;
/* AMD does not use delay */
r->membw.delay_linear = false;
@@ -267,15 +266,18 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
union cpuid_0x10_1_eax eax;
+ union cpuid_0x10_x_ecx ecx;
union cpuid_0x10_x_edx edx;
- u32 ebx, ecx;
+ u32 ebx;
- cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full);
+ cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
r->cache.cbm_len = eax.split.cbm_len + 1;
r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
r->cache.shareable_bits = ebx & r->default_ctrl;
r->data_width = (r->cache.cbm_len + 3) / 4;
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
r->alloc_capable = true;
}
@@ -872,7 +874,6 @@ static __init void rdt_init_res_defs_intel(void)
if (r->rid == RDT_RESOURCE_L3 ||
r->rid == RDT_RESOURCE_L2) {
- r->cache.arch_has_sparse_bitmaps = false;
r->cache.arch_has_per_cpu_cfg = false;
r->cache.min_cbm_bits = 1;
} else if (r->rid == RDT_RESOURCE_MBA) {
@@ -892,7 +893,7 @@ static __init void rdt_init_res_defs_amd(void)
if (r->rid == RDT_RESOURCE_L3 ||
r->rid == RDT_RESOURCE_L2) {
- r->cache.arch_has_sparse_bitmaps = true;
+ r->cache.arch_has_sparse_bitmasks = true;
r->cache.arch_has_per_cpu_cfg = true;
r->cache.min_cbm_bits = 0;
} else if (r->rid == RDT_RESOURCE_MBA) {
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index b44c48772..beccb0e87 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -87,10 +87,12 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
/*
* Check whether a cache bit mask is valid.
- * For Intel the SDM says:
- * Please note that all (and only) contiguous '1' combinations
- * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
- * Additionally Haswell requires at least two bits set.
+ * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID:
+ * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1
+ * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1
+ *
+ * Haswell does not support a non-contiguous 1s value and additionally
+ * requires at least two bits set.
* AMD allows non-contiguous bitmasks.
*/
static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
@@ -113,8 +115,8 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
first_bit = find_first_bit(&val, cbm_len);
zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
- /* Are non-contiguous bitmaps allowed? */
- if (!r->cache.arch_has_sparse_bitmaps &&
+ /* Are non-contiguous bitmasks allowed? */
+ if (!r->cache.arch_has_sparse_bitmasks &&
(find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
return false;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 85ceaf9a3..52e7e7dee 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -18,7 +18,6 @@
#define MBM_OVERFLOW_INTERVAL 1000
#define MAX_MBA_BW 100u
#define MBA_IS_LINEAR 0x4
-#define MAX_MBA_BW_AMD 0x800
#define MBM_CNTR_WIDTH_OFFSET_AMD 20
#define RMID_VAL_ERROR BIT_ULL(63)
@@ -59,6 +58,7 @@ struct rdt_fs_context {
bool enable_cdpl2;
bool enable_cdpl3;
bool enable_mba_mbps;
+ bool enable_debug;
};
static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
@@ -243,18 +243,17 @@ struct rdtgroup {
*/
#define RFTYPE_INFO BIT(0)
#define RFTYPE_BASE BIT(1)
-#define RF_CTRLSHIFT 4
-#define RF_MONSHIFT 5
-#define RF_TOPSHIFT 6
-#define RFTYPE_CTRL BIT(RF_CTRLSHIFT)
-#define RFTYPE_MON BIT(RF_MONSHIFT)
-#define RFTYPE_TOP BIT(RF_TOPSHIFT)
+#define RFTYPE_CTRL BIT(4)
+#define RFTYPE_MON BIT(5)
+#define RFTYPE_TOP BIT(6)
#define RFTYPE_RES_CACHE BIT(8)
#define RFTYPE_RES_MB BIT(9)
-#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
-#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
-#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP)
-#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
+#define RFTYPE_DEBUG BIT(10)
+#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
+#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
+#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP)
+#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
+#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON)
/* List of all resource groups */
extern struct list_head rdt_all_groups;
@@ -270,7 +269,7 @@ void __exit rdtgroup_exit(void);
* @mode: Access mode
* @kf_ops: File operations
* @flags: File specific RFTYPE_FLAGS_* flags
- * @fflags: File specific RF_* or RFTYPE_* flags
+ * @fflags: File specific RFTYPE_* flags
* @seq_show: Show content of the file
* @write: Write to the file
*/
@@ -296,14 +295,10 @@ struct rftype {
* struct mbm_state - status for each MBM counter in each domain
* @prev_bw_bytes: Previous bytes value read for bandwidth calculation
* @prev_bw: The most recent bandwidth in MBps
- * @delta_bw: Difference between the current and previous bandwidth
- * @delta_comp: Indicates whether to compute the delta_bw
*/
struct mbm_state {
u64 prev_bw_bytes;
u32 prev_bw;
- u32 delta_bw;
- bool delta_comp;
};
/**
@@ -395,6 +390,8 @@ struct rdt_parse_data {
* @msr_update: Function pointer to update QOS MSRs
* @mon_scale: cqm counter * mon_scale = occupancy in bytes
* @mbm_width: Monitor width, to detect and correct for overflow.
+ * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth
+ * Monitoring Event Configuration (BMEC) is supported.
* @cdp_enabled: CDP state of this resource
*
* Members of this structure are either private to the architecture
@@ -409,6 +406,7 @@ struct rdt_hw_resource {
struct rdt_resource *r);
unsigned int mon_scale;
unsigned int mbm_width;
+ unsigned int mbm_cfg_mask;
bool cdp_enabled;
};
@@ -492,6 +490,15 @@ union cpuid_0x10_3_eax {
unsigned int full;
};
+/* CPUID.(EAX=10H, ECX=ResID).ECX */
+union cpuid_0x10_x_ecx {
+ struct {
+ unsigned int reserved:3;
+ unsigned int noncont:1;
+ } split;
+ unsigned int full;
+};
+
/* CPUID.(EAX=10H, ECX=ResID).EDX */
union cpuid_0x10_x_edx {
struct {
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index f136ac046..3a6c06961 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -440,9 +440,6 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
cur_bw = bytes / SZ_1M;
- if (m->delta_comp)
- m->delta_bw = abs(cur_bw - m->prev_bw);
- m->delta_comp = false;
m->prev_bw = cur_bw;
}
@@ -520,11 +517,11 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
{
u32 closid, rmid, cur_msr_val, new_msr_val;
struct mbm_state *pmbm_data, *cmbm_data;
- u32 cur_bw, delta_bw, user_bw;
struct rdt_resource *r_mba;
struct rdt_domain *dom_mba;
struct list_head *head;
struct rdtgroup *entry;
+ u32 cur_bw, user_bw;
if (!is_mbm_local_enabled())
return;
@@ -543,7 +540,6 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
cur_bw = pmbm_data->prev_bw;
user_bw = dom_mba->mbps_val[closid];
- delta_bw = pmbm_data->delta_bw;
/* MBA resource doesn't support CDP */
cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
@@ -555,49 +551,31 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
list_for_each_entry(entry, head, mon.crdtgrp_list) {
cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
cur_bw += cmbm_data->prev_bw;
- delta_bw += cmbm_data->delta_bw;
}
/*
* Scale up/down the bandwidth linearly for the ctrl group. The
* bandwidth step is the bandwidth granularity specified by the
* hardware.
- *
- * The delta_bw is used when increasing the bandwidth so that we
- * dont alternately increase and decrease the control values
- * continuously.
- *
- * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
- * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
- * switching between 90 and 110 continuously if we only check
- * cur_bw < user_bw.
+ * Always increase throttling if current bandwidth is above the
+ * target set by user.
+ * But avoid thrashing up and down on every poll by checking
+ * whether a decrease in throttling is likely to push the group
+ * back over target. E.g. if currently throttling to 30% of bandwidth
+ * on a system with 10% granularity steps, check whether moving to
+ * 40% would go past the limit by multiplying current bandwidth by
+ * "(30 + 10) / 30".
*/
if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
} else if (cur_msr_val < MAX_MBA_BW &&
- (user_bw > (cur_bw + delta_bw))) {
+ (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
} else {
return;
}
resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
-
- /*
- * Delta values are updated dynamically package wise for each
- * rdtgrp every time the throttle MSR changes value.
- *
- * This is because (1)the increase in bandwidth is not perfectly
- * linear and only "approximately" linear even when the hardware
- * says it is linear.(2)Also since MBA is a core specific
- * mechanism, the delta values vary based on number of cores used
- * by the rdtgrp.
- */
- pmbm_data->delta_comp = true;
- list_for_each_entry(entry, head, mon.crdtgrp_list) {
- cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
- cmbm_data->delta_comp = true;
- }
}
static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
@@ -813,6 +791,12 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
return ret;
if (rdt_cpu_has(X86_FEATURE_BMEC)) {
+ u32 eax, ebx, ecx, edx;
+
+ /* Detect list of bandwidth sources that can be tracked */
+ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
+ hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
+
if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
mbm_total_event.configurable = true;
mbm_config_rftype_init("mbm_total_bytes_config");
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 725344048..2b69e560b 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -54,8 +54,13 @@ static struct kernfs_node *kn_mondata;
static struct seq_buf last_cmd_status;
static char last_cmd_status_buf[512];
+static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
+static void rdtgroup_destroy_root(void);
+
struct dentry *debugfs_resctrl;
+static bool resctrl_debug;
+
void rdt_last_cmd_clear(void)
{
lockdep_assert_held(&rdtgroup_mutex);
@@ -696,11 +701,10 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct rdtgroup *rdtgrp;
+ char *pid_str;
int ret = 0;
pid_t pid;
- if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
- return -EINVAL;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
rdtgroup_kn_unlock(of->kn);
@@ -715,7 +719,27 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
goto unlock;
}
- ret = rdtgroup_move_task(pid, rdtgrp, of);
+ while (buf && buf[0] != '\0' && buf[0] != '\n') {
+ pid_str = strim(strsep(&buf, ","));
+
+ if (kstrtoint(pid_str, 0, &pid)) {
+ rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
+ ret = -EINVAL;
+ break;
+ }
+
+ if (pid < 0) {
+ rdt_last_cmd_printf("Invalid pid %d\n", pid);
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = rdtgroup_move_task(pid, rdtgrp, of);
+ if (ret) {
+ rdt_last_cmd_printf("Error while processing task %d\n", pid);
+ break;
+ }
+ }
unlock:
rdtgroup_kn_unlock(of->kn);
@@ -755,6 +779,38 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of,
return ret;
}
+static int rdtgroup_closid_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp)
+ seq_printf(s, "%u\n", rdtgrp->closid);
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+static int rdtgroup_rmid_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp)
+ seq_printf(s, "%u\n", rdtgrp->mon.rmid);
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
#ifdef CONFIG_PROC_CPU_RESCTRL
/*
@@ -895,7 +951,7 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of,
return 0;
}
-/**
+/*
* rdt_bit_usage_show - Display current usage of resources
*
* A domain is a shared resource that can now be allocated differently. Here
@@ -1117,12 +1173,24 @@ static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
}
}
+static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct resctrl_schema *s = of->kn->parent->priv;
+ struct rdt_resource *r = s->res;
+
+ seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
+
+ return 0;
+}
+
/**
* __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
* @r: Resource to which domain instance @d belongs.
* @d: The domain instance for which @closid is being tested.
* @cbm: Capacity bitmask being tested.
* @closid: Intended closid for @cbm.
+ * @type: CDP type of @r.
* @exclusive: Only check if overlaps with exclusive resource groups
*
* Checks if provided @cbm intended to be used for @closid on domain
@@ -1209,6 +1277,7 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
/**
* rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
+ * @rdtgrp: Resource group identified through its closid.
*
* An exclusive resource group implies that there should be no sharing of
* its allocated resources. At the time this group is considered to be
@@ -1251,9 +1320,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
return true;
}
-/**
+/*
* rdtgroup_mode_write - Modify the resource group's mode
- *
*/
static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
@@ -1357,12 +1425,11 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
return size;
}
-/**
+/*
* rdtgroup_size_show - Display size in bytes of allocated regions
*
* The "size" file mirrors the layout of the "schemata" file, printing the
* size in bytes of each region instead of the capacity bitmask.
- *
*/
static int rdtgroup_size_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
@@ -1553,12 +1620,6 @@ static int mbm_config_write_domain(struct rdt_resource *r,
struct mon_config_info mon_info = {0};
int ret = 0;
- /* mon_config cannot be more than the supported set of events */
- if (val > MAX_EVT_CONFIG_BITS) {
- rdt_last_cmd_puts("Invalid event configuration\n");
- return -EINVAL;
- }
-
/*
* Read the current config value first. If both are the same then
* no need to write it again.
@@ -1596,6 +1657,7 @@ out:
static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
char *dom_str = NULL, *id_str;
unsigned long dom_id, val;
struct rdt_domain *d;
@@ -1619,6 +1681,13 @@ next:
return -EINVAL;
}
+ /* Value from user cannot be more than the supported set of events */
+ if ((val & hw_res->mbm_cfg_mask) != val) {
+ rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
+ hw_res->mbm_cfg_mask);
+ return -EINVAL;
+ }
+
list_for_each_entry(d, &r->domains, list) {
if (d->id == dom_id) {
ret = mbm_config_write_domain(r, d, evtid, val);
@@ -1686,77 +1755,77 @@ static struct rftype res_common_files[] = {
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_last_cmd_status_show,
- .fflags = RF_TOP_INFO,
+ .fflags = RFTYPE_TOP_INFO,
},
{
.name = "num_closids",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_num_closids_show,
- .fflags = RF_CTRL_INFO,
+ .fflags = RFTYPE_CTRL_INFO,
},
{
.name = "mon_features",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_mon_features_show,
- .fflags = RF_MON_INFO,
+ .fflags = RFTYPE_MON_INFO,
},
{
.name = "num_rmids",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_num_rmids_show,
- .fflags = RF_MON_INFO,
+ .fflags = RFTYPE_MON_INFO,
},
{
.name = "cbm_mask",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_default_ctrl_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
.name = "min_cbm_bits",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_min_cbm_bits_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
.name = "shareable_bits",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_shareable_bits_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
.name = "bit_usage",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_bit_usage_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
.name = "min_bandwidth",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_min_bw_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
},
{
.name = "bandwidth_gran",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_bw_gran_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
},
{
.name = "delay_linear",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdt_delay_linear_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
},
/*
* Platform specific which (if any) capabilities are provided by
@@ -1775,7 +1844,7 @@ static struct rftype res_common_files[] = {
.kf_ops = &rdtgroup_kf_single_ops,
.write = max_threshold_occ_write,
.seq_show = max_threshold_occ_show,
- .fflags = RF_MON_INFO | RFTYPE_RES_CACHE,
+ .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
},
{
.name = "mbm_total_bytes_config",
@@ -1817,12 +1886,19 @@ static struct rftype res_common_files[] = {
.fflags = RFTYPE_BASE,
},
{
+ .name = "mon_hw_id",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdtgroup_rmid_show,
+ .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG,
+ },
+ {
.name = "schemata",
.mode = 0644,
.kf_ops = &rdtgroup_kf_single_ops,
.write = rdtgroup_schemata_write,
.seq_show = rdtgroup_schemata_show,
- .fflags = RF_CTRL_BASE,
+ .fflags = RFTYPE_CTRL_BASE,
},
{
.name = "mode",
@@ -1830,14 +1906,28 @@ static struct rftype res_common_files[] = {
.kf_ops = &rdtgroup_kf_single_ops,
.write = rdtgroup_mode_write,
.seq_show = rdtgroup_mode_show,
- .fflags = RF_CTRL_BASE,
+ .fflags = RFTYPE_CTRL_BASE,
},
{
.name = "size",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
.seq_show = rdtgroup_size_show,
- .fflags = RF_CTRL_BASE,
+ .fflags = RFTYPE_CTRL_BASE,
+ },
+ {
+ .name = "sparse_masks",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_has_sparse_bitmasks_show,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "ctrl_hw_id",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdtgroup_closid_show,
+ .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
},
};
@@ -1852,6 +1942,9 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
lockdep_assert_held(&rdtgroup_mutex);
+ if (resctrl_debug)
+ fflags |= RFTYPE_DEBUG;
+
for (rft = rfts; rft < rfts + len; rft++) {
if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
ret = rdtgroup_add_file(kn, rft);
@@ -1894,7 +1987,7 @@ void __init thread_throttle_mode_init(void)
if (!rft)
return;
- rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB;
+ rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB;
}
void __init mbm_config_rftype_init(const char *config)
@@ -1903,7 +1996,7 @@ void __init mbm_config_rftype_init(const char *config)
rft = rdtgroup_get_rftype_by_name(config);
if (rft)
- rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE;
+ rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE;
}
/**
@@ -2038,21 +2131,21 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
if (IS_ERR(kn_info))
return PTR_ERR(kn_info);
- ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
+ ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
if (ret)
goto out_destroy;
/* loop over enabled controls, these are all alloc_capable */
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
- fflags = r->fflags | RF_CTRL_INFO;
+ fflags = r->fflags | RFTYPE_CTRL_INFO;
ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
if (ret)
goto out_destroy;
}
for_each_mon_capable_rdt_resource(r) {
- fflags = r->fflags | RF_MON_INFO;
+ fflags = r->fflags | RFTYPE_MON_INFO;
sprintf(name, "%s_MON", r->name);
ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
if (ret)
@@ -2271,14 +2364,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
return 0;
}
-static void cdp_disable_all(void)
-{
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
-}
-
/*
* We don't allow rdtgroup directories to be created anywhere
* except the root directory. Thus when looking for the rdtgroup
@@ -2358,19 +2443,47 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn,
struct rdtgroup *prgrp,
struct kernfs_node **mon_data_kn);
+static void rdt_disable_ctx(void)
+{
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
+ set_mba_sc(false);
+
+ resctrl_debug = false;
+}
+
static int rdt_enable_ctx(struct rdt_fs_context *ctx)
{
int ret = 0;
- if (ctx->enable_cdpl2)
+ if (ctx->enable_cdpl2) {
ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
+ if (ret)
+ goto out_done;
+ }
- if (!ret && ctx->enable_cdpl3)
+ if (ctx->enable_cdpl3) {
ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
+ if (ret)
+ goto out_cdpl2;
+ }
- if (!ret && ctx->enable_mba_mbps)
+ if (ctx->enable_mba_mbps) {
ret = set_mba_sc(true);
+ if (ret)
+ goto out_cdpl3;
+ }
+
+ if (ctx->enable_debug)
+ resctrl_debug = true;
+ return 0;
+
+out_cdpl3:
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
+out_cdpl2:
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
+out_done:
return ret;
}
@@ -2463,6 +2576,7 @@ static void schemata_list_destroy(void)
static int rdt_get_tree(struct fs_context *fc)
{
struct rdt_fs_context *ctx = rdt_fc2context(fc);
+ unsigned long flags = RFTYPE_CTRL_BASE;
struct rdt_domain *dom;
struct rdt_resource *r;
int ret;
@@ -2477,18 +2591,31 @@ static int rdt_get_tree(struct fs_context *fc)
goto out;
}
+ ret = rdtgroup_setup_root(ctx);
+ if (ret)
+ goto out;
+
ret = rdt_enable_ctx(ctx);
- if (ret < 0)
- goto out_cdp;
+ if (ret)
+ goto out_root;
ret = schemata_list_create();
if (ret) {
schemata_list_destroy();
- goto out_mba;
+ goto out_ctx;
}
closid_init();
+ if (rdt_mon_capable)
+ flags |= RFTYPE_MON;
+
+ ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
+ if (ret)
+ goto out_schemata_free;
+
+ kernfs_activate(rdtgroup_default.kn);
+
ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
if (ret < 0)
goto out_schemata_free;
@@ -2543,11 +2670,10 @@ out_info:
kernfs_remove(kn_info);
out_schemata_free:
schemata_list_destroy();
-out_mba:
- if (ctx->enable_mba_mbps)
- set_mba_sc(false);
-out_cdp:
- cdp_disable_all();
+out_ctx:
+ rdt_disable_ctx();
+out_root:
+ rdtgroup_destroy_root();
out:
rdt_last_cmd_clear();
mutex_unlock(&rdtgroup_mutex);
@@ -2559,6 +2685,7 @@ enum rdt_param {
Opt_cdp,
Opt_cdpl2,
Opt_mba_mbps,
+ Opt_debug,
nr__rdt_params
};
@@ -2566,6 +2693,7 @@ static const struct fs_parameter_spec rdt_fs_parameters[] = {
fsparam_flag("cdp", Opt_cdp),
fsparam_flag("cdpl2", Opt_cdpl2),
fsparam_flag("mba_MBps", Opt_mba_mbps),
+ fsparam_flag("debug", Opt_debug),
{}
};
@@ -2591,6 +2719,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
ctx->enable_mba_mbps = true;
return 0;
+ case Opt_debug:
+ ctx->enable_debug = true;
+ return 0;
}
return -EINVAL;
@@ -2618,7 +2749,6 @@ static int rdt_init_fs_context(struct fs_context *fc)
if (!ctx)
return -ENOMEM;
- ctx->kfc.root = rdt_root;
ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
fc->fs_private = &ctx->kfc;
fc->ops = &rdt_fs_context_ops;
@@ -2779,16 +2909,16 @@ static void rdt_kill_sb(struct super_block *sb)
cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
- set_mba_sc(false);
+ rdt_disable_ctx();
/*Put everything back to default values. */
for_each_alloc_capable_rdt_resource(r)
reset_all_ctrls(r);
- cdp_disable_all();
rmdir_all_sub();
rdt_pseudo_lock_release();
rdtgroup_default.mode = RDT_MODE_SHAREABLE;
schemata_list_destroy();
+ rdtgroup_destroy_root();
static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
static_branch_disable_cpuslocked(&rdt_mon_enable_key);
static_branch_disable_cpuslocked(&rdt_enable_key);
@@ -3170,8 +3300,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
enum rdt_group_type rtype, struct rdtgroup **r)
{
struct rdtgroup *prdtgrp, *rdtgrp;
+ unsigned long files = 0;
struct kernfs_node *kn;
- uint files = 0;
int ret;
prdtgrp = rdtgroup_kn_lock_live(parent_kn);
@@ -3223,7 +3353,14 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
goto out_destroy;
}
- files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
+ if (rtype == RDTCTRL_GROUP) {
+ files = RFTYPE_BASE | RFTYPE_CTRL;
+ if (rdt_mon_capable)
+ files |= RFTYPE_MON;
+ } else {
+ files = RFTYPE_BASE | RFTYPE_MON;
+ }
+
ret = rdtgroup_add_files(kn, files);
if (ret) {
rdt_last_cmd_puts("kernfs fill error\n");
@@ -3656,6 +3793,9 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl))
seq_puts(seq, ",mba_MBps");
+ if (resctrl_debug)
+ seq_puts(seq, ",debug");
+
return 0;
}
@@ -3666,10 +3806,8 @@ static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
.show_options = rdtgroup_show_options,
};
-static int __init rdtgroup_setup_root(void)
+static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
{
- int ret;
-
rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
KERNFS_ROOT_CREATE_DEACTIVATED |
KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
@@ -3677,6 +3815,20 @@ static int __init rdtgroup_setup_root(void)
if (IS_ERR(rdt_root))
return PTR_ERR(rdt_root);
+ ctx->kfc.root = rdt_root;
+ rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
+
+ return 0;
+}
+
+static void rdtgroup_destroy_root(void)
+{
+ kernfs_destroy_root(rdt_root);
+ rdtgroup_default.kn = NULL;
+}
+
+static void __init rdtgroup_setup_default(void)
+{
mutex_lock(&rdtgroup_mutex);
rdtgroup_default.closid = 0;
@@ -3686,19 +3838,7 @@ static int __init rdtgroup_setup_root(void)
list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
- ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE);
- if (ret) {
- kernfs_destroy_root(rdt_root);
- goto out;
- }
-
- rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
- kernfs_activate(rdtgroup_default.kn);
-
-out:
mutex_unlock(&rdtgroup_mutex);
-
- return ret;
}
static void domain_destroy_mon_state(struct rdt_domain *d)
@@ -3820,13 +3960,11 @@ int __init rdtgroup_init(void)
seq_buf_init(&last_cmd_status, last_cmd_status_buf,
sizeof(last_cmd_status_buf));
- ret = rdtgroup_setup_root();
- if (ret)
- return ret;
+ rdtgroup_setup_default();
ret = sysfs_create_mount_point(fs_kobj, "resctrl");
if (ret)
- goto cleanup_root;
+ return ret;
ret = register_filesystem(&rdt_fs_type);
if (ret)
@@ -3859,8 +3997,6 @@ int __init rdtgroup_init(void)
cleanup_mountpoint:
sysfs_remove_mount_point(fs_kobj, "resctrl");
-cleanup_root:
- kernfs_destroy_root(rdt_root);
return ret;
}
@@ -3870,5 +4006,4 @@ void __exit rdtgroup_exit(void)
debugfs_remove_recursive(debugfs_resctrl);
unregister_filesystem(&rdt_fs_type);
sysfs_remove_mount_point(fs_kobj, "resctrl");
- kernfs_destroy_root(rdt_root);
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 0dad49a09..0ebca40df 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -28,6 +28,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
{ X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
+ { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 },
{ X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
{ X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
{ X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 5d390df21..b65ab214b 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -581,7 +581,7 @@ err_out:
*
* Flush any outstanding enqueued EADD operations and perform EINIT. The
* Launch Enclave Public Key Hash MSRs are rewritten as necessary to match
- * the enclave's MRSIGNER, which is caculated from the provided sigstruct.
+ * the enclave's MRSIGNER, which is calculated from the provided sigstruct.
*
* Return:
* - 0: Success.
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 0270925fe..dc1367035 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -78,7 +78,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c)
/*
* initial apic id, which also represents 32-bit extended x2apic id.
*/
- c->initial_apicid = edx;
+ c->topo.initial_apicid = edx;
smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
#endif
return 0;
@@ -108,7 +108,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
* Populate HT related information from sub-leaf level 0.
*/
cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
- c->initial_apicid = edx;
+ c->topo.initial_apicid = edx;
core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
@@ -146,20 +146,19 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
die_select_mask = (~(-1 << die_plus_mask_width)) >>
core_plus_mask_width;
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid,
+ c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid,
ht_mask_width) & core_select_mask;
if (die_level_present) {
- c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid,
+ c->topo.die_id = apic->phys_pkg_id(c->topo.initial_apicid,
core_plus_mask_width) & die_select_mask;
}
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid,
- pkg_mask_width);
+ c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, pkg_mask_width);
/*
* Reinit the apicid, now that we have extended initial_apicid.
*/
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+ c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
c->x86_max_cores = (core_level_siblings / smp_num_siblings);
__max_die_per_package = (die_level_siblings / core_level_siblings);
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index 05fa4ef63..415564a65 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -65,20 +65,6 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
}
-
- if (c->cpuid_level >= 0x00000001) {
- u32 eax, ebx, ecx, edx;
-
- cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
- /*
- * If HTT (EDX[28]) is set EBX[16:23] contain the number of
- * apicids which are reserved per package. Store the resulting
- * shift value for the package management code.
- */
- if (edx & (1U << 28))
- c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
- }
-
}
static void init_zhaoxin(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c92d88680..b6b044356 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -170,7 +170,7 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem)
int ret = 0;
/* Exclude the low 1M because it is always reserved */
- ret = crash_exclude_mem_range(cmem, 0, (1<<20)-1);
+ ret = crash_exclude_mem_range(cmem, 0, SZ_1M - 1);
if (ret)
return ret;
@@ -198,8 +198,8 @@ static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
}
/* Prepare elf headers. Return addr and size */
-static int prepare_elf_headers(struct kimage *image, void **addr,
- unsigned long *sz, unsigned long *nr_mem_ranges)
+static int prepare_elf_headers(void **addr, unsigned long *sz,
+ unsigned long *nr_mem_ranges)
{
struct crash_mem *cmem;
int ret;
@@ -221,7 +221,7 @@ static int prepare_elf_headers(struct kimage *image, void **addr,
*nr_mem_ranges = cmem->nr_ranges;
/* By default prepare 64bit headers */
- ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
+ ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
out:
vfree(cmem);
@@ -349,7 +349,7 @@ int crash_load_segments(struct kimage *image)
.buf_max = ULONG_MAX, .top_down = false };
/* Prepare elf headers and add a segment */
- ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz, &pnum);
+ ret = prepare_elf_headers(&kbuf.buffer, &kbuf.bufsz, &pnum);
if (ret)
return ret;
@@ -386,8 +386,8 @@ int crash_load_segments(struct kimage *image)
if (ret)
return ret;
image->elf_load_addr = kbuf.mem;
- pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
+ kexec_dprintk("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
return ret;
}
@@ -452,7 +452,7 @@ void arch_crash_handle_hotplug_event(struct kimage *image)
* Create the new elfcorehdr reflecting the changes to CPU and/or
* memory resources.
*/
- if (prepare_elf_headers(image, &elfbuf, &elfsz, &nr_mem_ranges)) {
+ if (prepare_elf_headers(&elfbuf, &elfsz, &nr_mem_ranges)) {
pr_err("unable to create new elfcorehdr");
goto out;
}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 87d38f17f..afd099240 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -278,7 +278,7 @@ static void __init dtb_apic_setup(void)
}
#ifdef CONFIG_OF_EARLY_FLATTREE
-static void __init x86_flattree_get_config(void)
+void __init x86_flattree_get_config(void)
{
u32 size, map_len;
void *dt;
@@ -300,14 +300,10 @@ static void __init x86_flattree_get_config(void)
unflatten_and_copy_device_tree();
early_memunmap(dt, map_len);
}
-#else
-static inline void x86_flattree_get_config(void) { }
#endif
void __init x86_dtb_init(void)
{
- x86_flattree_get_config();
-
if (!of_have_populated_dt())
return;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index fb8cf9533..b66f540de 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1017,10 +1017,12 @@ void __init e820__reserve_setup_data(void)
e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
/*
- * SETUP_EFI and SETUP_IMA are supplied by kexec and do not need
- * to be reserved.
+ * SETUP_EFI, SETUP_IMA and SETUP_RNG_SEED are supplied by
+ * kexec and do not need to be reserved.
*/
- if (data->type != SETUP_EFI && data->type != SETUP_IMA)
+ if (data->type != SETUP_EFI &&
+ data->type != SETUP_IMA &&
+ data->type != SETUP_RNG_SEED)
e820__range_update_kexec(pa_data,
sizeof(*data) + data->len,
E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a6c1867fc..59f4aefc6 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -779,13 +779,13 @@ static int __init check_dev_quirk(int num, int slot, int func)
type = read_pci_config_byte(num, slot, func,
PCI_HEADER_TYPE);
- if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) {
+ if ((type & PCI_HEADER_TYPE_MASK) == PCI_HEADER_TYPE_BRIDGE) {
sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS);
if (sec > num)
early_pci_scan_bus(sec);
}
- if (!(type & 0x80))
+ if (!(type & PCI_HEADER_TYPE_MFD))
return -1;
return 0;
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
index e963344b0..53935b4d6 100644
--- a/arch/x86/kernel/eisa.c
+++ b/arch/x86/kernel/eisa.c
@@ -2,6 +2,7 @@
/*
* EISA specific code
*/
+#include <linux/cc_platform.h>
#include <linux/ioport.h>
#include <linux/eisa.h>
#include <linux/io.h>
@@ -12,7 +13,7 @@ static __init int eisa_bus_probe(void)
{
void __iomem *p;
- if (xen_pv_domain() && !xen_initial_domain())
+ if ((xen_pv_domain() && !xen_initial_domain()) || cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
return 0;
p = ioremap(0x0FFFD9, 4);
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
index 794e70151..a06b876bb 100644
--- a/arch/x86/kernel/fpu/bugs.c
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -2,6 +2,7 @@
/*
* x86 FPU bug checks:
*/
+#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
/*
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index a21a4d0ec..520deb411 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -308,7 +308,7 @@ EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);
* Must be invoked from KVM after a VMEXIT before enabling interrupts when
* XFD write emulation is disabled. This is required because the guest can
* freely modify XFD and the state at VMEXIT is not guaranteed to be the
- * same as the state on VMENTER. So software state has to be udpated before
+ * same as the state on VMENTER. So software state has to be updated before
* any operation which depends on it can take place.
*
* Note: It can be invoked unconditionally even when write emulation is
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 558076dbd..247f2225a 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -274,12 +274,13 @@ static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures,
* Attempt to restore the FPU registers directly from user memory.
* Pagefaults are handled and any errors returned are fatal.
*/
-static bool restore_fpregs_from_user(void __user *buf, u64 xrestore,
- bool fx_only, unsigned int size)
+static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only)
{
struct fpu *fpu = &current->thread.fpu;
int ret;
+ /* Restore enabled features only. */
+ xrestore &= fpu->fpstate->user_xfeatures;
retry:
fpregs_lock();
/* Ensure that XFD is up to date */
@@ -309,7 +310,7 @@ retry:
if (ret != X86_TRAP_PF)
return false;
- if (!fault_in_readable(buf, size))
+ if (!fault_in_readable(buf, fpu->fpstate->user_size))
goto retry;
return false;
}
@@ -339,7 +340,6 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
struct user_i387_ia32_struct env;
bool success, fx_only = false;
union fpregs_state *fpregs;
- unsigned int state_size;
u64 user_xfeatures = 0;
if (use_xsave()) {
@@ -349,17 +349,14 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
return false;
fx_only = !fx_sw_user.magic1;
- state_size = fx_sw_user.xstate_size;
user_xfeatures = fx_sw_user.xfeatures;
} else {
user_xfeatures = XFEATURE_MASK_FPSSE;
- state_size = fpu->fpstate->user_size;
}
if (likely(!ia32_fxstate)) {
/* Restore the FPU registers directly from user memory. */
- return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only,
- state_size);
+ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only);
}
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index ef6906107..33a214b1a 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -178,10 +178,11 @@ void fpu__init_cpu_xstate(void)
* Must happen after CR4 setup and before xsetbv() to allow KVM
* lazy passthrough. Write independent of the dynamic state static
* key as that does not work on the boot CPU. This also ensures
- * that any stale state is wiped out from XFD.
+ * that any stale state is wiped out from XFD. Reset the per CPU
+ * xfd cache too.
*/
if (cpu_feature_enabled(X86_FEATURE_XFD))
- wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
+ xfd_set_state(init_fpstate.xfd);
/*
* XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
@@ -1736,7 +1737,6 @@ EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
/**
* fpu_xstate_prctl - xstate permission operations
- * @tsk: Redundant pointer to current
* @option: A subfunction of arch_prctl()
* @arg2: option argument
* Return: 0 if successful; otherwise, an error code
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 3518fb26d..19ca623ff 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -148,20 +148,26 @@ static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rs
#endif
#ifdef CONFIG_X86_64
+static inline void xfd_set_state(u64 xfd)
+{
+ wrmsrl(MSR_IA32_XFD, xfd);
+ __this_cpu_write(xfd_state, xfd);
+}
+
static inline void xfd_update_state(struct fpstate *fpstate)
{
if (fpu_state_size_dynamic()) {
u64 xfd = fpstate->xfd;
- if (__this_cpu_read(xfd_state) != xfd) {
- wrmsrl(MSR_IA32_XFD, xfd);
- __this_cpu_write(xfd_state, xfd);
- }
+ if (__this_cpu_read(xfd_state) != xfd)
+ xfd_set_state(xfd);
}
}
extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu);
#else
+static inline void xfd_set_state(u64 xfd) { }
+
static inline void xfd_update_state(struct fpstate *fpstate) { }
static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) {
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index 24c1175a4..58d9ed50f 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -3,10 +3,10 @@
* Copyright (C) 2017 Steven Rostedt, VMware Inc.
*/
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/page_types.h>
#include <asm/segment.h>
-#include <asm/export.h>
#include <asm/ftrace.h>
#include <asm/nospec-branch.h>
#include <asm/frame.h>
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 945cfa5f7..214f30e9f 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -3,12 +3,12 @@
* Copyright (C) 2014 Steven Rostedt, Red Hat Inc
*/
+#include <linux/export.h>
#include <linux/cfi_types.h>
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/ptrace.h>
#include <asm/ftrace.h>
-#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
#include <asm/frame.h>
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 246a609f8..de001b214 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -19,6 +19,7 @@
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/bios_ebda.h>
+#include <asm/microcode.h>
#include <asm/tlbflush.h>
#include <asm/bootparam_utils.h>
@@ -29,11 +30,33 @@ static void __init i386_default_early_setup(void)
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
}
+#ifdef CONFIG_MICROCODE_INITRD32
+unsigned long __initdata initrd_start_early;
+static pte_t __initdata *initrd_pl2p_start, *initrd_pl2p_end;
+
+static void zap_early_initrd_mapping(void)
+{
+ pte_t *pl2p = initrd_pl2p_start;
+
+ for (; pl2p < initrd_pl2p_end; pl2p++) {
+ *pl2p = (pte_t){ .pte = 0 };
+
+ if (!IS_ENABLED(CONFIG_X86_PAE))
+ *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = (pte_t) {.pte = 0};
+ }
+}
+#else
+static inline void zap_early_initrd_mapping(void) { }
+#endif
+
asmlinkage __visible void __init __noreturn i386_start_kernel(void)
{
/* Make sure IDT is set up before any exception happens */
idt_setup_early_handler();
+ load_ucode_bsp();
+ zap_early_initrd_mapping();
+
cr4_init_shadow();
sanitize_boot_params(&boot_params);
@@ -69,52 +92,83 @@ asmlinkage __visible void __init __noreturn i386_start_kernel(void)
* to the first kernel PMD. Note the upper half of each PMD or PTE are
* always zero at this stage.
*/
-void __init mk_early_pgtbl_32(void);
-void __init mk_early_pgtbl_32(void)
-{
-#ifdef __pa
-#undef __pa
-#endif
-#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET)
- pte_t pte, *ptep;
- int i;
- unsigned long *ptr;
- /* Enough space to fit pagetables for the low memory linear map */
- const unsigned long limit = __pa(_end) +
- (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT);
#ifdef CONFIG_X86_PAE
- pmd_t pl2, *pl2p = (pmd_t *)__pa(initial_pg_pmd);
-#define SET_PL2(pl2, val) { (pl2).pmd = (val); }
+typedef pmd_t pl2_t;
+#define pl2_base initial_pg_pmd
+#define SET_PL2(val) { .pmd = (val), }
#else
- pgd_t pl2, *pl2p = (pgd_t *)__pa(initial_page_table);
-#define SET_PL2(pl2, val) { (pl2).pgd = (val); }
+typedef pgd_t pl2_t;
+#define pl2_base initial_page_table
+#define SET_PL2(val) { .pgd = (val), }
#endif
- ptep = (pte_t *)__pa(__brk_base);
- pte.pte = PTE_IDENT_ATTR;
-
+static __init __no_stack_protector pte_t init_map(pte_t pte, pte_t **ptep, pl2_t **pl2p,
+ const unsigned long limit)
+{
while ((pte.pte & PTE_PFN_MASK) < limit) {
+ pl2_t pl2 = SET_PL2((unsigned long)*ptep | PDE_IDENT_ATTR);
+ int i;
+
+ **pl2p = pl2;
+ if (!IS_ENABLED(CONFIG_X86_PAE)) {
+ /* Kernel PDE entry */
+ *(*pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2;
+ }
- SET_PL2(pl2, (unsigned long)ptep | PDE_IDENT_ATTR);
- *pl2p = pl2;
-#ifndef CONFIG_X86_PAE
- /* Kernel PDE entry */
- *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2;
-#endif
for (i = 0; i < PTRS_PER_PTE; i++) {
- *ptep = pte;
+ **ptep = pte;
pte.pte += PAGE_SIZE;
- ptep++;
+ (*ptep)++;
}
-
- pl2p++;
+ (*pl2p)++;
}
+ return pte;
+}
+
+void __init __no_stack_protector mk_early_pgtbl_32(void)
+{
+ /* Enough space to fit pagetables for the low memory linear map */
+ unsigned long limit = __pa_nodebug(_end) + (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT);
+ pte_t pte, *ptep = (pte_t *)__pa_nodebug(__brk_base);
+ struct boot_params __maybe_unused *params;
+ pl2_t *pl2p = (pl2_t *)__pa_nodebug(pl2_base);
+ unsigned long *ptr;
+
+ pte.pte = PTE_IDENT_ATTR;
+ pte = init_map(pte, &ptep, &pl2p, limit);
- ptr = (unsigned long *)__pa(&max_pfn_mapped);
+ ptr = (unsigned long *)__pa_nodebug(&max_pfn_mapped);
/* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */
*ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
- ptr = (unsigned long *)__pa(&_brk_end);
+ ptr = (unsigned long *)__pa_nodebug(&_brk_end);
*ptr = (unsigned long)ptep + PAGE_OFFSET;
-}
+#ifdef CONFIG_MICROCODE_INITRD32
+ /* Running on a hypervisor? */
+ if (native_cpuid_ecx(1) & BIT(31))
+ return;
+
+ params = (struct boot_params *)__pa_nodebug(&boot_params);
+ if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image)
+ return;
+
+ /* Save the virtual start address */
+ ptr = (unsigned long *)__pa_nodebug(&initrd_start_early);
+ *ptr = (pte.pte & PTE_PFN_MASK) + PAGE_OFFSET;
+ *ptr += ((unsigned long)params->hdr.ramdisk_image) & ~PAGE_MASK;
+
+ /* Save PLP2 for cleanup */
+ ptr = (unsigned long *)__pa_nodebug(&initrd_pl2p_start);
+ *ptr = (unsigned long)pl2p + PAGE_OFFSET;
+
+ limit = (unsigned long)params->hdr.ramdisk_image;
+ pte.pte = PTE_IDENT_ATTR | PFN_ALIGN(limit);
+ limit = (unsigned long)params->hdr.ramdisk_image + params->hdr.ramdisk_size;
+
+ init_map(pte, &ptep, &pl2p, limit);
+
+ ptr = (unsigned long *)__pa_nodebug(&initrd_pl2p_end);
+ *ptr = (unsigned long)pl2p + PAGE_OFFSET;
+#endif
+}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index bbc21798d..dc0956067 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -41,6 +41,7 @@
#include <asm/trapnr.h>
#include <asm/sev.h>
#include <asm/tdx.h>
+#include <asm/init.h>
/*
* Manage page tables very early on.
@@ -69,23 +70,21 @@ EXPORT_SYMBOL(vmemmap_base);
/*
* GDT used on the boot CPU before switching to virtual addresses.
*/
-static struct desc_struct startup_gdt[GDT_ENTRIES] = {
- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = {
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
};
/*
* Address needs to be set at runtime because it references the startup_gdt
* while the kernel still uses a direct mapping.
*/
-static struct desc_ptr startup_gdt_descr = {
+static struct desc_ptr startup_gdt_descr __initdata = {
.size = sizeof(startup_gdt)-1,
.address = 0,
};
-#define __head __section(".head.text")
-
static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
{
return ptr - (void *)_text + (void *)physaddr;
@@ -211,7 +210,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
/* Fixup the physical addresses in the page table */
- pgd = fixup_pointer(&early_top_pgt, physaddr);
+ pgd = fixup_pointer(early_top_pgt, physaddr);
p = pgd + pgd_index(__START_KERNEL_map);
if (la57)
*p = (unsigned long)level4_kernel_pgt;
@@ -220,11 +219,11 @@ unsigned long __head __startup_64(unsigned long physaddr,
*p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
if (la57) {
- p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
+ p4d = fixup_pointer(level4_kernel_pgt, physaddr);
p4d[511] += load_delta;
}
- pud = fixup_pointer(&level3_kernel_pgt, physaddr);
+ pud = fixup_pointer(level3_kernel_pgt, physaddr);
pud[510] += load_delta;
pud[511] += load_delta;
@@ -588,7 +587,7 @@ static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler)
}
/* This runs while still in the direct mapping */
-static void startup_64_load_idt(unsigned long physbase)
+static void __head startup_64_load_idt(unsigned long physbase)
{
struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase);
gate_desc *idt = fixup_pointer(bringup_idt_table, physbase);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c9318993f..487ac57e2 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -8,6 +8,7 @@
*/
.text
+#include <linux/export.h>
#include <linux/threads.h>
#include <linux/init.h>
#include <linux/linkage.h>
@@ -25,7 +26,6 @@
#include <asm/nops.h>
#include <asm/nospec-branch.h>
#include <asm/bootparam.h>
-#include <asm/export.h>
#include <asm/pgtable_32.h>
/* Physical address */
@@ -118,11 +118,6 @@ SYM_CODE_START(startup_32)
movl %eax, pa(olpc_ofw_pgd)
#endif
-#ifdef CONFIG_MICROCODE
- /* Early load ucode on BSP. */
- call load_ucode_bsp
-#endif
-
/* Create early pagetables. */
call mk_early_pgtbl_32
@@ -157,11 +152,6 @@ SYM_FUNC_START(startup_32_smp)
movl %eax,%ss
leal -__PAGE_OFFSET(%ecx),%esp
-#ifdef CONFIG_MICROCODE
- /* Early load ucode on AP. */
- call load_ucode_ap
-#endif
-
.Ldefault_entry:
movl $(CR0_STATE & ~X86_CR0_PG),%eax
movl %eax,%cr0
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e6eaee850..d4918d03e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -9,7 +9,7 @@
* Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
*/
-
+#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/threads.h>
#include <linux/init.h>
@@ -22,7 +22,6 @@
#include <asm/percpu.h>
#include <asm/nops.h>
#include "../entry/calling.h"
-#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/apicdef.h>
#include <asm/fixmap.h>
@@ -115,6 +114,28 @@ SYM_CODE_START_NOALIGN(startup_64)
/* Form the CR3 value being sure to include the CR3 modifier */
addq $(early_top_pgt - __START_KERNEL_map), %rax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ mov %rax, %rdi
+ mov %rax, %r14
+
+ addq phys_base(%rip), %rdi
+
+ /*
+ * For SEV guests: Verify that the C-bit is correct. A malicious
+ * hypervisor could lie about the C-bit position to perform a ROP
+ * attack on the guest by writing to the unencrypted stack and wait for
+ * the next RET instruction.
+ */
+ call sev_verify_cbit
+
+ /*
+ * Restore CR3 value without the phys_base which will be added
+ * below, before writing %cr3.
+ */
+ mov %r14, %rax
+#endif
+
jmp 1f
SYM_CODE_END(startup_64)
@@ -180,10 +201,10 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
movl $0, %ecx
#endif
- /* Enable PAE mode, PGE and LA57 */
- orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ /* Enable PAE mode, PSE, PGE and LA57 */
+ orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
- testl $1, __pgtable_l5_enabled(%rip)
+ testb $1, __pgtable_l5_enabled(%rip)
jz 1f
orl $X86_CR4_LA57, %ecx
1:
@@ -194,21 +215,12 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
addq phys_base(%rip), %rax
/*
- * For SEV guests: Verify that the C-bit is correct. A malicious
- * hypervisor could lie about the C-bit position to perform a ROP
- * attack on the guest by writing to the unencrypted stack and wait for
- * the next RET instruction.
- */
- movq %rax, %rdi
- call sev_verify_cbit
-
- /*
* Switch to new page-table
*
* For the boot CPU this switches to early_top_pgt which still has the
- * indentity mappings present. The secondary CPUs will switch to the
+ * identity mappings present. The secondary CPUs will switch to the
* init_top_pgt here, away from the trampoline_pgd and unmap the
- * indentity mapped ranges.
+ * identity mapped ranges.
*/
movq %rax, %cr3
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 046bc9d57..a38d0c93a 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -52,7 +52,7 @@ unsigned long hpet_address;
u8 hpet_blockid; /* OS timer block num */
bool hpet_msi_disable;
-#ifdef CONFIG_GENERIC_MSI_IRQ
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ)
static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel);
static struct irq_domain *hpet_domain;
#endif
@@ -469,7 +469,7 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc)
/*
* HPET MSI Support
*/
-#ifdef CONFIG_GENERIC_MSI_IRQ
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ)
static void hpet_msi_unmask(struct irq_data *data)
{
struct hpet_channel *hc = irq_data_get_irq_handler_data(data);
@@ -707,7 +707,7 @@ static void __init hpet_select_clockevents(void)
hpet_base.nr_clockevents = 0;
- /* No point if MSI is disabled or CPU has an Always Runing APIC Timer */
+ /* No point if MSI is disabled or CPU has an Always Running APIC Timer */
if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT))
return;
@@ -965,7 +965,7 @@ static bool __init mwait_pc10_supported(void)
* and per CPU timer interrupts.
*
* The probability that this problem is going to be solved in the
- * forseeable future is close to zero, so the kernel has to be cluttered
+ * foreseeable future is close to zero, so the kernel has to be cluttered
* with heuristics to keep up with the ever growing amount of hardware and
* firmware trainwrecks. Hopefully some day hardware people will understand
* that the approach of "This can be fixed in software" is not sustainable.
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index fc77a9604..660b601f1 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -10,6 +10,7 @@
#include <asm/proto.h>
#include <asm/desc.h>
#include <asm/hw_irq.h>
+#include <asm/ia32.h>
#include <asm/idtentry.h>
#define DPL0 0x0
@@ -116,6 +117,9 @@ static const __initconst struct idt_data def_idts[] = {
#endif
SYSG(X86_TRAP_OF, asm_exc_overflow),
+};
+
+static const struct idt_data ia32_idt[] __initconst = {
#if defined(CONFIG_IA32_EMULATION)
SYSG(IA32_SYSCALL_VECTOR, asm_int80_emulation),
#elif defined(CONFIG_X86_32)
@@ -225,6 +229,9 @@ void __init idt_setup_early_traps(void)
void __init idt_setup_traps(void)
{
idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
+
+ if (ia32_enabled())
+ idt_setup_from_table(idt_table, ia32_idt, ARRAY_SIZE(ia32_idt), true);
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index aaf9e776f..7f542a779 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/asm.h>
-#include <asm/export.h>
+#include <linux/export.h>
#include <linux/linkage.h>
/*
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index ee4fe8cdb..9a7c03d47 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -74,7 +74,6 @@ static struct ctl_table itmt_kern_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- {}
};
static struct ctl_table_header *itmt_sysctl_header;
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index a61c12c01..2a422e00e 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -82,7 +82,7 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params,
cmdline_ptr[cmdline_len - 1] = '\0';
- pr_debug("Final command line is: %s\n", cmdline_ptr);
+ kexec_dprintk("Final command line is: %s\n", cmdline_ptr);
cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL;
cmdline_ext_32 = cmdline_ptr_phys >> 32;
@@ -272,7 +272,12 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
nr_e820_entries = params->e820_entries;
+ kexec_dprintk("E820 memmap:\n");
for (i = 0; i < nr_e820_entries; i++) {
+ kexec_dprintk("%016llx-%016llx (%d)\n",
+ params->e820_table[i].addr,
+ params->e820_table[i].addr + params->e820_table[i].size - 1,
+ params->e820_table[i].type);
if (params->e820_table[i].type != E820_TYPE_RAM)
continue;
start = params->e820_table[i].addr;
@@ -424,7 +429,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
* command line. Make sure it does not overflow
*/
if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
- pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
+ pr_err("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
return ERR_PTR(-EINVAL);
}
@@ -445,7 +450,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
return ERR_PTR(ret);
}
- pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
+ kexec_dprintk("Loaded purgatory at 0x%lx\n", pbuf.mem);
/*
@@ -490,8 +495,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
if (ret)
goto out_free_params;
bootparam_load_addr = kbuf.mem;
- pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- bootparam_load_addr, kbuf.bufsz, kbuf.bufsz);
+ kexec_dprintk("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ bootparam_load_addr, kbuf.bufsz, kbuf.memsz);
/* Load kernel */
kbuf.buffer = kernel + kern16_size;
@@ -505,8 +510,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
goto out_free_params;
kernel_load_addr = kbuf.mem;
- pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- kernel_load_addr, kbuf.bufsz, kbuf.memsz);
+ kexec_dprintk("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kernel_load_addr, kbuf.bufsz, kbuf.memsz);
/* Load initrd high */
if (initrd) {
@@ -520,8 +525,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
goto out_free_params;
initrd_load_addr = kbuf.mem;
- pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- initrd_load_addr, initrd_len, initrd_len);
+ kexec_dprintk("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ initrd_load_addr, initrd_len, initrd_len);
setup_initrd(params, initrd_load_addr, initrd_len);
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index a0ce46c0a..a6a3475e1 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -335,7 +335,16 @@ out:
kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset,
bool *on_func_entry)
{
- if (is_endbr(*(u32 *)addr)) {
+ u32 insn;
+
+ /*
+ * Since 'addr' is not guaranteed to be safe to access, use
+ * copy_from_kernel_nofault() to read the instruction:
+ */
+ if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32)))
+ return NULL;
+
+ if (is_endbr(insn)) {
*on_func_entry = !offset || offset == 4;
if (*on_func_entry)
offset = 4;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b8ab9ee58..428ee7400 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -434,7 +434,8 @@ static void __init sev_map_percpu_data(void)
{
int cpu;
- if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ if (cc_vendor != CC_VENDOR_AMD ||
+ !cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
for_each_possible_cpu(cpu) {
@@ -500,13 +501,13 @@ static bool pv_sched_yield_supported(void)
static void __send_ipi_mask(const struct cpumask *mask, int vector)
{
unsigned long flags;
- int cpu, apic_id, icr;
- int min = 0, max = 0;
+ int cpu, min = 0, max = 0;
#ifdef CONFIG_X86_64
__uint128_t ipi_bitmap = 0;
#else
u64 ipi_bitmap = 0;
#endif
+ u32 apic_id, icr;
long ret;
if (cpumask_empty(mask))
@@ -803,8 +804,8 @@ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
"setne %al\n\t"
-DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted,
- PV_VCPU_PREEMPTED_ASM, .text);
+DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted,
+ PV_VCPU_PREEMPTED_ASM, .text);
#endif
static void __init kvm_guest_init(void)
@@ -942,7 +943,7 @@ static void __init kvm_init_platform(void)
* Reset the host's shared pages list related to kernel
* specific page encryption status settings before we load a
* new kernel by kexec. Reset the page encryption status
- * during early boot intead of just before kexec to avoid SMP
+ * during early boot instead of just before kexec to avoid SMP
* races during kvm_pv_guest_cpu_reboot().
* NOTE: We cannot reset the complete shared pages list
* here as we need to retain the UEFI/OVMF firmware
@@ -1028,8 +1029,8 @@ arch_initcall(activate_jump_labels);
/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
static void kvm_kick_cpu(int cpu)
{
- int apicid;
unsigned long flags = 0;
+ u32 apicid;
apicid = per_cpu(x86_cpu_to_apicid, cpu);
kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f2fff6255..5bb395551 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -42,7 +42,7 @@ static int __init parse_no_kvmclock_vsyscall(char *arg)
}
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
-/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */
+/* Aligned to page sizes to match what's mapped via vsyscalls to userspace */
#define HVC_BOOT_ARRAY_SIZE \
(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index adc67f988..7a814b414 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -7,7 +7,7 @@
* This handles calls from both 32bit and 64bit mode.
*
* Lock order:
- * contex.ldt_usr_sem
+ * context.ldt_usr_sem
* mmap_lock
* context.lock
*/
@@ -49,7 +49,7 @@ void load_mm_ldt(struct mm_struct *mm)
/*
* Any change to mm->context.ldt is followed by an IPI to all
* CPUs with the mm active. The LDT will not be freed until
- * after the IPI is handled by all such CPUs. This means that,
+ * after the IPI is handled by all such CPUs. This means that
* if the ldt_struct changes before we return, the values we see
* will be safe, and the new values will be loaded before we run
* any user code.
@@ -685,7 +685,7 @@ SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
}
/*
* The SYSCALL_DEFINE() macros give us an 'unsigned long'
- * return type, but tht ABI for sys_modify_ldt() expects
+ * return type, but the ABI for sys_modify_ldt() expects
* 'int'. This cast gives us an int-sized value in %rax
* for the return code. The 'unsigned' is necessary so
* the compiler does not try to sign-extend the negative
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 1a3e2c05a..bc0a5348b 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -42,12 +42,9 @@ struct init_pgtable_data {
static int mem_region_callback(struct resource *res, void *arg)
{
struct init_pgtable_data *data = arg;
- unsigned long mstart, mend;
-
- mstart = res->start;
- mend = mstart + resource_size(res) - 1;
- return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend);
+ return kernel_ident_mapping_init(data->info, data->level4p,
+ res->start, res->end + 1);
}
static int
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 5f71a0cf4..e18914c0e 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -276,7 +276,7 @@ int module_finalize(const Elf_Ehdr *hdr,
struct module *me)
{
const Elf_Shdr *s, *alt = NULL, *locks = NULL,
- *para = NULL, *orc = NULL, *orc_ip = NULL,
+ *orc = NULL, *orc_ip = NULL,
*retpolines = NULL, *returns = NULL, *ibt_endbr = NULL,
*calls = NULL, *cfi = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
@@ -286,8 +286,6 @@ int module_finalize(const Elf_Ehdr *hdr,
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
locks = s;
- if (!strcmp(".parainstructions", secstrings + s->sh_name))
- para = s;
if (!strcmp(".orc_unwind", secstrings + s->sh_name))
orc = s;
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
@@ -304,14 +302,6 @@ int module_finalize(const Elf_Ehdr *hdr,
ibt_endbr = s;
}
- /*
- * See alternative_instructions() for the ordering rules between the
- * various patching types.
- */
- if (para) {
- void *pseg = (void *)para->sh_addr;
- apply_paravirt(pseg, pseg + para->sh_size);
- }
if (retpolines || cfi) {
void *rseg = NULL, *cseg = NULL;
unsigned int rsize = 0, csize = 0;
@@ -341,7 +331,7 @@ int module_finalize(const Elf_Ehdr *hdr,
void *aseg = (void *)alt->sh_addr;
apply_alternatives(aseg, aseg + alt->sh_size);
}
- if (calls || para) {
+ if (calls || alt) {
struct callthunk_sites cs = {};
if (calls) {
@@ -349,9 +339,9 @@ int module_finalize(const Elf_Ehdr *hdr,
cs.call_end = (void *)calls->sh_addr + calls->sh_size;
}
- if (para) {
- cs.pv_start = (void *)para->sh_addr;
- cs.pv_end = (void *)para->sh_addr + para->sh_size;
+ if (alt) {
+ cs.alt_start = (void *)alt->sh_addr;
+ cs.alt_end = (void *)alt->sh_addr + alt->sh_size;
}
callthunks_patch_module_calls(&cs, me);
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 4766b6bed..6da2cfa23 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -33,6 +33,7 @@
#include <asm/reboot.h>
#include <asm/cache.h>
#include <asm/nospec-branch.h>
+#include <asm/microcode.h>
#include <asm/sev.h>
#define CREATE_TRACE_POINTS
@@ -343,6 +344,9 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
instrumentation_begin();
+ if (microcode_nmi_handler_enabled() && microcode_nmi_handler())
+ goto out;
+
handled = nmi_handle(NMI_LOCAL, regs);
__this_cpu_add(nmi_stats.normal, handled);
if (handled) {
@@ -498,8 +502,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
raw_atomic_long_inc(&nsp->idt_calls);
- if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
+ if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) {
+ if (microcode_nmi_handler_enabled())
+ microcode_offline_nmi_handler();
return;
+ }
if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
this_cpu_write(nmi_state, NMI_LATCHED);
@@ -556,9 +563,6 @@ nmi_restart:
}
if (this_cpu_dec_return(nmi_state))
goto nmi_restart;
-
- if (user_mode(regs))
- mds_user_clear_cpu_buffers();
}
#if IS_ENABLED(CONFIG_KVM_INTEL)
@@ -632,7 +636,7 @@ void nmi_backtrace_stall_check(const struct cpumask *btp)
msgp = nmi_check_stall_msg[idx];
if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
modp = ", but OK because ignore_nmis was set";
- if (nmi_seq & ~0x1)
+ if (nmi_seq & 0x1)
msghp = " (CPU currently in NMI handler function)";
else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
msghp = " (CPU exited one NMI handler function)";
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 97f1436c1..5358d4388 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -34,14 +34,8 @@
#include <asm/io_bitmap.h>
#include <asm/gsseg.h>
-/*
- * nop stub, which must not clobber anything *including the stack* to
- * avoid confusing the entry prologues.
- */
-DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text);
-
/* stub always returning 0. */
-DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text);
+DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text);
void __init default_banner(void)
{
@@ -49,26 +43,12 @@ void __init default_banner(void)
pv_info.name);
}
-/* Undefined instruction for dealing with missing ops pointers. */
-noinstr void paravirt_BUG(void)
-{
- BUG();
-}
-
-static unsigned paravirt_patch_call(void *insn_buff, const void *target,
- unsigned long addr, unsigned len)
-{
- __text_gen_insn(insn_buff, CALL_INSN_OPCODE,
- (void *)addr, target, CALL_INSN_SIZE);
- return CALL_INSN_SIZE;
-}
-
#ifdef CONFIG_PARAVIRT_XXL
-DEFINE_PARAVIRT_ASM(_paravirt_ident_64, "mov %rdi, %rax", .text);
-DEFINE_PARAVIRT_ASM(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
-DEFINE_PARAVIRT_ASM(pv_native_irq_disable, "cli", .noinstr.text);
-DEFINE_PARAVIRT_ASM(pv_native_irq_enable, "sti", .noinstr.text);
-DEFINE_PARAVIRT_ASM(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
+DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text);
+DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
#endif
DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
@@ -85,28 +65,6 @@ static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
tlb_remove_page(tlb, table);
}
-unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr,
- unsigned int len)
-{
- /*
- * Neat trick to map patch type back to the call within the
- * corresponding structure.
- */
- void *opfunc = *((void **)&pv_ops + type);
- unsigned ret;
-
- if (opfunc == NULL)
- /* If there's no function, patch it with paravirt_BUG() */
- ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len);
- else if (opfunc == _paravirt_nop)
- ret = 0;
- else
- /* Otherwise call the function. */
- ret = paravirt_patch_call(insn_buff, opfunc, addr, len);
-
- return ret;
-}
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 319fef37d..cc2c34ba7 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -203,16 +203,6 @@ void __init probe_roms(void)
unsigned char c;
int i;
- /*
- * The ROM memory range is not part of the e820 table and is therefore not
- * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted
- * memory, and SNP requires encrypted memory to be validated before access.
- * Do that here.
- */
- snp_prep_memory(video_rom_resource.start,
- ((system_rom_resource.end + 1) - video_rom_resource.start),
- SNP_PAGE_STATE_PRIVATE);
-
/* video rom */
upper = adapter_rom_resources[0].start;
for (start = video_rom_resource.start; start < upper; start += 2048) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b6f4e8399..ab49ade31 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -477,7 +477,7 @@ void native_tss_update_io_bitmap(void)
/*
* Make sure that the TSS limit is covering the IO bitmap. It might have
* been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
- * access from user space to trigger a #GP because tbe bitmap is outside
+ * access from user space to trigger a #GP because the bitmap is outside
* the TSS limit.
*/
refresh_tss_limit();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 33b268747..4989095ab 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -138,7 +138,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
log_lvl, d3, d6, d7);
}
- if (cpu_feature_enabled(X86_FEATURE_OSPKE))
+ if (cr4 & X86_CR4_PKE)
printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b098b1fa2..399810919 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -9,7 +9,6 @@
#include <linux/console.h>
#include <linux/crash_dump.h>
#include <linux/dma-map-ops.h>
-#include <linux/dmi.h>
#include <linux/efi.h>
#include <linux/ima.h>
#include <linux/init_ohci1394_dma.h>
@@ -36,6 +35,7 @@
#include <asm/bios_ebda.h>
#include <asm/bugs.h>
#include <asm/cacheinfo.h>
+#include <asm/coco.h>
#include <asm/cpu.h>
#include <asm/efi.h>
#include <asm/gart.h>
@@ -226,8 +226,6 @@ static void __init reserve_brk(void)
_brk_start = 0;
}
-u64 relocated_ramdisk;
-
#ifdef CONFIG_BLK_DEV_INITRD
static u64 __init get_ramdisk_image(void)
@@ -261,7 +259,7 @@ static void __init relocate_initrd(void)
u64 area_size = PAGE_ALIGN(ramdisk_size);
/* We need to move the initrd down into directly mapped mem */
- relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
+ u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
PFN_PHYS(max_pfn_mapped));
if (!relocated_ramdisk)
panic("Cannot find place for new RAMDISK of size %lld\n",
@@ -466,154 +464,29 @@ static void __init memblock_x86_reserve_range_setup_data(void)
}
}
-/*
- * --------- Crashkernel reservation ------------------------------
- */
-
-/* 16M alignment for crash kernel regions */
-#define CRASH_ALIGN SZ_16M
-
-/*
- * Keep the crash kernel below this limit.
- *
- * Earlier 32-bits kernels would limit the kernel to the low 512 MB range
- * due to mapping restrictions.
- *
- * 64-bit kdump kernels need to be restricted to be under 64 TB, which is
- * the upper limit of system RAM in 4-level paging mode. Since the kdump
- * jump could be from 5-level paging to 4-level paging, the jump will fail if
- * the kernel is put above 64 TB, and during the 1st kernel bootup there's
- * no good way to detect the paging mode of the target kernel which will be
- * loaded for dumping.
- */
-#ifdef CONFIG_X86_32
-# define CRASH_ADDR_LOW_MAX SZ_512M
-# define CRASH_ADDR_HIGH_MAX SZ_512M
-#else
-# define CRASH_ADDR_LOW_MAX SZ_4G
-# define CRASH_ADDR_HIGH_MAX SZ_64T
-#endif
-
-static int __init reserve_crashkernel_low(void)
-{
-#ifdef CONFIG_X86_64
- unsigned long long base, low_base = 0, low_size = 0;
- unsigned long low_mem_limit;
- int ret;
-
- low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX);
-
- /* crashkernel=Y,low */
- ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base);
- if (ret) {
- /*
- * two parts from kernel/dma/swiotlb.c:
- * -swiotlb size: user-specified with swiotlb= or default.
- *
- * -swiotlb overflow buffer: now hardcoded to 32k. We round it
- * to 8M for other buffers that may need to stay low too. Also
- * make sure we allocate enough extra low memory so that we
- * don't run out of DMA buffers for 32-bit devices.
- */
- low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20);
- } else {
- /* passed with crashkernel=0,low ? */
- if (!low_size)
- return 0;
- }
-
- low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
- if (!low_base) {
- pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
- (unsigned long)(low_size >> 20));
- return -ENOMEM;
- }
-
- pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n",
- (unsigned long)(low_size >> 20),
- (unsigned long)(low_base >> 20),
- (unsigned long)(low_mem_limit >> 20));
-
- crashk_low_res.start = low_base;
- crashk_low_res.end = low_base + low_size - 1;
- insert_resource(&iomem_resource, &crashk_low_res);
-#endif
- return 0;
-}
-
-static void __init reserve_crashkernel(void)
+static void __init arch_reserve_crashkernel(void)
{
- unsigned long long crash_size, crash_base, total_mem;
+ unsigned long long crash_base, crash_size, low_size = 0;
+ char *cmdline = boot_command_line;
bool high = false;
int ret;
if (!IS_ENABLED(CONFIG_KEXEC_CORE))
return;
- total_mem = memblock_phys_mem_size();
-
- /* crashkernel=XM */
- ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
- if (ret != 0 || crash_size <= 0) {
- /* crashkernel=X,high */
- ret = parse_crashkernel_high(boot_command_line, total_mem,
- &crash_size, &crash_base);
- if (ret != 0 || crash_size <= 0)
- return;
- high = true;
- }
+ ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
+ &crash_size, &crash_base,
+ &low_size, &high);
+ if (ret)
+ return;
if (xen_pv_domain()) {
pr_info("Ignoring crashkernel for a Xen PV domain\n");
return;
}
- /* 0 means: find the address automatically */
- if (!crash_base) {
- /*
- * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
- * crashkernel=x,high reserves memory over 4G, also allocates
- * 256M extra low memory for DMA buffers and swiotlb.
- * But the extra memory is not required for all machines.
- * So try low memory first and fall back to high memory
- * unless "crashkernel=size[KMG],high" is specified.
- */
- if (!high)
- crash_base = memblock_phys_alloc_range(crash_size,
- CRASH_ALIGN, CRASH_ALIGN,
- CRASH_ADDR_LOW_MAX);
- if (!crash_base)
- crash_base = memblock_phys_alloc_range(crash_size,
- CRASH_ALIGN, CRASH_ALIGN,
- CRASH_ADDR_HIGH_MAX);
- if (!crash_base) {
- pr_info("crashkernel reservation failed - No suitable area found.\n");
- return;
- }
- } else {
- unsigned long long start;
-
- start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base,
- crash_base + crash_size);
- if (start != crash_base) {
- pr_info("crashkernel reservation failed - memory is in use.\n");
- return;
- }
- }
-
- if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
- memblock_phys_free(crash_base, crash_size);
- return;
- }
-
- pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
-
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
- insert_resource(&iomem_resource, &crashk_res);
+ reserve_crashkernel_generic(cmdline, crash_size, crash_base,
+ low_size, high);
}
static struct resource standard_io_resources[] = {
@@ -1029,7 +902,7 @@ void __init setup_arch(char **cmdline_p)
efi_init();
reserve_ibft_region();
- dmi_setup();
+ x86_init.resources.dmi_setup();
/*
* VMware detection requires dmi to be available, so this
@@ -1120,7 +993,8 @@ void __init setup_arch(char **cmdline_p)
* Needs to run after memblock setup because it needs the physical
* memory size.
*/
- sev_setup_arch();
+ mem_encrypt_setup_arch();
+ cc_random_init();
efi_fake_memmap();
efi_find_mirror();
@@ -1158,6 +1032,8 @@ void __init setup_arch(char **cmdline_p)
*
* Moreover, on machines with SandyBridge graphics or in setups that use
* crashkernel the entire 1M is reserved anyway.
+ *
+ * Note the host kernel TDX also requires the first 1MB being reserved.
*/
x86_platform.realmode_reserve();
@@ -1217,6 +1093,8 @@ void __init setup_arch(char **cmdline_p)
early_acpi_boot_init();
+ x86_flattree_get_config();
+
initmem_init();
dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
@@ -1227,7 +1105,7 @@ void __init setup_arch(char **cmdline_p)
* Reserve memory for crash kernel after SRAT is parsed so that it
* won't consume hotpluggable memory.
*/
- reserve_crashkernel();
+ arch_reserve_crashkernel();
memblock_find_dma_reserve();
@@ -1290,7 +1168,7 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
- conswitchp = &vga_con;
+ vgacon_register_screen(&screen_info);
#endif
#endif
x86_init.oem.banner();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 2c97bf7b5..b30d6e180 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -106,8 +106,8 @@ void __init pcpu_populate_pte(unsigned long addr)
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
- struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
- 0xFFFFF);
+ struct desc_struct d = GDT_ENTRY_INIT(DESC_DATA32,
+ per_cpu_offset(cpu), 0xFFFFF);
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
#endif
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index ccb0915e8..be2fa2fc0 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -89,14 +89,15 @@ static bool __init sev_es_check_cpu_features(void)
return true;
}
-static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason)
+static void __head __noreturn
+sev_es_terminate(unsigned int set, unsigned int reason)
{
u64 val = GHCB_MSR_TERM_REQ;
/* Tell the hypervisor what went wrong. */
val |= GHCB_SEV_TERM_REASON(set, reason);
- /* Request Guest Termination from Hypvervisor */
+ /* Request Guest Termination from Hypervisor */
sev_es_wr_ghcb_msr(val);
VMGEXIT();
@@ -326,13 +327,7 @@ static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid
*/
static const struct snp_cpuid_table *snp_cpuid_get_table(void)
{
- void *ptr;
-
- asm ("lea cpuid_table_copy(%%rip), %0"
- : "=r" (ptr)
- : "p" (&cpuid_table_copy));
-
- return ptr;
+ return &RIP_REL_REF(cpuid_table_copy);
}
/*
@@ -391,7 +386,7 @@ static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
return xsave_size;
}
-static bool
+static bool __head
snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -528,7 +523,8 @@ static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
* Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
* should be treated as fatal by caller.
*/
-static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+static int __head
+snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -556,9 +552,9 @@ static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_le
leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0;
/* Skip post-processing for out-of-range zero leafs. */
- if (!(leaf->fn <= cpuid_std_range_max ||
- (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) ||
- (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max)))
+ if (!(leaf->fn <= RIP_REL_REF(cpuid_std_range_max) ||
+ (leaf->fn >= 0x40000000 && leaf->fn <= RIP_REL_REF(cpuid_hyp_range_max)) ||
+ (leaf->fn >= 0x80000000 && leaf->fn <= RIP_REL_REF(cpuid_ext_range_max))))
return 0;
}
@@ -570,7 +566,7 @@ static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_le
* page yet, so it only supports the MSR based communication with the
* hypervisor and only the CPUID exit-code.
*/
-void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
+void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
{
unsigned int subfn = lower_bits(regs->cx, 32);
unsigned int fn = lower_bits(regs->ax, 32);
@@ -1016,7 +1012,8 @@ struct cc_setup_data {
* Search for a Confidential Computing blob passed in as a setup_data entry
* via the Linux Boot Protocol.
*/
-static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
+static __head
+struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
{
struct cc_setup_data *sd = NULL;
struct setup_data *hdr;
@@ -1043,7 +1040,7 @@ static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
* mapping needs to be updated in sync with all the changes to virtual memory
* layout and related mapping facilities throughout the boot process.
*/
-static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
+static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
{
const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table;
int i;
@@ -1063,11 +1060,11 @@ static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
if (fn->eax_in == 0x0)
- cpuid_std_range_max = fn->eax;
+ RIP_REL_REF(cpuid_std_range_max) = fn->eax;
else if (fn->eax_in == 0x40000000)
- cpuid_hyp_range_max = fn->eax;
+ RIP_REL_REF(cpuid_hyp_range_max) = fn->eax;
else if (fn->eax_in == 0x80000000)
- cpuid_ext_range_max = fn->eax;
+ RIP_REL_REF(cpuid_ext_range_max) = fn->eax;
}
}
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index d87c6ff1f..eb2873b5e 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -23,8 +23,10 @@
#include <linux/platform_device.h>
#include <linux/io.h>
#include <linux/psp-sev.h>
+#include <linux/dmi.h>
#include <uapi/linux/sev-guest.h>
+#include <asm/init.h>
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
#include <asm/sev.h>
@@ -682,8 +684,9 @@ static u64 __init get_jump_table_addr(void)
return ret;
}
-static void early_set_pages_state(unsigned long vaddr, unsigned long paddr,
- unsigned long npages, enum psc_op op)
+static void __head
+early_set_pages_state(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages, enum psc_op op)
{
unsigned long paddr_end;
u64 val;
@@ -739,7 +742,7 @@ e_term:
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
}
-void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned long npages)
{
/*
@@ -748,7 +751,7 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
* This eliminates worries about jump tables or checking boot_cpu_data
* in the cc_platform_has() function.
*/
- if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
return;
/*
@@ -767,28 +770,13 @@ void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr
* This eliminates worries about jump tables or checking boot_cpu_data
* in the cc_platform_has() function.
*/
- if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
return;
/* Ask hypervisor to mark the memory pages shared in the RMP table. */
early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED);
}
-void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op)
-{
- unsigned long vaddr, npages;
-
- vaddr = (unsigned long)__va(paddr);
- npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
-
- if (op == SNP_PAGE_STATE_PRIVATE)
- early_snp_set_memory_private(vaddr, paddr, npages);
- else if (op == SNP_PAGE_STATE_SHARED)
- early_snp_set_memory_shared(vaddr, paddr, npages);
- else
- WARN(1, "invalid memory op %d\n", op);
-}
-
static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
unsigned long vaddr_end, int op)
{
@@ -966,7 +954,7 @@ static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
free_page((unsigned long)vmsa);
}
-static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip)
+static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
{
struct sev_es_save_area *cur_vmsa, *vmsa;
struct ghcb_state state;
@@ -2059,7 +2047,7 @@ fail:
*
* Scan for the blob in that order.
*/
-static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
{
struct cc_blob_sev_info *cc_info;
@@ -2085,7 +2073,7 @@ found_cc_info:
return cc_info;
}
-bool __init snp_init(struct boot_params *bp)
+bool __head snp_init(struct boot_params *bp)
{
struct cc_blob_sev_info *cc_info;
@@ -2107,11 +2095,22 @@ bool __init snp_init(struct boot_params *bp)
return true;
}
-void __init __noreturn snp_abort(void)
+void __head __noreturn snp_abort(void)
{
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
}
+/*
+ * SEV-SNP guests should only execute dmi_setup() if EFI_CONFIG_TABLES are
+ * enabled, as the alternative (fallback) logic for DMI probing in the legacy
+ * ROM region can cause a crash since this region is not pre-validated.
+ */
+void __init snp_dmi_setup(void)
+{
+ if (efi_enabled(EFI_CONFIG_TABLES))
+ dmi_setup();
+}
+
static void dump_cpuid_table(void)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 65fe2094d..31b6f5ddd 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -27,6 +27,7 @@
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/syscalls.h>
+#include <linux/rseq.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2a187c0cb..3f57ce68a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -87,6 +87,7 @@
#include <asm/hw_irq.h>
#include <asm/stackprotector.h>
#include <asm/sev.h>
+#include <asm/spec-ctrl.h>
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -124,7 +125,20 @@ struct mwait_cpu_dead {
*/
static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
-/* Logical package management. We might want to allocate that dynamically */
+/* Logical package management. */
+struct logical_maps {
+ u32 phys_pkg_id;
+ u32 phys_die_id;
+ u32 logical_pkg_id;
+ u32 logical_die_id;
+};
+
+/* Temporary workaround until the full topology mechanics is in place */
+static DEFINE_PER_CPU_READ_MOSTLY(struct logical_maps, logical_maps) = {
+ .phys_pkg_id = U32_MAX,
+ .phys_die_id = U32_MAX,
+};
+
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
static unsigned int logical_packages __read_mostly;
@@ -258,12 +272,9 @@ static void notrace start_secondary(void *unused)
cpu_init_exception_handling();
/*
- * 32-bit systems load the microcode from the ASM startup code for
- * historical reasons.
- *
- * On 64-bit systems load it before reaching the AP alive
- * synchronization point below so it is not part of the full per
- * CPU serialized bringup part when "parallel" bringup is enabled.
+ * Load the microcode before reaching the AP alive synchronization
+ * point below so it is not part of the full per CPU serialized
+ * bringup part when "parallel" bringup is enabled.
*
* That's even safe when hyperthreading is enabled in the CPU as
* the core code starts the primary threads first and leaves the
@@ -276,8 +287,7 @@ static void notrace start_secondary(void *unused)
* CPUID, MSRs etc. must be strictly serialized to maintain
* software state correctness.
*/
- if (IS_ENABLED(CONFIG_X86_64))
- load_ucode_ap();
+ load_ucode_ap();
/*
* Synchronization point with the hotplug core. Sets this CPUs
@@ -288,7 +298,7 @@ static void notrace start_secondary(void *unused)
cpu_init();
fpu__init_cpu();
- rcu_cpu_starting(raw_smp_processor_id());
+ rcutree_report_cpu_starting(raw_smp_processor_id());
x86_cpuinit.early_percpu_clock_init();
ap_starting();
@@ -337,10 +347,8 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg)
int cpu;
for_each_possible_cpu(cpu) {
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- if (c->initialized && c->phys_proc_id == phys_pkg)
- return c->logical_proc_id;
+ if (per_cpu(logical_maps.phys_pkg_id, cpu) == phys_pkg)
+ return per_cpu(logical_maps.logical_pkg_id, cpu);
}
return -1;
}
@@ -355,14 +363,12 @@ EXPORT_SYMBOL(topology_phys_to_logical_pkg);
*/
static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
{
- int cpu, proc_id = cpu_data(cur_cpu).phys_proc_id;
+ int cpu, proc_id = cpu_data(cur_cpu).topo.pkg_id;
for_each_possible_cpu(cpu) {
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- if (c->initialized && c->cpu_die_id == die_id &&
- c->phys_proc_id == proc_id)
- return c->logical_die_id;
+ if (per_cpu(logical_maps.phys_pkg_id, cpu) == proc_id &&
+ per_cpu(logical_maps.phys_die_id, cpu) == die_id)
+ return per_cpu(logical_maps.logical_die_id, cpu);
}
return -1;
}
@@ -387,7 +393,9 @@ int topology_update_package_map(unsigned int pkg, unsigned int cpu)
cpu, pkg, new);
}
found:
- cpu_data(cpu).logical_proc_id = new;
+ per_cpu(logical_maps.phys_pkg_id, cpu) = pkg;
+ per_cpu(logical_maps.logical_pkg_id, cpu) = new;
+ cpu_data(cpu).topo.logical_pkg_id = new;
return 0;
}
/**
@@ -410,7 +418,9 @@ int topology_update_die_map(unsigned int die, unsigned int cpu)
cpu, die, new);
}
found:
- cpu_data(cpu).logical_die_id = new;
+ per_cpu(logical_maps.phys_die_id, cpu) = die;
+ per_cpu(logical_maps.logical_die_id, cpu) = new;
+ cpu_data(cpu).topo.logical_die_id = new;
return 0;
}
@@ -421,8 +431,8 @@ static void __init smp_store_boot_cpu_info(void)
*c = boot_cpu_data;
c->cpu_index = id;
- topology_update_package_map(c->phys_proc_id, id);
- topology_update_die_map(c->cpu_die_id, id);
+ topology_update_package_map(c->topo.pkg_id, id);
+ topology_update_die_map(c->topo.die_id, id);
c->initialized = true;
}
@@ -476,21 +486,21 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
- if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_die_id == o->cpu_die_id &&
- per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
- if (c->cpu_core_id == o->cpu_core_id)
+ if (c->topo.pkg_id == o->topo.pkg_id &&
+ c->topo.die_id == o->topo.die_id &&
+ per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) {
+ if (c->topo.core_id == o->topo.core_id)
return topology_sane(c, o, "smt");
- if ((c->cu_id != 0xff) &&
- (o->cu_id != 0xff) &&
- (c->cu_id == o->cu_id))
+ if ((c->topo.cu_id != 0xff) &&
+ (o->topo.cu_id != 0xff) &&
+ (c->topo.cu_id == o->topo.cu_id))
return topology_sane(c, o, "smt");
}
- } else if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_die_id == o->cpu_die_id &&
- c->cpu_core_id == o->cpu_core_id) {
+ } else if (c->topo.pkg_id == o->topo.pkg_id &&
+ c->topo.die_id == o->topo.die_id &&
+ c->topo.core_id == o->topo.core_id) {
return topology_sane(c, o, "smt");
}
@@ -499,8 +509,8 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_die_id == o->cpu_die_id)
+ if (c->topo.pkg_id == o->topo.pkg_id &&
+ c->topo.die_id == o->topo.die_id)
return true;
return false;
}
@@ -510,11 +520,11 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
/* If the arch didn't set up l2c_id, fall back to SMT */
- if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
+ if (per_cpu_l2c_id(cpu1) == BAD_APICID)
return match_smt(c, o);
/* Do not match if L2 cache id does not match: */
- if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
+ if (per_cpu_l2c_id(cpu1) != per_cpu_l2c_id(cpu2))
return false;
return topology_sane(c, o, "l2c");
@@ -527,7 +537,7 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
*/
static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- if (c->phys_proc_id == o->phys_proc_id)
+ if (c->topo.pkg_id == o->topo.pkg_id)
return true;
return false;
}
@@ -560,11 +570,11 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
bool intel_snc = id && id->driver_data;
/* Do not match if we do not have a valid APICID for cpu: */
- if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
+ if (per_cpu_llc_id(cpu1) == BAD_APICID)
return false;
/* Do not match if LLC id does not match: */
- if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2))
+ if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2))
return false;
/*
@@ -640,13 +650,13 @@ static void __init build_sched_topology(void)
};
#endif
/*
- * When there is NUMA topology inside the package skip the DIE domain
+ * When there is NUMA topology inside the package skip the PKG domain
* since the NUMA domains will auto-magically create the right spanning
* domains based on the SLIT.
*/
if (!x86_has_numa_in_package) {
x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE)
+ cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG)
};
}
@@ -747,6 +757,7 @@ const struct cpumask *cpu_clustergroup_mask(int cpu)
{
return cpu_l2c_shared_mask(cpu);
}
+EXPORT_SYMBOL_GPL(cpu_clustergroup_mask);
static void impress_friends(void)
{
@@ -809,7 +820,7 @@ static void __init smp_quirk_init_udelay(void)
/*
* Wake up AP by INIT, INIT, STARTUP sequence.
*/
-static void send_init_sequence(int phys_apicid)
+static void send_init_sequence(u32 phys_apicid)
{
int maxlvt = lapic_get_maxlvt();
@@ -835,7 +846,7 @@ static void send_init_sequence(int phys_apicid)
/*
* Wake up AP by INIT, INIT, STARTUP sequence.
*/
-static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
+static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip)
{
unsigned long send_status = 0, accept_status = 0;
int num_starts, j, maxlvt;
@@ -982,7 +993,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
* Returns zero if startup was successfully sent, else error code from
* ->wakeup_secondary_cpu.
*/
-static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
+static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle)
{
unsigned long start_ip = real_mode_header->trampoline_start;
int ret;
@@ -1050,7 +1061,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
{
- int apicid = apic->cpu_present_to_apicid(cpu);
+ u32 apicid = apic->cpu_present_to_apicid(cpu);
int err;
lockdep_assert_irqs_enabled();
@@ -1405,7 +1416,7 @@ static void remove_siblinginfo(int cpu)
cpumask_clear(topology_sibling_cpumask(cpu));
cpumask_clear(topology_core_cpumask(cpu));
cpumask_clear(topology_die_cpumask(cpu));
- c->cpu_core_id = 0;
+ c->topo.core_id = 0;
c->booted_cores = 0;
cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
recompute_smt_state();
@@ -1596,8 +1607,15 @@ void __noreturn hlt_play_dead(void)
native_halt();
}
+/*
+ * native_play_dead() is essentially a __noreturn function, but it can't
+ * be marked as such as the compiler may complain about it.
+ */
void native_play_dead(void)
{
+ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+ __update_spec_ctrl(0);
+
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 0bab03130..d42c28b8b 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -35,38 +35,9 @@
#include <asm/io_apic.h>
#include <asm/cpu.h>
-static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
-
#ifdef CONFIG_HOTPLUG_CPU
-int arch_register_cpu(int cpu)
+bool arch_cpu_is_hotpluggable(int cpu)
{
- struct x86_cpu *xc = per_cpu_ptr(&cpu_devices, cpu);
-
- xc->cpu.hotpluggable = cpu > 0;
- return register_cpu(&xc->cpu, cpu);
-}
-EXPORT_SYMBOL(arch_register_cpu);
-
-void arch_unregister_cpu(int num)
-{
- unregister_cpu(&per_cpu(cpu_devices, num).cpu);
-}
-EXPORT_SYMBOL(arch_unregister_cpu);
-#else /* CONFIG_HOTPLUG_CPU */
-
-int __init arch_register_cpu(int num)
-{
- return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
+ return cpu > 0;
}
#endif /* CONFIG_HOTPLUG_CPU */
-
-static int __init topology_init(void)
-{
- int i;
-
- for_each_present_cpu(i)
- arch_register_cpu(i);
-
- return 0;
-}
-subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c876f1d36..c3b2f863a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,6 +37,7 @@
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>
@@ -565,7 +566,7 @@ static bool fixup_iopl_exception(struct pt_regs *regs)
*/
static bool try_fixup_enqcmd_gp(void)
{
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
u32 pasid;
/*
@@ -591,7 +592,7 @@ static bool try_fixup_enqcmd_gp(void)
if (!mm_valid_pasid(current->mm))
return false;
- pasid = current->mm->pasid;
+ pasid = mm_get_enqcmd_pasid(current->mm);
/*
* Did this thread already have its PASID activated?
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 1123ef3cc..433403365 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -193,11 +193,9 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu)
cur->warned = false;
/*
- * If a non-zero TSC value for socket 0 may be valid then the default
- * adjusted value cannot assumed to be zero either.
+ * The default adjust value cannot be assumed to be zero on any socket.
*/
- if (tsc_async_resets)
- cur->adjusted = bootval;
+ cur->adjusted = bootval;
/*
* Check whether this CPU is the first in a package to come up. In
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 7e574cf3b..d00c28aaa 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -85,7 +85,7 @@ static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
{
int *first = ip_table;
int *last = ip_table + num_entries - 1;
- int *mid = first, *found = first;
+ int *mid, *found = first;
if (!num_entries)
return NULL;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f15fb71f2..a349dbfc6 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -139,10 +139,7 @@ SECTIONS
STATIC_CALL_TEXT
ALIGN_ENTRY_TEXT_BEGIN
-#ifdef CONFIG_CPU_SRSO
*(.text..__x86.rethunk_untrain)
-#endif
-
ENTRY_TEXT
#ifdef CONFIG_CPU_SRSO
@@ -270,19 +267,6 @@ SECTIONS
}
#endif
- /*
- * start address and size of operations which during runtime
- * can be patched with virtualization friendly instructions or
- * baremetal native ones. Think page table operations.
- * Details in paravirt_types.h
- */
- . = ALIGN(8);
- .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
- *(.parainstructions)
- __parainstructions_end = .;
- }
-
#ifdef CONFIG_RETPOLINE
/*
* List of instructions that call/jmp/jcc to retpoline thunks
@@ -520,12 +504,12 @@ INIT_PER_CPU(irq_stack_backing_store);
"fixed_percpu_data is not at start of per-cpu area");
#endif
-#ifdef CONFIG_RETHUNK
+#ifdef CONFIG_CPU_UNRET_ENTRY
. = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
-. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
#endif
#ifdef CONFIG_CPU_SRSO
+. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
/*
* GNU ld cannot do XOR until 2.41.
* https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=f6f78318fca803c4907fb8d7f6ded8295f1947b1
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 65e96b76c..d3fc01770 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -127,7 +127,7 @@ static void __init vsmp_cap_cpus(void)
#endif
}
-static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
+static u32 apicid_phys_pkg_id(u32 initial_apic_id, int index_msb)
{
return read_apic_id() >> index_msb;
}
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a37ebd3b4..3f0718b4a 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -3,6 +3,7 @@
*
* For licencing details see kernel-base/COPYING
*/
+#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/export.h>
@@ -66,6 +67,7 @@ struct x86_init_ops x86_init __initdata = {
.probe_roms = probe_roms,
.reserve_resources = reserve_standard_io_resources,
.memory_setup = e820__memory_setup_default,
+ .dmi_setup = dmi_setup,
},
.mpparse = {
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ed90f1481..65ed14b65 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -23,17 +23,15 @@ config KVM
depends on HAVE_KVM
depends on HIGH_RES_TIMERS
depends on X86_LOCAL_APIC
- select PREEMPT_NOTIFIERS
- select MMU_NOTIFIER
+ select KVM_COMMON
+ select KVM_GENERIC_MMU_NOTIFIER
select HAVE_KVM_IRQCHIP
select HAVE_KVM_PFNCACHE
- select HAVE_KVM_IRQFD
select HAVE_KVM_DIRTY_RING_TSO
select HAVE_KVM_DIRTY_RING_ACQ_REL
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
- select HAVE_KVM_EVENTFD
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
@@ -46,7 +44,6 @@ config KVM
select KVM_XFER_TO_GUEST_WORK
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_VFIO
- select INTERVAL_TREE
select HAVE_KVM_PM_NOTIFIER if PM
select KVM_GENERIC_HARDWARE_ENABLING
help
@@ -65,18 +62,31 @@ config KVM
config KVM_WERROR
bool "Compile KVM with -Werror"
- # KASAN may cause the build to fail due to larger frames
- default y if X86_64 && !KASAN
- # We use the dependency on !COMPILE_TEST to not be enabled
- # blindly in allmodconfig or allyesconfig configurations
- depends on KVM
- depends on (X86_64 && !KASAN) || !COMPILE_TEST
- depends on EXPERT
+ # Disallow KVM's -Werror if KASAN is enabled, e.g. to guard against
+ # randomized configs from selecting KVM_WERROR=y, which doesn't play
+ # nice with KASAN. KASAN builds generates warnings for the default
+ # FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning.
+ # Building KVM with -Werror and KASAN is still doable via enabling
+ # the kernel-wide WERROR=y.
+ depends on KVM && EXPERT && !KASAN
help
Add -Werror to the build flags for KVM.
If in doubt, say "N".
+config KVM_SW_PROTECTED_VM
+ bool "Enable support for KVM software-protected VMs"
+ depends on EXPERT
+ depends on KVM && X86_64
+ select KVM_GENERIC_PRIVATE_MEM
+ help
+ Enable support for KVM software-protected VMs. Currently, software-
+ protected VMs are purely a development and testing vehicle for
+ KVM_CREATE_GUEST_MEMFD. Attempting to run a "real" VM workload as a
+ software-protected VM will fail miserably.
+
+ If unsure, say "N".
+
config KVM_INTEL
tristate "KVM for Intel (and compatible) processors support"
depends on KVM && IA32_FEAT_CTL
@@ -129,6 +139,20 @@ config KVM_SMM
If unsure, say Y.
+config KVM_HYPERV
+ bool "Support for Microsoft Hyper-V emulation"
+ depends on KVM
+ default y
+ help
+ Provides KVM support for emulating Microsoft Hyper-V. This allows KVM
+ to expose a subset of the paravirtualized interfaces defined in the
+ Hyper-V Hypervisor Top-Level Functional Specification (TLFS):
+ https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs
+ These interfaces are required for the correct and performant functioning
+ of Windows and Hyper-V guests on KVM.
+
+ If unsure, say "Y".
+
config KVM_XEN
bool "Support for Xen hypercall interface"
depends on KVM
@@ -154,4 +178,15 @@ config KVM_PROVE_MMU
config KVM_EXTERNAL_WRITE_TRACKING
bool
+config KVM_MAX_NR_VCPUS
+ int "Maximum number of vCPUs per KVM guest"
+ depends on KVM
+ range 1024 4096
+ default 4096 if MAXSMP
+ default 1024
+ help
+ Set the maximum number of vCPUs per KVM guest. Larger values will increase
+ the memory footprint of each KVM guest, regardless of how many vCPUs are
+ created for a given VM.
+
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 80e3fe184..475b5fa91 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -11,25 +11,27 @@ include $(srctree)/virt/kvm/Makefile.kvm
kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
+ debugfs.o mmu/mmu.o mmu/page_track.o \
mmu/spte.o
-ifdef CONFIG_HYPERV
-kvm-y += kvm_onhyperv.o
-endif
-
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
+kvm-$(CONFIG_KVM_HYPERV) += hyperv.o
kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-$(CONFIG_KVM_SMM) += smm.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
- vmx/hyperv.o vmx/nested.o vmx/posted_intr.o
+ vmx/nested.o vmx/posted_intr.o
+
kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
+kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
- svm/sev.o svm/hyperv.o
+ svm/sev.o
+kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o
ifdef CONFIG_HYPERV
+kvm-y += kvm_onhyperv.o
+kvm-intel-y += vmx/vmx_onhyperv.o vmx/hyperv_evmcs.o
kvm-amd-y += svm/svm_onhyperv.o
endif
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 773132c3b..3a0227689 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -105,7 +105,7 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
/*
* If the index isn't significant, use the first entry with a
- * matching function. It's userspace's responsibilty to not
+ * matching function. It's userspace's responsibility to not
* provide "duplicate" entries in all cases.
*/
if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)
@@ -314,11 +314,15 @@ EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent)
{
+#ifdef CONFIG_KVM_HYPERV
struct kvm_cpuid_entry2 *entry;
entry = cpuid_entry2_find(entries, nent, HYPERV_CPUID_INTERFACE,
KVM_CPUID_INDEX_NOT_SIGNIFICANT);
return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX;
+#else
+ return false;
+#endif
}
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
@@ -362,6 +366,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_update_pv_runtime(vcpu);
+ vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
@@ -433,11 +438,13 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
return 0;
}
+#ifdef CONFIG_KVM_HYPERV
if (kvm_cpuid_has_hyperv(e2, nent)) {
r = kvm_hv_vcpu_init(vcpu);
if (r)
return r;
}
+#endif
r = kvm_check_cpuid(vcpu, e2, nent);
if (r)
@@ -448,7 +455,9 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
vcpu->arch.cpuid_nent = nent;
vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE);
+#ifdef CONFIG_KVM_XEN
vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE);
+#endif
kvm_vcpu_after_set_cpuid(vcpu);
return 0;
@@ -467,7 +476,7 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
return -E2BIG;
if (cpuid->nent) {
- e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent));
+ e = vmemdup_array_user(entries, cpuid->nent, sizeof(*e));
if (IS_ERR(e))
return PTR_ERR(e);
@@ -511,7 +520,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
return -E2BIG;
if (cpuid->nent) {
- e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent));
+ e2 = vmemdup_array_user(entries, cpuid->nent, sizeof(*e2));
if (IS_ERR(e2))
return PTR_ERR(e2);
}
@@ -669,7 +678,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_mask(CPUID_7_1_EAX,
F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) |
F(FZRM) | F(FSRS) | F(FSRC) |
- F(AMX_FP16) | F(AVX_IFMA)
+ F(AMX_FP16) | F(AVX_IFMA) | F(LAM)
);
kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
@@ -677,6 +686,11 @@ void kvm_set_cpu_caps(void)
F(AMX_COMPLEX)
);
+ kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX,
+ F(INTEL_PSFD) | F(IPRED_CTRL) | F(RRSBA_CTRL) | F(DDPD_U) |
+ F(BHI_CTRL) | F(MCDT_NO)
+ );
+
kvm_cpu_cap_mask(CPUID_D_1_EAX,
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
);
@@ -753,11 +767,13 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ |
- F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
+ F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ |
+ F(WRMSR_XX_BASE_NS)
);
- if (cpu_feature_enabled(X86_FEATURE_SRSO_NO))
- kvm_cpu_cap_set(X86_FEATURE_SRSO_NO);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO);
kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
F(PERFMON_V2)
@@ -956,13 +972,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
break;
/* function 7 has additional index. */
case 7:
- entry->eax = min(entry->eax, 1u);
+ max_idx = entry->eax = min(entry->eax, 2u);
cpuid_entry_override(entry, CPUID_7_0_EBX);
cpuid_entry_override(entry, CPUID_7_ECX);
cpuid_entry_override(entry, CPUID_7_EDX);
- /* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */
- if (entry->eax == 1) {
+ /* KVM only supports up to 0x7.2, capped above via min(). */
+ if (max_idx >= 1) {
entry = do_host_cpuid(array, function, 1);
if (!entry)
goto out;
@@ -972,6 +988,16 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->ebx = 0;
entry->ecx = 0;
}
+ if (max_idx >= 2) {
+ entry = do_host_cpuid(array, function, 2);
+ if (!entry)
+ goto out;
+
+ cpuid_entry_override(entry, CPUID_7_2_EDX);
+ entry->ecx = 0;
+ entry->ebx = 0;
+ entry->eax = 0;
+ }
break;
case 0xa: { /* Architectural Performance Monitoring */
union cpuid10_eax eax;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 284fa4704..23dbb9eb2 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -47,11 +47,6 @@ static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
return !(gpa & vcpu->arch.reserved_gpa_bits);
}
-static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- return !kvm_vcpu_is_legal_gpa(vcpu, gpa);
-}
-
static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu,
gpa_t gpa, gpa_t alignment)
{
@@ -125,6 +120,16 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
}
+static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.is_amd_compatible;
+}
+
+static inline bool guest_cpuid_is_intel_compatible(struct kvm_vcpu *vcpu)
+{
+ return !guest_cpuid_is_amd_compatible(vcpu);
+}
+
static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -174,7 +179,8 @@ static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
{
return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
- guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB));
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_SBPB));
}
static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
@@ -278,4 +284,12 @@ static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu,
vcpu->arch.governed_features.enabled);
}
+static inline bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ if (guest_can_use(vcpu, X86_FEATURE_LAM))
+ cr3 &= ~(X86_CR3_LAM_U48 | X86_CR3_LAM_U57);
+
+ return kvm_vcpu_is_legal_gpa(vcpu, cr3);
+}
+
#endif
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index ee8c4c349..95ea1a1f7 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -111,7 +111,7 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
mutex_lock(&kvm->slots_lock);
write_lock(&kvm->mmu_lock);
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
int bkt;
slots = __kvm_memslots(kvm, i);
@@ -182,6 +182,7 @@ static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file)
}
static const struct file_operations mmu_rmaps_stat_fops = {
+ .owner = THIS_MODULE,
.open = kvm_mmu_rmaps_stat_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2673cd5c4..e223043ef 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -687,8 +687,8 @@ static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size)
static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
struct segmented_address addr,
unsigned *max_size, unsigned size,
- bool write, bool fetch,
- enum x86emul_mode mode, ulong *linear)
+ enum x86emul_mode mode, ulong *linear,
+ unsigned int flags)
{
struct desc_struct desc;
bool usable;
@@ -701,7 +701,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
*max_size = 0;
switch (mode) {
case X86EMUL_MODE_PROT64:
- *linear = la;
+ *linear = la = ctxt->ops->get_untagged_addr(ctxt, la, flags);
va_bits = ctxt_virt_addr_bits(ctxt);
if (!__is_canonical_address(la, va_bits))
goto bad;
@@ -717,11 +717,11 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
if (!usable)
goto bad;
/* code segment in protected mode or read-only data segment */
- if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8))
- || !(desc.type & 2)) && write)
+ if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) || !(desc.type & 2)) &&
+ (flags & X86EMUL_F_WRITE))
goto bad;
/* unreadable code segment */
- if (!fetch && (desc.type & 8) && !(desc.type & 2))
+ if (!(flags & X86EMUL_F_FETCH) && (desc.type & 8) && !(desc.type & 2))
goto bad;
lim = desc_limit_scaled(&desc);
if (!(desc.type & 8) && (desc.type & 4)) {
@@ -757,8 +757,8 @@ static int linearize(struct x86_emulate_ctxt *ctxt,
ulong *linear)
{
unsigned max_size;
- return __linearize(ctxt, addr, &max_size, size, write, false,
- ctxt->mode, linear);
+ return __linearize(ctxt, addr, &max_size, size, ctxt->mode, linear,
+ write ? X86EMUL_F_WRITE : 0);
}
static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst)
@@ -771,7 +771,8 @@ static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst)
if (ctxt->op_bytes != sizeof(unsigned long))
addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1);
- rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear);
+ rc = __linearize(ctxt, addr, &max_size, 1, ctxt->mode, &linear,
+ X86EMUL_F_FETCH);
if (rc == X86EMUL_CONTINUE)
ctxt->_eip = addr.ea;
return rc;
@@ -907,8 +908,8 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
* boundary check itself. Instead, we use max_size to check
* against op_size.
*/
- rc = __linearize(ctxt, addr, &max_size, 0, false, true, ctxt->mode,
- &linear);
+ rc = __linearize(ctxt, addr, &max_size, 0, ctxt->mode, &linear,
+ X86EMUL_F_FETCH);
if (unlikely(rc != X86EMUL_CONTINUE))
return rc;
@@ -3439,8 +3440,10 @@ static int em_invlpg(struct x86_emulate_ctxt *ctxt)
{
int rc;
ulong linear;
+ unsigned int max_size;
- rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear);
+ rc = __linearize(ctxt, ctxt->src.addr.mem, &max_size, 1, ctxt->mode,
+ &linear, X86EMUL_F_INVLPG);
if (rc == X86EMUL_CONTINUE)
ctxt->ops->invlpg(ctxt, linear);
/* Disable writeback. */
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
index 423a73395..ad463b1ed 100644
--- a/arch/x86/kvm/governed_features.h
+++ b/arch/x86/kvm/governed_features.h
@@ -16,6 +16,7 @@ KVM_GOVERNED_X86_FEATURE(PAUSEFILTER)
KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD)
KVM_GOVERNED_X86_FEATURE(VGIF)
KVM_GOVERNED_X86_FEATURE(VNMI)
+KVM_GOVERNED_X86_FEATURE(LAM)
#undef KVM_GOVERNED_X86_FEATURE
#undef KVM_GOVERNED_FEATURE
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 238afd733..8a47f8541 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1322,6 +1322,56 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
return false;
}
+#define KVM_HV_WIN2016_GUEST_ID 0x1040a00003839
+#define KVM_HV_WIN2016_GUEST_ID_MASK (~GENMASK_ULL(23, 16)) /* mask out the service version */
+
+/*
+ * Hyper-V enabled Windows Server 2016 SMP VMs fail to boot in !XSAVES && XSAVEC
+ * configuration.
+ * Such configuration can result from, for example, AMD Erratum 1386 workaround.
+ *
+ * Print a notice so users aren't left wondering what's suddenly gone wrong.
+ */
+static void __kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ /* Check again under the hv_lock. */
+ if (hv->xsaves_xsavec_checked)
+ return;
+
+ if ((hv->hv_guest_os_id & KVM_HV_WIN2016_GUEST_ID_MASK) !=
+ KVM_HV_WIN2016_GUEST_ID)
+ return;
+
+ hv->xsaves_xsavec_checked = true;
+
+ /* UP configurations aren't affected */
+ if (atomic_read(&kvm->online_vcpus) < 2)
+ return;
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_XSAVES) ||
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVEC))
+ return;
+
+ pr_notice_ratelimited("Booting SMP Windows KVM VM with !XSAVES && XSAVEC. "
+ "If it fails to boot try disabling XSAVEC in the VM config.\n");
+}
+
+void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!vcpu->arch.hyperv_enabled ||
+ hv->xsaves_xsavec_checked)
+ return;
+
+ mutex_lock(&hv->hv_lock);
+ __kvm_hv_xsaves_xsavec_maybe_warn(vcpu);
+ mutex_unlock(&hv->hv_lock);
+}
+
static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
bool host)
{
@@ -2388,7 +2438,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *h
if (!eventfd)
return HV_STATUS_INVALID_PORT_ID;
- eventfd_signal(eventfd, 1);
+ eventfd_signal(eventfd);
return HV_STATUS_SUCCESS;
}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index f83b8db72..923e64903 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -24,6 +24,8 @@
#include <linux/kvm_host.h>
#include "x86.h"
+#ifdef CONFIG_KVM_HYPERV
+
/* "Hv#1" signature */
#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
@@ -105,6 +107,17 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
+static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector)
+{
+ return to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->vec_bitmap);
+}
+
+static inline bool kvm_hv_synic_auto_eoi_set(struct kvm_vcpu *vcpu, int vector)
+{
+ return to_hv_vcpu(vcpu) &&
+ test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap);
+}
+
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
@@ -169,6 +182,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock);
void kvm_hv_request_tsc_page_update(struct kvm *kvm);
+void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu);
+
void kvm_hv_init_vm(struct kvm *kvm);
void kvm_hv_destroy_vm(struct kvm *kvm);
int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
@@ -236,6 +251,77 @@ static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu)
return kvm_hv_get_assist_page(vcpu);
}
+static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu,
+ bool tdp_enabled)
+{
+ /*
+ * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+ * L2's VP_ID upon request from the guest. Make sure we check for
+ * pending entries in the right FIFO upon L1/L2 transition as these
+ * requests are put by other vCPUs asynchronously.
+ */
+ if (to_hv_vcpu(vcpu) && tdp_enabled)
+ kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+}
+
int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu);
+#else /* CONFIG_KVM_HYPERV */
+static inline void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock) {}
+static inline void kvm_hv_request_tsc_page_update(struct kvm *kvm) {}
+static inline void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) {}
+static inline void kvm_hv_init_vm(struct kvm *kvm) {}
+static inline void kvm_hv_destroy_vm(struct kvm *kvm) {}
+static inline int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+static inline void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) {}
+static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ return HV_STATUS_ACCESS_DENIED;
+}
+static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) {}
+static inline void kvm_hv_free_pa_page(struct kvm *kvm) {}
+static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector)
+{
+ return false;
+}
+static inline bool kvm_hv_synic_auto_eoi_set(struct kvm_vcpu *vcpu, int vector)
+{
+ return false;
+}
+static inline void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) {}
+static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled) {}
+static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
+{
+ return vcpu->vcpu_idx;
+}
+static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu, bool tdp_enabled) {}
+#endif /* CONFIG_KVM_HYPERV */
-#endif
+#endif /* __ARCH_X86_KVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index b2c397dd2..ad9ca8a60 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -118,8 +118,10 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v)
if (!lapic_in_kernel(v))
return v->arch.interrupt.nr;
+#ifdef CONFIG_KVM_XEN
if (kvm_xen_has_interrupt(v))
return v->kvm->arch.xen.upcall_vector;
+#endif
if (irqchip_split(v->kvm)) {
int vector = v->arch.pending_external_vector;
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 16d076a1b..68f3f6c26 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -144,7 +144,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
}
-
+#ifdef CONFIG_KVM_HYPERV
static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
@@ -154,6 +154,7 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
}
+#endif
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
@@ -163,9 +164,11 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
int r;
switch (e->type) {
+#ifdef CONFIG_KVM_HYPERV
case KVM_IRQ_ROUTING_HV_SINT:
return kvm_hv_set_sint(e, kvm, irq_source_id, level,
line_status);
+#endif
case KVM_IRQ_ROUTING_MSI:
if (kvm_msi_route_invalid(kvm, e))
@@ -314,11 +317,13 @@ int kvm_set_routing_entry(struct kvm *kvm,
if (kvm_msi_route_invalid(kvm, e))
return -EINVAL;
break;
+#ifdef CONFIG_KVM_HYPERV
case KVM_IRQ_ROUTING_HV_SINT:
e->set = kvm_hv_set_sint;
e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
e->hv_sint.sint = ue->u.hv_sint.sint;
break;
+#endif
#ifdef CONFIG_KVM_XEN
case KVM_IRQ_ROUTING_XEN_EVTCHN:
return kvm_xen_setup_evtchn(kvm, e, ue);
@@ -438,5 +443,7 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
void kvm_arch_irq_routing_update(struct kvm *kvm)
{
+#ifdef CONFIG_KVM_HYPERV
kvm_hv_irq_routing_update(kvm);
+#endif
}
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index be7aeb9b8..e6d149825 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -88,6 +88,12 @@ struct x86_instruction_info {
#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
+/* x86-specific emulation flags */
+#define X86EMUL_F_WRITE BIT(0)
+#define X86EMUL_F_FETCH BIT(1)
+#define X86EMUL_F_IMPLICIT BIT(2)
+#define X86EMUL_F_INVLPG BIT(3)
+
struct x86_emulate_ops {
void (*vm_bugged)(struct x86_emulate_ctxt *ctxt);
/*
@@ -224,6 +230,9 @@ struct x86_emulate_ops {
int (*leave_smm)(struct x86_emulate_ctxt *ctxt);
void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
+
+ gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr,
+ unsigned int flags);
};
/* Type, address-of, and value of an instruction's operand. */
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
index f9ca3e743..eefab3dc8 100644
--- a/arch/x86/kvm/kvm_onhyperv.h
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -10,6 +10,26 @@
int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages);
int hv_flush_remote_tlbs(struct kvm *kvm);
void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
+static inline hpa_t hv_get_partition_assist_page(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Partition assist page is something which Hyper-V running in L0
+ * requires from KVM running in L1 before direct TLB flush for L2
+ * guests can be enabled. KVM doesn't currently use the page but to
+ * comply with TLFS it still needs to be allocated. For now, this
+ * is a single page shared among all vCPUs.
+ */
+ struct hv_partition_assist_pg **p_hv_pa_pg =
+ &vcpu->kvm->arch.hv_pa_pg;
+
+ if (!*p_hv_pa_pg)
+ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
+
+ if (!*p_hv_pa_pg)
+ return INVALID_PAGE;
+
+ return __pa(*p_hv_pa_pg);
+}
#else /* !CONFIG_HYPERV */
static inline int hv_flush_remote_tlbs(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 245b20973..76fcee92b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -41,6 +41,7 @@
#include "ioapic.h"
#include "trace.h"
#include "x86.h"
+#include "xen.h"
#include "cpuid.h"
#include "hyperv.h"
#include "smm.h"
@@ -499,8 +500,10 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
}
/* Check if there are APF page ready requests pending */
- if (enabled)
+ if (enabled) {
kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
+ kvm_xen_sw_enable_lapic(apic->vcpu);
+ }
}
static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
@@ -1475,8 +1478,7 @@ static int apic_set_eoi(struct kvm_lapic *apic)
apic_clear_isr(vector, apic);
apic_update_ppr(apic);
- if (to_hv_vcpu(apic->vcpu) &&
- test_bit(vector, to_hv_synic(apic->vcpu)->vec_bitmap))
+ if (kvm_hv_synic_has_vector(apic->vcpu, vector))
kvm_hv_synic_send_eoi(apic->vcpu, vector);
kvm_ioapic_send_eoi(apic, vector);
@@ -2769,7 +2771,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL);
- if (r && lvt_type == APIC_LVTPC)
+ if (r && lvt_type == APIC_LVTPC &&
+ guest_cpuid_is_intel_compatible(apic->vcpu))
kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED);
return r;
}
@@ -2905,7 +2908,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
*/
apic_clear_irr(vector, apic);
- if (to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap)) {
+ if (kvm_hv_synic_auto_eoi_set(vcpu, vector)) {
/*
* For auto-EOI interrupts, there might be another pending
* interrupt above PPR, so check whether to raise another
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 253fb2093..60f21bb4c 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -146,6 +146,14 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu));
}
+static inline unsigned long kvm_get_active_cr3_lam_bits(struct kvm_vcpu *vcpu)
+{
+ if (!guest_can_use(vcpu, X86_FEATURE_LAM))
+ return 0;
+
+ return kvm_read_cr3(vcpu) & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57);
+}
+
static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
{
u64 root_hpa = vcpu->arch.mmu->root.hpa;
@@ -237,6 +245,13 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
+bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma);
+
+static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm)
+{
+ return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm));
+}
+
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f7901cb4d..982cf41e1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -271,15 +271,11 @@ static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
static inline bool kvm_available_flush_remote_tlbs_range(void)
{
+#if IS_ENABLED(CONFIG_HYPERV)
return kvm_x86_ops.flush_remote_tlbs_range;
-}
-
-int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
-{
- if (!kvm_x86_ops.flush_remote_tlbs_range)
- return -EOPNOTSUPP;
-
- return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
+#else
+ return false;
+#endif
}
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
@@ -795,16 +791,26 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
return &slot->arch.lpage_info[level - 2][idx];
}
+/*
+ * The most significant bit in disallow_lpage tracks whether or not memory
+ * attributes are mixed, i.e. not identical for all gfns at the current level.
+ * The lower order bits are used to refcount other cases where a hugepage is
+ * disallowed, e.g. if KVM has shadow a page table at the gfn.
+ */
+#define KVM_LPAGE_MIXED_FLAG BIT(31)
+
static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
gfn_t gfn, int count)
{
struct kvm_lpage_info *linfo;
- int i;
+ int old, i;
for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
+
+ old = linfo->disallow_lpage;
linfo->disallow_lpage += count;
- WARN_ON_ONCE(linfo->disallow_lpage < 0);
+ WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG);
}
}
@@ -987,7 +993,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
/*
* The head descriptor is empty. If there are no tail descriptors,
- * nullify the rmap head to mark the list as emtpy, else point the rmap
+ * nullify the rmap head to mark the list as empty, else point the rmap
* head at the next descriptor, i.e. the new head.
*/
if (!head_desc->more)
@@ -1382,7 +1388,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
if (READ_ONCE(eager_page_split))
- kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
+ kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K);
kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
@@ -2840,9 +2846,9 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
/*
* Recheck after taking the spinlock, a different vCPU
* may have since marked the page unsync. A false
- * positive on the unprotected check above is not
+ * negative on the unprotected check above is not
* possible as clearing sp->unsync _must_ hold mmu_lock
- * for write, i.e. unsync cannot transition from 0->1
+ * for write, i.e. unsync cannot transition from 1->0
* while this CPU holds mmu_lock for read (or write).
*/
if (READ_ONCE(sp->unsync))
@@ -3056,7 +3062,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
*
* There are several ways to safely use this helper:
*
- * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
+ * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
* consuming it. In this case, mmu_lock doesn't need to be held during the
* lookup, but it does need to be held while checking the MMU notifier.
*
@@ -3120,7 +3126,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
if (pud_none(pud) || !pud_present(pud))
goto out;
- if (pud_large(pud)) {
+ if (pud_leaf(pud)) {
level = PG_LEVEL_1G;
goto out;
}
@@ -3137,9 +3143,9 @@ out:
return level;
}
-int kvm_mmu_max_mapping_level(struct kvm *kvm,
- const struct kvm_memory_slot *slot, gfn_t gfn,
- int max_level)
+static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t gfn, int max_level, bool is_private)
{
struct kvm_lpage_info *linfo;
int host_level;
@@ -3151,6 +3157,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
break;
}
+ if (is_private)
+ return max_level;
+
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
@@ -3158,6 +3167,16 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
return min(host_level, max_level);
}
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ int max_level)
+{
+ bool is_private = kvm_slot_can_be_private(slot) &&
+ kvm_mem_is_private(kvm, gfn);
+
+ return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
+}
+
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
@@ -3178,8 +3197,9 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting.
*/
- fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
- fault->gfn, fault->max_level);
+ fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
+ fault->gfn, fault->max_level,
+ fault->is_private);
if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
return;
@@ -3425,8 +3445,8 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_mmu_page *sp;
int ret = RET_PF_INVALID;
- u64 spte = 0ull;
- u64 *sptep = NULL;
+ u64 spte;
+ u64 *sptep;
uint retry_count = 0;
if (!page_fault_can_be_fast(fault))
@@ -3442,6 +3462,14 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
else
sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
+ /*
+ * It's entirely possible for the mapping to have been zapped
+ * by a different task, but the root page should always be
+ * available as the vCPU holds a reference to its root(s).
+ */
+ if (WARN_ON_ONCE(!sptep))
+ spte = REMOVED_SPTE;
+
if (!is_shadow_present_pte(spte))
break;
@@ -3548,7 +3576,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
return;
if (is_tdp_mmu_page(sp))
- kvm_tdp_mmu_put_root(kvm, sp, false);
+ kvm_tdp_mmu_put_root(kvm, sp);
else if (!--sp->root_count && sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
@@ -3731,7 +3759,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
kvm_page_track_write_tracking_enabled(kvm))
goto out_success;
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot(slot, bkt, slots) {
/*
@@ -3774,7 +3802,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
hpa_t root;
root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
- root_gfn = root_pgd >> PAGE_SHIFT;
+ root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
mmu->root.hpa = kvm_mmu_get_dummy_root();
@@ -4251,6 +4279,55 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
}
+static inline u8 kvm_max_level_for_order(int order)
+{
+ BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
+
+ KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
+ order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
+ order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
+
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
+ return PG_LEVEL_1G;
+
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+ return PG_LEVEL_2M;
+
+ return PG_LEVEL_4K;
+}
+
+static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
+ PAGE_SIZE, fault->write, fault->exec,
+ fault->is_private);
+}
+
+static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ int max_order, r;
+
+ if (!kvm_slot_can_be_private(fault->slot)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
+ &max_order);
+ if (r) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return r;
+ }
+
+ fault->max_level = min(kvm_max_level_for_order(max_order),
+ fault->max_level);
+ fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
+
+ return RET_PF_CONTINUE;
+}
+
static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
@@ -4283,6 +4360,14 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
return RET_PF_EMULATE;
}
+ if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ if (fault->is_private)
+ return kvm_faultin_pfn_private(vcpu, fault);
+
async = false;
fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
fault->write, &fault->map_writable,
@@ -4320,6 +4405,31 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
smp_rmb();
+ /*
+ * Check for a relevant mmu_notifier invalidation event before getting
+ * the pfn from the primary MMU, and before acquiring mmu_lock.
+ *
+ * For mmu_lock, if there is an in-progress invalidation and the kernel
+ * allows preemption, the invalidation task may drop mmu_lock and yield
+ * in response to mmu_lock being contended, which is *very* counter-
+ * productive as this vCPU can't actually make forward progress until
+ * the invalidation completes.
+ *
+ * Retrying now can also avoid unnessary lock contention in the primary
+ * MMU, as the primary MMU doesn't necessarily hold a single lock for
+ * the duration of the invalidation, i.e. faulting in a conflicting pfn
+ * can cause the invalidation to take longer by holding locks that are
+ * needed to complete the invalidation.
+ *
+ * Do the pre-check even for non-preemtible kernels, i.e. even if KVM
+ * will never yield mmu_lock in response to contention, as this vCPU is
+ * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
+ * to detect retry guarantees the worst case latency for the vCPU.
+ */
+ if (fault->slot &&
+ mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+ return RET_PF_RETRY;
+
ret = __kvm_faultin_pfn(vcpu, fault);
if (ret != RET_PF_CONTINUE)
return ret;
@@ -4330,6 +4440,18 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (unlikely(!fault->slot))
return kvm_handle_noslot_fault(vcpu, fault, access);
+ /*
+ * Check again for a relevant mmu_notifier invalidation event purely to
+ * avoid contending mmu_lock. Most invalidations will be detected by
+ * the previous check, but checking is extremely cheap relative to the
+ * overall cost of failing to detect the invalidation until after
+ * mmu_lock is acquired.
+ */
+ if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
+ kvm_release_pfn_clean(fault->pfn);
+ return RET_PF_RETRY;
+ }
+
return RET_PF_CONTINUE;
}
@@ -4357,8 +4479,13 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
return true;
+ /*
+ * Check for a relevant mmu_notifier invalidation event one last time
+ * now that mmu_lock is held, as the "unsafe" checks performed without
+ * holding mmu_lock can get false negatives.
+ */
return fault->slot &&
- mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
+ mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
}
static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -4479,21 +4606,28 @@ out_unlock:
}
#endif
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
{
/*
- * If the guest's MTRRs may be used to compute the "real" memtype,
- * restrict the mapping level to ensure KVM uses a consistent memtype
- * across the entire mapping. If the host MTRRs are ignored by TDP
- * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA
- * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype
- * from the guest's MTRRs so that guest accesses to memory that is
- * DMA'd aren't cached against the guest's wishes.
+ * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
+ * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
+ * to honor the memtype from the guest's MTRRs so that guest accesses
+ * to memory that is DMA'd aren't cached against the guest's wishes.
*
* Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
* e.g. KVM will force UC memtype for host MMIO.
*/
- if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
+ return vm_has_noncoherent_dma && shadow_memtype_mask;
+}
+
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ /*
+ * If the guest's MTRRs may be used to compute the "real" memtype,
+ * restrict the mapping level to ensure KVM uses a consistent memtype
+ * across the entire mapping.
+ */
+ if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
gfn_t base = gfn_round_for_level(fault->gfn,
@@ -4788,7 +4922,7 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
context->cpu_role.base.level, is_efer_nx(context),
guest_can_use(vcpu, X86_FEATURE_GBPAGES),
is_cr4_pse(context),
- guest_cpuid_is_amd_or_hygon(vcpu));
+ guest_cpuid_is_amd_compatible(vcpu));
}
static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
@@ -6213,7 +6347,7 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
if (!kvm_memslots_have_rmaps(kvm))
return flush;
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
@@ -6245,7 +6379,9 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
write_lock(&kvm->mmu_lock);
- kvm_mmu_invalidate_begin(kvm, 0, -1ul);
+ kvm_mmu_invalidate_begin(kvm);
+
+ kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
@@ -6255,7 +6391,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
if (flush)
kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
- kvm_mmu_invalidate_end(kvm, 0, -1ul);
+ kvm_mmu_invalidate_end(kvm);
write_unlock(&kvm->mmu_lock);
}
@@ -6529,7 +6665,7 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
/*
- * A TLB flush is unnecessary at this point for the same resons as in
+ * A TLB flush is unnecessary at this point for the same reasons as in
* kvm_mmu_slot_try_split_huge_pages().
*/
}
@@ -6708,7 +6844,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
* modifier prior to checking for a wrap of the MMIO generation so
* that a wrap in any address space is detected.
*/
- gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
+ gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1);
/*
* The very rare case: if the MMIO generation number has wrapped,
@@ -6785,11 +6921,7 @@ static unsigned long mmu_shrink_count(struct shrinker *shrink,
return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}
-static struct shrinker mmu_shrinker = {
- .count_objects = mmu_shrink_count,
- .scan_objects = mmu_shrink_scan,
- .seeks = DEFAULT_SEEKS * 10,
-};
+static struct shrinker *mmu_shrinker;
static void mmu_destroy_caches(void)
{
@@ -6922,10 +7054,16 @@ int kvm_mmu_vendor_module_init(void)
if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
goto out;
- ret = register_shrinker(&mmu_shrinker, "x86-mmu");
- if (ret)
+ mmu_shrinker = shrinker_alloc(0, "x86-mmu");
+ if (!mmu_shrinker)
goto out_shrinker;
+ mmu_shrinker->count_objects = mmu_shrink_count;
+ mmu_shrinker->scan_objects = mmu_shrink_scan;
+ mmu_shrinker->seeks = DEFAULT_SEEKS * 10;
+
+ shrinker_register(mmu_shrinker);
+
return 0;
out_shrinker:
@@ -6947,7 +7085,7 @@ void kvm_mmu_vendor_module_exit(void)
{
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
- unregister_shrinker(&mmu_shrinker);
+ shrinker_free(mmu_shrinker);
}
/*
@@ -7159,3 +7297,164 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
if (kvm->arch.nx_huge_page_recovery_thread)
kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
}
+
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range)
+{
+ /*
+ * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only
+ * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
+ * can simply ignore such slots. But if userspace is making memory
+ * PRIVATE, then KVM must prevent the guest from accessing the memory
+ * as shared. And if userspace is making memory SHARED and this point
+ * is reached, then at least one page within the range was previously
+ * PRIVATE, i.e. the slot's possible hugepage ranges are changing.
+ * Zapping SPTEs in this case ensures KVM will reassess whether or not
+ * a hugepage can be used for affected ranges.
+ */
+ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
+ return false;
+
+ return kvm_unmap_gfn_range(kvm, range);
+}
+
+static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
+}
+
+static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, int level, unsigned long attrs)
+{
+ const unsigned long start = gfn;
+ const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
+
+ if (level == PG_LEVEL_2M)
+ return kvm_range_has_memory_attributes(kvm, start, end, attrs);
+
+ for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
+ if (hugepage_test_mixed(slot, gfn, level - 1) ||
+ attrs != kvm_get_memory_attributes(kvm, gfn))
+ return false;
+ }
+ return true;
+}
+
+bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range)
+{
+ unsigned long attrs = range->arg.attributes;
+ struct kvm_memory_slot *slot = range->slot;
+ int level;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * Calculate which ranges can be mapped with hugepages even if the slot
+ * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over
+ * a range that has PRIVATE GFNs, and conversely converting a range to
+ * SHARED may now allow hugepages.
+ */
+ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
+ return false;
+
+ /*
+ * The sequence matters here: upper levels consume the result of lower
+ * level's scanning.
+ */
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+ gfn_t gfn = gfn_round_for_level(range->start, level);
+
+ /* Process the head page if it straddles the range. */
+ if (gfn != range->start || gfn + nr_pages > range->end) {
+ /*
+ * Skip mixed tracking if the aligned gfn isn't covered
+ * by the memslot, KVM can't use a hugepage due to the
+ * misaligned address regardless of memory attributes.
+ */
+ if (gfn >= slot->base_gfn &&
+ gfn + nr_pages <= slot->base_gfn + slot->npages) {
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ gfn += nr_pages;
+ }
+
+ /*
+ * Pages entirely covered by the range are guaranteed to have
+ * only the attributes which were just set.
+ */
+ for ( ; gfn + nr_pages <= range->end; gfn += nr_pages)
+ hugepage_clear_mixed(slot, gfn, level);
+
+ /*
+ * Process the last tail page if it straddles the range and is
+ * contained by the memslot. Like the head page, KVM can't
+ * create a hugepage if the slot size is misaligned.
+ */
+ if (gfn < range->end &&
+ (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) {
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ }
+ return false;
+}
+
+void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ int level;
+
+ if (!kvm_arch_has_private_mem(kvm))
+ return;
+
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ /*
+ * Don't bother tracking mixed attributes for pages that can't
+ * be huge due to alignment, i.e. process only pages that are
+ * entirely contained by the memslot.
+ */
+ gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level);
+ gfn_t start = gfn_round_for_level(slot->base_gfn, level);
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+ gfn_t gfn;
+
+ if (start < slot->base_gfn)
+ start += nr_pages;
+
+ /*
+ * Unlike setting attributes, every potential hugepage needs to
+ * be manually checked as the attributes may already be mixed.
+ */
+ for (gfn = start; gfn < end; gfn += nr_pages) {
+ unsigned long attrs = kvm_get_memory_attributes(kvm, gfn);
+
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ }
+}
+#endif
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index decc1f153..0669a8a66 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -13,6 +13,7 @@
#endif
/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
+#define __PT_BASE_ADDR_MASK GENMASK_ULL(51, 12)
#define __PT_LEVEL_SHIFT(level, bits_per_level) \
(PAGE_SHIFT + ((level) - 1) * (bits_per_level))
#define __PT_INDEX(address, level, bits_per_level) \
@@ -201,6 +202,7 @@ struct kvm_page_fault {
/* Derived from mmu and global state. */
const bool is_tdp;
+ const bool is_private;
const bool nx_huge_page_workaround_enabled;
/*
@@ -296,6 +298,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
.max_level = KVM_MAX_HUGEPAGE_LEVEL,
.req_level = PG_LEVEL_4K,
.goal_level = PG_LEVEL_4K,
+ .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
};
int r;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index c85255073..4d4e98fe4 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -62,7 +62,7 @@
#endif
/* Common logic, but per-type values. These also need to be undefined. */
-#define PT_BASE_ADDR_MASK ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
+#define PT_BASE_ADDR_MASK ((pt_element_t)__PT_BASE_ADDR_MASK)
#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS)
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index bd30ebfb2..04c247bfe 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -146,7 +146,7 @@ static bool try_step_up(struct tdp_iter *iter)
* Step to the next SPTE in a pre-order traversal of the paging structure.
* To get to the next SPTE, the iterator either steps down towards the goal
* GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
- * highter GFN.
+ * higher GFN.
*
* The basic algorithm is as follows:
* 1. If the current SPTE is a non-last-level SPTE, step down into the page
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6cd4dd631..953082bf9 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -73,11 +73,8 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
tdp_mmu_free_sp(sp);
}
-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
- bool shared)
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
- kvm_lockdep_assert_mmu_lock_held(kvm, shared);
-
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
return;
@@ -106,10 +103,16 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
*/
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
struct kvm_mmu_page *prev_root,
- bool shared, bool only_valid)
+ bool only_valid)
{
struct kvm_mmu_page *next_root;
+ /*
+ * While the roots themselves are RCU-protected, fields such as
+ * role.invalid are protected by mmu_lock.
+ */
+ lockdep_assert_held(&kvm->mmu_lock);
+
rcu_read_lock();
if (prev_root)
@@ -132,7 +135,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
rcu_read_unlock();
if (prev_root)
- kvm_tdp_mmu_put_root(kvm, prev_root, shared);
+ kvm_tdp_mmu_put_root(kvm, prev_root);
return next_root;
}
@@ -144,26 +147,22 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* recent root. (Unless keeping a live reference is desirable.)
*
* If shared is set, this function is operating under the MMU lock in read
- * mode. In the unlikely event that this thread must free a root, the lock
- * will be temporarily dropped and reacquired in write mode.
+ * mode.
*/
-#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
- for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
- _root; \
- _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
- if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
- kvm_mmu_page_as_id(_root) != _as_id) { \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
+ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
+ if (kvm_mmu_page_as_id(_root) != _as_id) { \
} else
-#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
- __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
+#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true)
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \
- for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false); \
- _root; \
- _root = tdp_mmu_next_root(_kvm, _root, _shared, false)) \
- if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \
- } else
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, false); \
+ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, false))
/*
* Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
@@ -276,28 +275,18 @@ static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
*
* @kvm: kvm instance
* @sp: the page to be removed
- * @shared: This operation may not be running under the exclusive use of
- * the MMU lock and the operation must synchronize with other
- * threads that might be adding or removing pages.
*/
-static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
- bool shared)
+static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
tdp_unaccount_mmu_page(kvm, sp);
if (!sp->nx_huge_page_disallowed)
return;
- if (shared)
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
- else
- lockdep_assert_held_write(&kvm->mmu_lock);
-
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
sp->nx_huge_page_disallowed = false;
untrack_possible_nx_huge_page(kvm, sp);
-
- if (shared)
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
/**
@@ -326,7 +315,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
trace_kvm_mmu_prepare_zap_page(sp);
- tdp_mmu_unlink_sp(kvm, sp, shared);
+ tdp_mmu_unlink_sp(kvm, sp);
for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
tdp_ptep_t sptep = pt + i;
@@ -832,7 +821,8 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root_yield_safe(kvm, root, false)
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ for_each_tdp_mmu_root_yield_safe(kvm, root)
flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
return flush;
@@ -854,7 +844,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
* is being destroyed or the userspace VMM has exited. In both cases,
* KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
*/
- for_each_tdp_mmu_root_yield_safe(kvm, root, false)
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ for_each_tdp_mmu_root_yield_safe(kvm, root)
tdp_mmu_zap_root(kvm, root, false);
}
@@ -868,7 +859,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
read_lock(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
+ for_each_tdp_mmu_root_yield_safe(kvm, root) {
if (!root->tdp_mmu_scheduled_root_to_zap)
continue;
@@ -891,7 +882,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
* the root must be reachable by mmu_notifiers while it's being
* zapped
*/
- kvm_tdp_mmu_put_root(kvm, root, true);
+ kvm_tdp_mmu_put_root(kvm, root);
}
read_unlock(&kvm->mmu_lock);
@@ -1125,7 +1116,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
{
struct kvm_mmu_page *root;
- __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false)
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
range->may_block, flush);
@@ -1314,7 +1305,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
- for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages, min_level);
@@ -1346,6 +1337,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
{
struct kvm_mmu_page *sp;
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
/*
* Since we are allocating while under the MMU lock we have to be
* careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
@@ -1496,16 +1489,25 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
int r = 0;
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
-
- for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
if (r) {
- kvm_tdp_mmu_put_root(kvm, root, shared);
+ kvm_tdp_mmu_put_root(kvm, root);
break;
}
}
}
+static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
+{
+ /*
+ * All TDP MMU shadow pages share the same role as their root, aside
+ * from level, so it is valid to key off any shadow page to determine if
+ * write protection is needed for an entire tree.
+ */
+ return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled();
+}
+
/*
* Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
* AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
@@ -1516,21 +1518,23 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
{
- u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
+ const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
+ shadow_dirty_mask;
struct tdp_iter iter;
bool spte_set = false;
rcu_read_lock();
- tdp_root_for_each_leaf_pte(iter, root, start, end) {
+ tdp_root_for_each_pte(iter, root, start, end) {
retry:
- if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level))
continue;
- if (!is_shadow_present_pte(iter.old_spte))
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
- KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+ KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
spte_ad_need_write_protect(iter.old_spte));
if (!(iter.old_spte & dbit))
@@ -1560,8 +1564,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
bool spte_set = false;
lockdep_assert_held_read(&kvm->mmu_lock);
-
- for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
@@ -1578,8 +1581,8 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t gfn, unsigned long mask, bool wrprot)
{
- u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
- shadow_dirty_mask;
+ const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
+ shadow_dirty_mask;
struct tdp_iter iter;
lockdep_assert_held_write(&kvm->mmu_lock);
@@ -1591,7 +1594,7 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
if (!mask)
break;
- KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+ KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
spte_ad_need_write_protect(iter.old_spte));
if (iter.level > PG_LEVEL_4K ||
@@ -1695,8 +1698,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
struct kvm_mmu_page *root;
lockdep_assert_held_read(&kvm->mmu_lock);
-
- for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
zap_collapsible_spte_range(kvm, root, slot);
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 733a3aef3..20d97aa46 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -17,8 +17,7 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
return refcount_inc_not_zero(&root->tdp_mmu_root_count);
}
-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
- bool shared);
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 3eb6e7f47..a67c28a56 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -320,7 +320,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
gfn_t start, end;
- if (!tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
return;
if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index dc8e8e907..2ab2d5213 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -127,9 +127,9 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
/*
- * Ignore overflow events for counters that are scheduled to be
- * reprogrammed, e.g. if a PMI for the previous event races with KVM's
- * handling of a related guest WRMSR.
+ * Ignore asynchronous overflow events for counters that are scheduled
+ * to be reprogrammed, e.g. if a PMI for the previous event races with
+ * KVM's handling of a related guest WRMSR.
*/
if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
return;
@@ -161,6 +161,15 @@ static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
return 1;
}
+static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
+{
+ u64 sample_period = (-counter_value) & pmc_bitmask(pmc);
+
+ if (!sample_period)
+ sample_period = pmc_bitmask(pmc) + 1;
+ return sample_period;
+}
+
static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
bool exclude_user, bool exclude_kernel,
bool intr)
@@ -215,17 +224,30 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
return 0;
}
-static void pmc_pause_counter(struct kvm_pmc *pmc)
+static bool pmc_pause_counter(struct kvm_pmc *pmc)
{
u64 counter = pmc->counter;
-
- if (!pmc->perf_event || pmc->is_paused)
- return;
+ u64 prev_counter;
/* update counter, reset event value to avoid redundant accumulation */
- counter += perf_event_pause(pmc->perf_event, true);
+ if (pmc->perf_event && !pmc->is_paused)
+ counter += perf_event_pause(pmc->perf_event, true);
+
+ /*
+ * Snapshot the previous counter *after* accumulating state from perf.
+ * If overflow already happened, hardware (via perf) is responsible for
+ * generating a PMI. KVM just needs to detect overflow on emulated
+ * counter events that haven't yet been processed.
+ */
+ prev_counter = counter & pmc_bitmask(pmc);
+
+ counter += pmc->emulated_counter;
pmc->counter = counter & pmc_bitmask(pmc);
+
+ pmc->emulated_counter = 0;
pmc->is_paused = true;
+
+ return pmc->counter < prev_counter;
}
static bool pmc_resume_counter(struct kvm_pmc *pmc)
@@ -268,6 +290,33 @@ static void pmc_stop_counter(struct kvm_pmc *pmc)
}
}
+static void pmc_update_sample_period(struct kvm_pmc *pmc)
+{
+ if (!pmc->perf_event || pmc->is_paused ||
+ !is_sampling_event(pmc->perf_event))
+ return;
+
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, pmc->counter));
+}
+
+void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
+{
+ /*
+ * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
+ * read-modify-write. Adjust the counter value so that its value is
+ * relative to the current count, as reading the current count from
+ * perf is faster than pausing and repgrogramming the event in order to
+ * reset it to '0'. Note, this very sneakily offsets the accumulated
+ * emulated count too, by using pmc_read_counter()!
+ */
+ pmc->emulated_counter = 0;
+ pmc->counter += val - pmc_read_counter(pmc);
+ pmc->counter &= pmc_bitmask(pmc);
+ pmc_update_sample_period(pmc);
+}
+EXPORT_SYMBOL_GPL(pmc_write_counter);
+
static int filter_cmp(const void *pa, const void *pb, u64 mask)
{
u64 a = *(u64 *)pa & mask;
@@ -401,14 +450,15 @@ static void reprogram_counter(struct kvm_pmc *pmc)
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
u64 eventsel = pmc->eventsel;
u64 new_config = eventsel;
+ bool emulate_overflow;
u8 fixed_ctr_ctrl;
- pmc_pause_counter(pmc);
+ emulate_overflow = pmc_pause_counter(pmc);
if (!pmc_event_is_allowed(pmc))
goto reprogram_complete;
- if (pmc->counter < pmc->prev_counter)
+ if (emulate_overflow)
__kvm_perf_overflow(pmc, false);
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
@@ -448,7 +498,6 @@ static void reprogram_counter(struct kvm_pmc *pmc)
reprogram_complete:
clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
- pmc->prev_counter = 0;
}
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
@@ -657,7 +706,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
-void kvm_pmu_reset(struct kvm_vcpu *vcpu)
+static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
@@ -674,6 +723,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu)
pmc_stop_counter(pmc);
pmc->counter = 0;
+ pmc->emulated_counter = 0;
if (pmc_is_gp(pmc))
pmc->eventsel = 0;
@@ -691,6 +741,8 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu)
*/
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
return;
@@ -700,8 +752,34 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
*/
kvm_pmu_reset(vcpu);
- bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+ pmu->version = 0;
+ pmu->nr_arch_gp_counters = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->counter_bitmask[KVM_PMC_GP] = 0;
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->raw_event_mask = X86_RAW_EVENT_MASK;
+ pmu->global_ctrl_mask = ~0ull;
+ pmu->global_status_mask = ~0ull;
+ pmu->fixed_ctr_ctrl_mask = ~0ull;
+ pmu->pebs_enable_mask = ~0ull;
+ pmu->pebs_data_cfg_mask = ~0ull;
+ bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+
+ if (!vcpu->kvm->arch.enable_pmu)
+ return;
+
static_call(kvm_x86_pmu_refresh)(vcpu);
+
+ /*
+ * At RESET, both Intel and AMD CPUs set all enable bits for general
+ * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
+ * was written for v1 PMUs don't unknowingly leave GP counters disabled
+ * in the global controls). Emulate that behavior when refreshing the
+ * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL.
+ */
+ if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
+ pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
}
void kvm_pmu_init(struct kvm_vcpu *vcpu)
@@ -710,8 +788,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
memset(pmu, 0, sizeof(*pmu));
static_call(kvm_x86_pmu_init)(vcpu);
- pmu->event_count = 0;
- pmu->need_cleanup = false;
kvm_pmu_refresh(vcpu);
}
@@ -747,8 +823,7 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
- pmc->prev_counter = pmc->counter;
- pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
+ pmc->emulated_counter++;
kvm_pmu_request_counter_reprogram(pmc);
}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index a46aa9b25..7caeb3d8d 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -66,7 +66,8 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
{
u64 counter, enabled, running;
- counter = pmc->counter;
+ counter = pmc->counter + pmc->emulated_counter;
+
if (pmc->perf_event && !pmc->is_paused)
counter += perf_event_read_value(pmc->perf_event,
&enabled, &running);
@@ -74,11 +75,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
return counter & pmc_bitmask(pmc);
}
-static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
-{
- pmc->counter += val - pmc_read_counter(pmc);
- pmc->counter &= pmc_bitmask(pmc);
-}
+void pmc_write_counter(struct kvm_pmc *pmc, u64 val);
static inline bool pmc_is_gp(struct kvm_pmc *pmc)
{
@@ -128,25 +125,6 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
return NULL;
}
-static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
-{
- u64 sample_period = (-counter_value) & pmc_bitmask(pmc);
-
- if (!sample_period)
- sample_period = pmc_bitmask(pmc) + 1;
- return sample_period;
-}
-
-static inline void pmc_update_sample_period(struct kvm_pmc *pmc)
-{
- if (!pmc->perf_event || pmc->is_paused ||
- !is_sampling_event(pmc->perf_event))
- return;
-
- perf_event_period(pmc->perf_event,
- get_sample_period(pmc, pmc->counter));
-}
-
static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -243,7 +221,6 @@ bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
-void kvm_pmu_reset(struct kvm_vcpu *vcpu);
void kvm_pmu_init(struct kvm_vcpu *vcpu);
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu);
void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index b81650678..2f4e15508 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -16,6 +16,7 @@ enum kvm_only_cpuid_leafs {
CPUID_7_1_EDX,
CPUID_8000_0007_EDX,
CPUID_8000_0022_EAX,
+ CPUID_7_2_EDX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -46,6 +47,14 @@ enum kvm_only_cpuid_leafs {
#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8)
#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
+/* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */
+#define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0)
+#define X86_FEATURE_IPRED_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 1)
+#define KVM_X86_FEATURE_RRSBA_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 2)
+#define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3)
+#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4)
+#define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5)
+
/* CPUID level 0x80000007 (EDX). */
#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
@@ -80,6 +89,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX},
[CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
[CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX},
+ [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX},
};
/*
@@ -92,10 +102,12 @@ static const struct cpuid_reg reverse_cpuid[] = {
*/
static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
{
+ BUILD_BUG_ON(NR_CPUID_WORDS != NCAPINTS);
BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
BUILD_BUG_ON(x86_leaf == CPUID_LNX_3);
BUILD_BUG_ON(x86_leaf == CPUID_LNX_4);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_5);
BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
}
@@ -106,18 +118,20 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
*/
static __always_inline u32 __feature_translate(int x86_feature)
{
- if (x86_feature == X86_FEATURE_SGX1)
- return KVM_X86_FEATURE_SGX1;
- else if (x86_feature == X86_FEATURE_SGX2)
- return KVM_X86_FEATURE_SGX2;
- else if (x86_feature == X86_FEATURE_SGX_EDECCSSA)
- return KVM_X86_FEATURE_SGX_EDECCSSA;
- else if (x86_feature == X86_FEATURE_CONSTANT_TSC)
- return KVM_X86_FEATURE_CONSTANT_TSC;
- else if (x86_feature == X86_FEATURE_PERFMON_V2)
- return KVM_X86_FEATURE_PERFMON_V2;
-
- return x86_feature;
+#define KVM_X86_TRANSLATE_FEATURE(f) \
+ case X86_FEATURE_##f: return KVM_X86_FEATURE_##f
+
+ switch (x86_feature) {
+ KVM_X86_TRANSLATE_FEATURE(SGX1);
+ KVM_X86_TRANSLATE_FEATURE(SGX2);
+ KVM_X86_TRANSLATE_FEATURE(SGX_EDECCSSA);
+ KVM_X86_TRANSLATE_FEATURE(CONSTANT_TSC);
+ KVM_X86_TRANSLATE_FEATURE(PERFMON_V2);
+ KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL);
+ KVM_X86_TRANSLATE_FEATURE(BHI_CTRL);
+ default:
+ return x86_feature;
+ }
}
static __always_inline u32 __feature_leaf(int x86_feature)
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index b42111a24..dc3d95fdc 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -324,7 +324,6 @@ void enter_smm(struct kvm_vcpu *vcpu)
cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
static_call(kvm_x86_set_cr0)(vcpu, cr0);
- vcpu->arch.cr0 = cr0;
static_call(kvm_x86_set_cr4)(vcpu, 0);
diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h
index 02f4784b5..d3f8bfc05 100644
--- a/arch/x86/kvm/svm/hyperv.h
+++ b/arch/x86/kvm/svm/hyperv.h
@@ -11,6 +11,7 @@
#include "../hyperv.h"
#include "svm.h"
+#ifdef CONFIG_KVM_HYPERV
static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -41,5 +42,13 @@ static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
}
void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
+#else /* CONFIG_KVM_HYPERV */
+static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) {}
+static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) {}
+#endif /* CONFIG_KVM_HYPERV */
#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 60891b9ce..dee62362a 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -187,7 +187,6 @@ void recalc_intercepts(struct vcpu_svm *svm)
*/
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
- struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
int i;
/*
@@ -198,11 +197,16 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
* - Nested hypervisor (L1) is using Hyper-V emulation interface and
* tells KVM (L0) there were no changes in MSR bitmap for L2.
*/
- if (!svm->nested.force_msr_bitmap_recalc &&
- kvm_hv_hypercall_enabled(&svm->vcpu) &&
- hve->hv_enlightenments_control.msr_bitmap &&
- (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
- goto set_msrpm_base_pa;
+#ifdef CONFIG_KVM_HYPERV
+ if (!svm->nested.force_msr_bitmap_recalc) {
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+
+ if (kvm_hv_hypercall_enabled(&svm->vcpu) &&
+ hve->hv_enlightenments_control.msr_bitmap &&
+ (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
+ goto set_msrpm_base_pa;
+ }
+#endif
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return true;
@@ -230,7 +234,9 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
svm->nested.force_msr_bitmap_recalc = false;
+#ifdef CONFIG_KVM_HYPERV
set_msrpm_base_pa:
+#endif
svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
return true;
@@ -296,7 +302,7 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
if (CC(!(save->cr4 & X86_CR4_PAE)) ||
CC(!(save->cr0 & X86_CR0_PE)) ||
- CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
+ CC(!kvm_vcpu_is_legal_cr3(vcpu, save->cr3)))
return false;
}
@@ -363,12 +369,14 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
to->msrpm_base_pa &= ~0x0fffULL;
to->iopm_base_pa &= ~0x0fffULL;
+#ifdef CONFIG_KVM_HYPERV
/* Hyper-V extensions (Enlightened VMCB) */
if (kvm_hv_hypercall_enabled(vcpu)) {
to->clean = from->clean;
memcpy(&to->hv_enlightenments, &from->hv_enlightenments,
sizeof(to->hv_enlightenments));
}
+#endif
}
void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
@@ -472,14 +480,8 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
{
- /*
- * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
- * L2's VP_ID upon request from the guest. Make sure we check for
- * pending entries in the right FIFO upon L1/L2 transition as these
- * requests are put by other vCPUs asynchronously.
- */
- if (to_hv_vcpu(vcpu) && npt_enabled)
- kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+ /* Handle pending Hyper-V TLB flush requests */
+ kvm_hv_nested_transtion_tlb_flush(vcpu, npt_enabled);
/*
* TODO: optimize unconditional TLB flush/MMU sync. A partial list of
@@ -505,7 +507,7 @@ static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
bool nested_npt, bool reload_pdptrs)
{
- if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
+ if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3)))
return -EINVAL;
if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) &&
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 3fd47de14..b6a7ad4d6 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -161,7 +161,6 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
if (pmc) {
pmc_write_counter(pmc, data);
- pmc_update_sample_period(pmc);
return 0;
}
/* MSR_EVNTSELn */
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 4900c0780..86088d125 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -57,7 +57,7 @@ static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);
/* enable/disable SEV-ES DebugSwap support */
-static bool sev_es_debug_swap_enabled = true;
+static bool sev_es_debug_swap_enabled = false;
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
#else
#define sev_enabled false
@@ -84,9 +84,10 @@ struct enc_region {
};
/* Called with the sev_bitmap_lock held, or on shutdown */
-static int sev_flush_asids(int min_asid, int max_asid)
+static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
{
- int ret, asid, error = 0;
+ int ret, error = 0;
+ unsigned int asid;
/* Check if there are any ASIDs to reclaim before performing a flush */
asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
@@ -116,7 +117,7 @@ static inline bool is_mirroring_enc_context(struct kvm *kvm)
}
/* Must be called with the sev_bitmap_lock held */
-static bool __sev_recycle_asids(int min_asid, int max_asid)
+static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid)
{
if (sev_flush_asids(min_asid, max_asid))
return false;
@@ -143,8 +144,20 @@ static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
static int sev_asid_new(struct kvm_sev_info *sev)
{
- int asid, min_asid, max_asid, ret;
+ /*
+ * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
+ * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
+ * Note: min ASID can end up larger than the max if basic SEV support is
+ * effectively disabled by disallowing use of ASIDs for SEV guests.
+ */
+ unsigned int min_asid = sev->es_active ? 1 : min_sev_asid;
+ unsigned int max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
+ unsigned int asid;
bool retry = true;
+ int ret;
+
+ if (min_asid > max_asid)
+ return -ENOTTY;
WARN_ON(sev->misc_cg);
sev->misc_cg = get_current_misc_cg();
@@ -157,12 +170,6 @@ static int sev_asid_new(struct kvm_sev_info *sev)
mutex_lock(&sev_bitmap_lock);
- /*
- * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
- * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
- */
- min_asid = sev->es_active ? 1 : min_sev_asid;
- max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
again:
asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
if (asid > max_asid) {
@@ -187,7 +194,7 @@ e_uncharge:
return ret;
}
-static int sev_get_asid(struct kvm *kvm)
+static unsigned int sev_get_asid(struct kvm *kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -284,8 +291,8 @@ e_no_asid:
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
+ unsigned int asid = sev_get_asid(kvm);
struct sev_data_activate activate;
- int asid = sev_get_asid(kvm);
int ret;
/* activate ASID on the given handle */
@@ -612,8 +619,11 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->xss = svm->vcpu.arch.ia32_xss;
save->dr6 = svm->vcpu.arch.dr6;
- if (sev_es_debug_swap_enabled)
+ if (sev_es_debug_swap_enabled) {
save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP;
+ pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. "
+ "This will not work starting with Linux 6.10\n");
+ }
pr_debug("Virtual Machine Save Area (VMSA):\n");
print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
@@ -1975,20 +1985,22 @@ int sev_mem_enc_register_region(struct kvm *kvm,
goto e_free;
}
- region->uaddr = range->addr;
- region->size = range->size;
-
- list_add_tail(&region->list, &sev->regions_list);
- mutex_unlock(&kvm->lock);
-
/*
* The guest may change the memory encryption attribute from C=0 -> C=1
* or vice versa for this memory range. Lets make sure caches are
* flushed to ensure that guest data gets written into memory with
- * correct C-bit.
+ * correct C-bit. Note, this must be done before dropping kvm->lock,
+ * as region and its array of pages can be freed by a different task
+ * once kvm->lock is released.
*/
sev_clflush_pages(region->pages, region->npages);
+ region->uaddr = range->addr;
+ region->size = range->size;
+
+ list_add_tail(&region->list, &sev->regions_list);
+ mutex_unlock(&kvm->lock);
+
return ret;
e_free:
@@ -2191,10 +2203,13 @@ void __init sev_hardware_setup(void)
/*
* SEV must obviously be supported in hardware. Sanity check that the
* CPU supports decode assists, which is mandatory for SEV guests to
- * support instruction emulation.
+ * support instruction emulation. Ditto for flushing by ASID, as SEV
+ * guests are bound to a single ASID, i.e. KVM can't rotate to a new
+ * ASID to effect a TLB flush.
*/
if (!boot_cpu_has(X86_FEATURE_SEV) ||
- WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)))
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) ||
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID)))
goto out;
/* Retrieve SEV CPUID information */
@@ -2229,8 +2244,10 @@ void __init sev_hardware_setup(void)
goto out;
}
- sev_asid_count = max_sev_asid - min_sev_asid + 1;
- WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count));
+ if (min_sev_asid <= max_sev_asid) {
+ sev_asid_count = max_sev_asid - min_sev_asid + 1;
+ WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count));
+ }
sev_supported = true;
/* SEV-ES support requested? */
@@ -2261,7 +2278,9 @@ void __init sev_hardware_setup(void)
out:
if (boot_cpu_has(X86_FEATURE_SEV))
pr_info("SEV %s (ASIDs %u - %u)\n",
- sev_supported ? "enabled" : "disabled",
+ sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" :
+ "unusable" :
+ "disabled",
min_sev_asid, max_sev_asid);
if (boot_cpu_has(X86_FEATURE_SEV_ES))
pr_info("SEV-ES %s (ASIDs %u - %u)\n",
@@ -2309,7 +2328,7 @@ int sev_cpu_init(struct svm_cpu_data *sd)
*/
static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
{
- int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid;
+ unsigned int asid = sev_get_asid(vcpu->kvm);
/*
* Note! The address must be a kernel address, as regular page walk
@@ -2627,7 +2646,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm)
void pre_sev_run(struct vcpu_svm *svm, int cpu)
{
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
- int asid = sev_get_asid(svm->vcpu.kvm);
+ unsigned int asid = sev_get_asid(svm->vcpu.kvm);
/* Assign the asid allocated with this SEV guest */
svm->asid = asid;
@@ -2972,6 +2991,25 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
}
+
+ /*
+ * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
+ * the host/guest supports its use.
+ *
+ * guest_can_use() checks a number of requirements on the host/guest to
+ * ensure that MSR_IA32_XSS is available, but it might report true even
+ * if X86_FEATURE_XSAVES isn't configured in the guest to ensure host
+ * MSR_IA32_XSS is always properly restored. For SEV-ES, it is better
+ * to further check that the guest CPUID actually supports
+ * X86_FEATURE_XSAVES so that accesses to MSR_IA32_XSS by misbehaved
+ * guests will still get intercepted and caught in the normal
+ * kvm_emulate_rdmsr()/kvm_emulated_wrmsr() paths.
+ */
+ if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1);
+ else
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0);
}
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 77f1eeefc..e90b429c8 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -103,6 +103,7 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
{ .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_IA32_XSS, .always = false },
{ .index = MSR_EFER, .always = false },
{ .index = MSR_IA32_CR_PAT, .always = false },
{ .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
@@ -199,7 +200,7 @@ module_param_named(npt, npt_enabled, bool, 0444);
/* allow nested virtualization in KVM/SVM */
static int nested = true;
-module_param(nested, int, S_IRUGO);
+module_param(nested, int, 0444);
/* enable/disable Next RIP Save */
int nrips = true;
@@ -364,8 +365,6 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
}
-static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len);
static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
bool commit_side_effects)
@@ -386,14 +385,6 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
}
if (!svm->next_rip) {
- /*
- * FIXME: Drop this when kvm_emulate_instruction() does the
- * right thing and treats "can't emulate" as outright failure
- * for EMULTYPE_SKIP.
- */
- if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0))
- return 0;
-
if (unlikely(!commit_side_effects))
old_rflags = svm->vmcb->save.rflags;
@@ -531,8 +522,6 @@ static bool __kvm_is_svm_supported(void)
int cpu = smp_processor_id();
struct cpuinfo_x86 *c = &cpu_data(cpu);
- u64 vm_cr;
-
if (c->x86_vendor != X86_VENDOR_AMD &&
c->x86_vendor != X86_VENDOR_HYGON) {
pr_err("CPU %d isn't AMD or Hygon\n", cpu);
@@ -549,12 +538,6 @@ static bool __kvm_is_svm_supported(void)
return false;
}
- rdmsrl(MSR_VM_CR, vm_cr);
- if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
- pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
- return false;
- }
-
return true;
}
@@ -2204,12 +2187,6 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
struct vcpu_svm *svm = to_svm(vcpu);
- /*
- * The VM save area has already been encrypted so it
- * cannot be reinitialized - just terminate.
- */
- if (sev_es_guest(vcpu->kvm))
- return -EINVAL;
/*
* VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
@@ -2218,9 +2195,14 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
* userspace. At a platform view, INIT is acceptable behavior as
* there exist bare metal platforms that automatically INIT the CPU
* in response to shutdown.
+ *
+ * The VM save area for SEV-ES guests has already been encrypted so it
+ * cannot be reinitialized, i.e. synthesizing INIT is futile.
*/
- clear_page(svm->vmcb);
- kvm_vcpu_reset(vcpu, true);
+ if (!sev_es_guest(vcpu->kvm)) {
+ clear_page(svm->vmcb);
+ kvm_vcpu_reset(vcpu, true);
+ }
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
@@ -3581,8 +3563,15 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
if (svm->nmi_l1_to_l2)
return;
- svm->nmi_masked = true;
- svm_set_iret_intercept(svm);
+ /*
+ * No need to manually track NMI masking when vNMI is enabled, hardware
+ * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the
+ * case where software directly injects an NMI.
+ */
+ if (!is_vnmi_enabled(svm)) {
+ svm->nmi_masked = true;
+ svm_set_iret_intercept(svm);
+ }
++vcpu->stat.nmi_injections;
}
@@ -4729,15 +4718,15 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
}
#endif
-static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len)
+static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
bool smep, smap, is_user;
u64 error_code;
/* Emulation is always possible when KVM has access to all guest state. */
if (!sev_guest(vcpu->kvm))
- return true;
+ return X86EMUL_CONTINUE;
/* #UD and #GP should never be intercepted for SEV guests. */
WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
@@ -4749,20 +4738,20 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* to guest register state.
*/
if (sev_es_guest(vcpu->kvm))
- return false;
+ return X86EMUL_RETRY_INSTR;
/*
* Emulation is possible if the instruction is already decoded, e.g.
* when completing I/O after returning from userspace.
*/
if (emul_type & EMULTYPE_NO_DECODE)
- return true;
+ return X86EMUL_CONTINUE;
/*
* Emulation is possible for SEV guests if and only if a prefilled
* buffer containing the bytes of the intercepted instruction is
* available. SEV guest memory is encrypted with a guest specific key
- * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
+ * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
* decode garbage.
*
* If KVM is NOT trying to simply skip an instruction, inject #UD if
@@ -4782,9 +4771,11 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* success (and in practice it will work the vast majority of the time).
*/
if (unlikely(!insn)) {
- if (!(emul_type & EMULTYPE_SKIP))
- kvm_queue_exception(vcpu, UD_VECTOR);
- return false;
+ if (emul_type & EMULTYPE_SKIP)
+ return X86EMUL_UNHANDLEABLE;
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return X86EMUL_PROPAGATE_FAULT;
}
/*
@@ -4795,7 +4786,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* table used to translate CS:RIP resides in emulated MMIO.
*/
if (likely(insn_len))
- return true;
+ return X86EMUL_CONTINUE;
/*
* Detect and workaround Errata 1096 Fam_17h_00_0Fh.
@@ -4853,6 +4844,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
kvm_inject_gp(vcpu, 0);
else
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return X86EMUL_PROPAGATE_FAULT;
}
resume_guest:
@@ -4870,7 +4862,7 @@ resume_guest:
* doesn't explicitly define "ignored", i.e. doing nothing and letting
* the guest spin is technically "ignoring" the access.
*/
- return false;
+ return X86EMUL_RETRY_INSTR;
}
static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
@@ -5030,7 +5022,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
.vm_move_enc_context_from = sev_vm_move_enc_context_from,
- .can_emulate_instruction = svm_can_emulate_instruction,
+ .check_emulate_instruction = svm_check_emulate_instruction,
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
@@ -5094,6 +5086,13 @@ static __init void svm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SVM);
kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
+ /*
+ * KVM currently flushes TLBs on *every* nested SVM transition,
+ * and so for all intents and purposes KVM supports flushing by
+ * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush.
+ */
+ kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID);
+
if (nrips)
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index be67ab7fd..8ef95139c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -30,7 +30,7 @@
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
-#define MAX_DIRECT_ACCESS_MSRS 46
+#define MAX_DIRECT_ACCESS_MSRS 47
#define MSRPM_OFFSETS 32
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
@@ -148,7 +148,9 @@ struct vmcb_ctrl_area_cached {
u64 virt_ext;
u32 clean;
union {
+#if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV)
struct hv_vmcb_enlightenments hv_enlightenments;
+#endif
u8 reserved_sw[32];
};
};
diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c
index 7af8422d3..3971b3ea5 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.c
+++ b/arch/x86/kvm/svm/svm_onhyperv.c
@@ -18,18 +18,14 @@
int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
struct hv_vmcb_enlightenments *hve;
- struct hv_partition_assist_pg **p_hv_pa_pg =
- &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
+ hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);
- if (!*p_hv_pa_pg)
- *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
-
- if (!*p_hv_pa_pg)
+ if (partition_assist_page == INVALID_PAGE)
return -ENOMEM;
hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
- hve->partition_assist_page = __pa(*p_hv_pa_pg);
+ hve->partition_assist_page = partition_assist_page;
hve->hv_vm_id = (unsigned long)vcpu->kvm;
if (!hve->hv_enlightenments_control.nested_flush_hypercall) {
hve->hv_enlightenments_control.nested_flush_hypercall = 1;
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
index 36c8af87a..4e725854c 100644
--- a/arch/x86/kvm/svm/svm_ops.h
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -8,7 +8,7 @@
#define svm_asm(insn, clobber...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) "\n\t" \
+ asm goto("1: " __stringify(insn) "\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
::: clobber : fault); \
return; \
@@ -18,7 +18,7 @@ fault: \
#define svm_asm1(insn, op1, clobber...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ asm goto("1: " __stringify(insn) " %0\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
:: op1 : clobber : fault); \
return; \
@@ -28,7 +28,7 @@ fault: \
#define svm_asm2(insn, op1, op2, clobber...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ asm goto("1: " __stringify(insn) " %1, %0\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
:: op1, op2 : clobber : fault); \
return; \
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index ef2ebabb0..9499f9c6b 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -270,16 +270,16 @@ SYM_FUNC_START(__svm_vcpu_run)
RESTORE_GUEST_SPEC_CTRL_BODY
RESTORE_HOST_SPEC_CTRL_BODY
-10: cmpb $0, kvm_rebooting
+10: cmpb $0, _ASM_RIP(kvm_rebooting)
jne 2b
ud2
-30: cmpb $0, kvm_rebooting
+30: cmpb $0, _ASM_RIP(kvm_rebooting)
jne 4b
ud2
-50: cmpb $0, kvm_rebooting
+50: cmpb $0, _ASM_RIP(kvm_rebooting)
jne 6b
ud2
-70: cmpb $0, kvm_rebooting
+70: cmpb $0, _ASM_RIP(kvm_rebooting)
jne 8b
ud2
@@ -381,7 +381,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
RESTORE_GUEST_SPEC_CTRL_BODY
RESTORE_HOST_SPEC_CTRL_BODY
-3: cmpb $0, kvm_rebooting
+3: cmpb $0, _ASM_RIP(kvm_rebooting)
jne 2b
ud2
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 838433798..b82e6ed4f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -732,13 +732,13 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
* Tracepoint for nested #vmexit because of interrupt pending
*/
TRACE_EVENT(kvm_invlpga,
- TP_PROTO(__u64 rip, int asid, u64 address),
+ TP_PROTO(__u64 rip, unsigned int asid, u64 address),
TP_ARGS(rip, asid, address),
TP_STRUCT__entry(
- __field( __u64, rip )
- __field( int, asid )
- __field( __u64, address )
+ __field( __u64, rip )
+ __field( unsigned int, asid )
+ __field( __u64, address )
),
TP_fast_assign(
@@ -747,7 +747,7 @@ TRACE_EVENT(kvm_invlpga,
__entry->address = address;
),
- TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
+ TP_printk("rip: 0x%016llx asid: %u address: 0x%016llx",
__entry->rip, __entry->asid, __entry->address)
);
diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c
index 313b8bb5b..fab6a1ad9 100644
--- a/arch/x86/kvm/vmx/hyperv.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -13,419 +13,6 @@
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
-/*
- * Enlightened VMCSv1 doesn't support these:
- *
- * POSTED_INTR_NV = 0x00000002,
- * GUEST_INTR_STATUS = 0x00000810,
- * APIC_ACCESS_ADDR = 0x00002014,
- * POSTED_INTR_DESC_ADDR = 0x00002016,
- * EOI_EXIT_BITMAP0 = 0x0000201c,
- * EOI_EXIT_BITMAP1 = 0x0000201e,
- * EOI_EXIT_BITMAP2 = 0x00002020,
- * EOI_EXIT_BITMAP3 = 0x00002022,
- * GUEST_PML_INDEX = 0x00000812,
- * PML_ADDRESS = 0x0000200e,
- * VM_FUNCTION_CONTROL = 0x00002018,
- * EPTP_LIST_ADDRESS = 0x00002024,
- * VMREAD_BITMAP = 0x00002026,
- * VMWRITE_BITMAP = 0x00002028,
- *
- * TSC_MULTIPLIER = 0x00002032,
- * PLE_GAP = 0x00004020,
- * PLE_WINDOW = 0x00004022,
- * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
- *
- * Currently unsupported in KVM:
- * GUEST_IA32_RTIT_CTL = 0x00002814,
- */
-#define EVMCS1_SUPPORTED_PINCTRL \
- (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
- PIN_BASED_EXT_INTR_MASK | \
- PIN_BASED_NMI_EXITING | \
- PIN_BASED_VIRTUAL_NMIS)
-
-#define EVMCS1_SUPPORTED_EXEC_CTRL \
- (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
- CPU_BASED_HLT_EXITING | \
- CPU_BASED_CR3_LOAD_EXITING | \
- CPU_BASED_CR3_STORE_EXITING | \
- CPU_BASED_UNCOND_IO_EXITING | \
- CPU_BASED_MOV_DR_EXITING | \
- CPU_BASED_USE_TSC_OFFSETTING | \
- CPU_BASED_MWAIT_EXITING | \
- CPU_BASED_MONITOR_EXITING | \
- CPU_BASED_INVLPG_EXITING | \
- CPU_BASED_RDPMC_EXITING | \
- CPU_BASED_INTR_WINDOW_EXITING | \
- CPU_BASED_CR8_LOAD_EXITING | \
- CPU_BASED_CR8_STORE_EXITING | \
- CPU_BASED_RDTSC_EXITING | \
- CPU_BASED_TPR_SHADOW | \
- CPU_BASED_USE_IO_BITMAPS | \
- CPU_BASED_MONITOR_TRAP_FLAG | \
- CPU_BASED_USE_MSR_BITMAPS | \
- CPU_BASED_NMI_WINDOW_EXITING | \
- CPU_BASED_PAUSE_EXITING | \
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
-
-#define EVMCS1_SUPPORTED_2NDEXEC \
- (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \
- SECONDARY_EXEC_WBINVD_EXITING | \
- SECONDARY_EXEC_ENABLE_VPID | \
- SECONDARY_EXEC_ENABLE_EPT | \
- SECONDARY_EXEC_UNRESTRICTED_GUEST | \
- SECONDARY_EXEC_DESC | \
- SECONDARY_EXEC_ENABLE_RDTSCP | \
- SECONDARY_EXEC_ENABLE_INVPCID | \
- SECONDARY_EXEC_ENABLE_XSAVES | \
- SECONDARY_EXEC_RDSEED_EXITING | \
- SECONDARY_EXEC_RDRAND_EXITING | \
- SECONDARY_EXEC_TSC_SCALING | \
- SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \
- SECONDARY_EXEC_PT_USE_GPA | \
- SECONDARY_EXEC_PT_CONCEAL_VMX | \
- SECONDARY_EXEC_BUS_LOCK_DETECTION | \
- SECONDARY_EXEC_NOTIFY_VM_EXITING | \
- SECONDARY_EXEC_ENCLS_EXITING)
-
-#define EVMCS1_SUPPORTED_3RDEXEC (0ULL)
-
-#define EVMCS1_SUPPORTED_VMEXIT_CTRL \
- (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \
- VM_EXIT_SAVE_DEBUG_CONTROLS | \
- VM_EXIT_ACK_INTR_ON_EXIT | \
- VM_EXIT_HOST_ADDR_SPACE_SIZE | \
- VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
- VM_EXIT_SAVE_IA32_PAT | \
- VM_EXIT_LOAD_IA32_PAT | \
- VM_EXIT_SAVE_IA32_EFER | \
- VM_EXIT_LOAD_IA32_EFER | \
- VM_EXIT_CLEAR_BNDCFGS | \
- VM_EXIT_PT_CONCEAL_PIP | \
- VM_EXIT_CLEAR_IA32_RTIT_CTL)
-
-#define EVMCS1_SUPPORTED_VMENTRY_CTRL \
- (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \
- VM_ENTRY_LOAD_DEBUG_CONTROLS | \
- VM_ENTRY_IA32E_MODE | \
- VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \
- VM_ENTRY_LOAD_IA32_PAT | \
- VM_ENTRY_LOAD_IA32_EFER | \
- VM_ENTRY_LOAD_BNDCFGS | \
- VM_ENTRY_PT_CONCEAL_PIP | \
- VM_ENTRY_LOAD_IA32_RTIT_CTL)
-
-#define EVMCS1_SUPPORTED_VMFUNC (0)
-
-#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
-#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
- {EVMCS1_OFFSET(name), clean_field}
-
-const struct evmcs_field vmcs_field_to_evmcs_1[] = {
- /* 64 bit rw */
- EVMCS1_FIELD(GUEST_RIP, guest_rip,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
- EVMCS1_FIELD(GUEST_RSP, guest_rsp,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
- EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
- EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_CR0, host_cr0,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_CR3, host_cr3,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_CR4, host_cr4,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_RIP, host_rip,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
- EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
- EVMCS1_FIELD(MSR_BITMAP, msr_bitmap,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP),
- EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(TSC_OFFSET, tsc_offset,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
- EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
- EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
- EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
- EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
- EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
- EVMCS1_FIELD(GUEST_CR0, guest_cr0,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
- EVMCS1_FIELD(GUEST_CR3, guest_cr3,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
- EVMCS1_FIELD(GUEST_CR4, guest_cr4,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
- EVMCS1_FIELD(GUEST_DR7, guest_dr7,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
- EVMCS1_FIELD(HOST_FS_BASE, host_fs_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
- EVMCS1_FIELD(HOST_GS_BASE, host_gs_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
- EVMCS1_FIELD(HOST_TR_BASE, host_tr_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
- EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
- EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
- EVMCS1_FIELD(HOST_RSP, host_rsp,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
- EVMCS1_FIELD(EPT_POINTER, ept_pointer,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
- EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
- EVMCS1_FIELD(ENCLS_EXITING_BITMAP, encls_exiting_bitmap,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
- EVMCS1_FIELD(TSC_MULTIPLIER, tsc_multiplier,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
- /*
- * Not used by KVM:
- *
- * EVMCS1_FIELD(0x00006828, guest_ia32_s_cet,
- * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- * EVMCS1_FIELD(0x0000682A, guest_ssp,
- * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
- * EVMCS1_FIELD(0x0000682C, guest_ia32_int_ssp_table_addr,
- * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- * EVMCS1_FIELD(0x00002816, guest_ia32_lbr_ctl,
- * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- * EVMCS1_FIELD(0x00006C18, host_ia32_s_cet,
- * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- * EVMCS1_FIELD(0x00006C1A, host_ssp,
- * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- * EVMCS1_FIELD(0x00006C1C, host_ia32_int_ssp_table_addr,
- * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- */
-
- /* 64 bit read only */
- EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
- EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
- /*
- * Not defined in KVM:
- *
- * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx,
- * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
- * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi,
- * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
- * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi,
- * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
- * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip,
- * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
- */
- EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
-
- /*
- * No mask defined in the spec as Hyper-V doesn't currently support
- * these. Future proof by resetting the whole clean field mask on
- * access.
- */
- EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
-
- /* 32 bit rw */
- EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
- EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
- EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC),
- EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN),
- EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY),
- EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
- EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE,
- vm_entry_exception_error_code,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
- EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
- EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
- EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
- EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
- EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
- EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
-
- /* 32 bit read only */
- EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
- EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
- EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
- EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
- EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
- EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
- EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
- EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
-
- /* No mask defined in the spec (not used) */
- EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
-
- /* 16 bit rw */
- EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
- EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
- EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
-};
-const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
-
u64 nested_get_evmptr(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
@@ -608,40 +195,6 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
return 0;
}
-#if IS_ENABLED(CONFIG_HYPERV)
-DEFINE_STATIC_KEY_FALSE(__kvm_is_using_evmcs);
-
-/*
- * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption
- * is: in case a feature has corresponding fields in eVMCS described and it was
- * exposed in VMX feature MSRs, KVM is free to use it. Warn if KVM meets a
- * feature which has no corresponding eVMCS field, this likely means that KVM
- * needs to be updated.
- */
-#define evmcs_check_vmcs_conf(field, ctrl) \
- do { \
- typeof(vmcs_conf->field) unsupported; \
- \
- unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \
- if (unsupported) { \
- pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\
- (u64)unsupported); \
- vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \
- } \
- } \
- while (0)
-
-void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
-{
- evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL);
- evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL);
- evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC);
- evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC);
- evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL);
- evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL);
-}
-#endif
-
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version)
{
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index 9623fe165..a87407412 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -2,199 +2,89 @@
#ifndef __KVM_X86_VMX_HYPERV_H
#define __KVM_X86_VMX_HYPERV_H
-#include <linux/jump_label.h>
-
-#include <asm/hyperv-tlfs.h>
-#include <asm/mshyperv.h>
-#include <asm/vmx.h>
-
-#include "../hyperv.h"
-
-#include "capabilities.h"
-#include "vmcs.h"
+#include <linux/kvm_host.h>
#include "vmcs12.h"
+#include "vmx.h"
-struct vmcs_config;
-
-#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
-
-#define KVM_EVMCS_VERSION 1
+#define EVMPTR_INVALID (-1ULL)
+#define EVMPTR_MAP_PENDING (-2ULL)
-struct evmcs_field {
- u16 offset;
- u16 clean_field;
+enum nested_evmptrld_status {
+ EVMPTRLD_DISABLED,
+ EVMPTRLD_SUCCEEDED,
+ EVMPTRLD_VMFAIL,
+ EVMPTRLD_ERROR,
};
-extern const struct evmcs_field vmcs_field_to_evmcs_1[];
-extern const unsigned int nr_evmcs_1_fields;
-
-static __always_inline int evmcs_field_offset(unsigned long field,
- u16 *clean_field)
-{
- unsigned int index = ROL16(field, 6);
- const struct evmcs_field *evmcs_field;
-
- if (unlikely(index >= nr_evmcs_1_fields))
- return -ENOENT;
-
- evmcs_field = &vmcs_field_to_evmcs_1[index];
-
- /*
- * Use offset=0 to detect holes in eVMCS. This offset belongs to
- * 'revision_id' but this field has no encoding and is supposed to
- * be accessed directly.
- */
- if (unlikely(!evmcs_field->offset))
- return -ENOENT;
-
- if (clean_field)
- *clean_field = evmcs_field->clean_field;
-
- return evmcs_field->offset;
-}
-
-static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs,
- unsigned long field, u16 offset)
+#ifdef CONFIG_KVM_HYPERV
+static inline bool evmptr_is_valid(u64 evmptr)
{
- /*
- * vmcs12_read_any() doesn't care whether the supplied structure
- * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes
- * the exact offset of the required field, use it for convenience
- * here.
- */
- return vmcs12_read_any((void *)evmcs, field, offset);
+ return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING;
}
-#if IS_ENABLED(CONFIG_HYPERV)
-
-DECLARE_STATIC_KEY_FALSE(__kvm_is_using_evmcs);
-
-static __always_inline bool kvm_is_using_evmcs(void)
+static inline bool nested_vmx_is_evmptr12_valid(struct vcpu_vmx *vmx)
{
- return static_branch_unlikely(&__kvm_is_using_evmcs);
+ return evmptr_is_valid(vmx->nested.hv_evmcs_vmptr);
}
-static __always_inline int get_evmcs_offset(unsigned long field,
- u16 *clean_field)
+static inline bool evmptr_is_set(u64 evmptr)
{
- int offset = evmcs_field_offset(field, clean_field);
-
- WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field);
- return offset;
+ return evmptr != EVMPTR_INVALID;
}
-static __always_inline void evmcs_write64(unsigned long field, u64 value)
+static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx)
{
- u16 clean_field;
- int offset = get_evmcs_offset(field, &clean_field);
-
- if (offset < 0)
- return;
-
- *(u64 *)((char *)current_evmcs + offset) = value;
-
- current_evmcs->hv_clean_fields &= ~clean_field;
+ return evmptr_is_set(vmx->nested.hv_evmcs_vmptr);
}
-static __always_inline void evmcs_write32(unsigned long field, u32 value)
+static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx)
{
- u16 clean_field;
- int offset = get_evmcs_offset(field, &clean_field);
-
- if (offset < 0)
- return;
-
- *(u32 *)((char *)current_evmcs + offset) = value;
- current_evmcs->hv_clean_fields &= ~clean_field;
+ return vmx->nested.hv_evmcs;
}
-static __always_inline void evmcs_write16(unsigned long field, u16 value)
+static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu)
{
- u16 clean_field;
- int offset = get_evmcs_offset(field, &clean_field);
-
- if (offset < 0)
- return;
-
- *(u16 *)((char *)current_evmcs + offset) = value;
- current_evmcs->hv_clean_fields &= ~clean_field;
+ /*
+ * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and
+ * eVMCS has been explicitly enabled by userspace.
+ */
+ return vcpu->arch.hyperv_enabled &&
+ to_vmx(vcpu)->nested.enlightened_vmcs_enabled;
}
-static __always_inline u64 evmcs_read64(unsigned long field)
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu);
+uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
+int nested_enable_evmcs(struct kvm_vcpu *vcpu,
+ uint16_t *vmcs_version);
+void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu);
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
+#else
+static inline bool evmptr_is_valid(u64 evmptr)
{
- int offset = get_evmcs_offset(field, NULL);
-
- if (offset < 0)
- return 0;
-
- return *(u64 *)((char *)current_evmcs + offset);
+ return false;
}
-static __always_inline u32 evmcs_read32(unsigned long field)
+static inline bool nested_vmx_is_evmptr12_valid(struct vcpu_vmx *vmx)
{
- int offset = get_evmcs_offset(field, NULL);
-
- if (offset < 0)
- return 0;
-
- return *(u32 *)((char *)current_evmcs + offset);
+ return false;
}
-static __always_inline u16 evmcs_read16(unsigned long field)
+static inline bool evmptr_is_set(u64 evmptr)
{
- int offset = get_evmcs_offset(field, NULL);
-
- if (offset < 0)
- return 0;
-
- return *(u16 *)((char *)current_evmcs + offset);
+ return false;
}
-static inline void evmcs_load(u64 phys_addr)
+static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx)
{
- struct hv_vp_assist_page *vp_ap =
- hv_get_vp_assist_page(smp_processor_id());
-
- if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
- vp_ap->nested_control.features.directhypercall = 1;
- vp_ap->current_nested_vmcs = phys_addr;
- vp_ap->enlighten_vmentry = 1;
+ return false;
}
-void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
-#else /* !IS_ENABLED(CONFIG_HYPERV) */
-static __always_inline bool kvm_is_using_evmcs(void) { return false; }
-static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
-static __always_inline void evmcs_write32(unsigned long field, u32 value) {}
-static __always_inline void evmcs_write16(unsigned long field, u16 value) {}
-static __always_inline u64 evmcs_read64(unsigned long field) { return 0; }
-static __always_inline u32 evmcs_read32(unsigned long field) { return 0; }
-static __always_inline u16 evmcs_read16(unsigned long field) { return 0; }
-static inline void evmcs_load(u64 phys_addr) {}
-#endif /* IS_ENABLED(CONFIG_HYPERV) */
-
-#define EVMPTR_INVALID (-1ULL)
-#define EVMPTR_MAP_PENDING (-2ULL)
-
-static inline bool evmptr_is_valid(u64 evmptr)
+static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx)
{
- return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING;
+ return NULL;
}
-
-enum nested_evmptrld_status {
- EVMPTRLD_DISABLED,
- EVMPTRLD_SUCCEEDED,
- EVMPTRLD_VMFAIL,
- EVMPTRLD_ERROR,
-};
-
-u64 nested_get_evmptr(struct kvm_vcpu *vcpu);
-uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
-int nested_enable_evmcs(struct kvm_vcpu *vcpu,
- uint16_t *vmcs_version);
-void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
-int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
-bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu);
-void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
+#endif
#endif /* __KVM_X86_VMX_HYPERV_H */
diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.c b/arch/x86/kvm/vmx/hyperv_evmcs.c
new file mode 100644
index 000000000..904bfcd15
--- /dev/null
+++ b/arch/x86/kvm/vmx/hyperv_evmcs.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains common code for working with Enlightened VMCS which is
+ * used both by Hyper-V on KVM and KVM on Hyper-V.
+ */
+
+#include "hyperv_evmcs.h"
+
+#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
+#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
+ {EVMCS1_OFFSET(name), clean_field}
+
+const struct evmcs_field vmcs_field_to_evmcs_1[] = {
+ /* 64 bit rw */
+ EVMCS1_FIELD(GUEST_RIP, guest_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_RSP, guest_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR0, host_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR3, host_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR4, host_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_RIP, host_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(MSR_BITMAP, msr_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP),
+ EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(TSC_OFFSET, tsc_offset,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR0, guest_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR3, guest_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR4, guest_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_DR7, guest_dr7,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(HOST_FS_BASE, host_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GS_BASE, host_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_TR_BASE, host_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_RSP, host_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(EPT_POINTER, ept_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+ EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(ENCLS_EXITING_BITMAP, encls_exiting_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(TSC_MULTIPLIER, tsc_multiplier,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ /*
+ * Not used by KVM:
+ *
+ * EVMCS1_FIELD(0x00006828, guest_ia32_s_cet,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ * EVMCS1_FIELD(0x0000682A, guest_ssp,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ * EVMCS1_FIELD(0x0000682C, guest_ia32_int_ssp_table_addr,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ * EVMCS1_FIELD(0x00002816, guest_ia32_lbr_ctl,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ * EVMCS1_FIELD(0x00006C18, host_ia32_s_cet,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ * EVMCS1_FIELD(0x00006C1A, host_ssp,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ * EVMCS1_FIELD(0x00006C1C, host_ia32_int_ssp_table_addr,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ */
+
+ /* 64 bit read only */
+ EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ /*
+ * Not defined in KVM:
+ *
+ * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ */
+ EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /*
+ * No mask defined in the spec as Hyper-V doesn't currently support
+ * these. Future proof by resetting the whole clean field mask on
+ * access.
+ */
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 32 bit rw */
+ EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC),
+ EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN),
+ EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY),
+ EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vm_entry_exception_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+
+ /* 32 bit read only */
+ EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /* No mask defined in the spec (not used) */
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 16 bit rw */
+ EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+};
+const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h
new file mode 100644
index 000000000..a543fccfc
--- /dev/null
+++ b/arch/x86/kvm/vmx/hyperv_evmcs.h
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This file contains common definitions for working with Enlightened VMCS which
+ * are used both by Hyper-V on KVM and KVM on Hyper-V.
+ */
+#ifndef __KVM_X86_VMX_HYPERV_EVMCS_H
+#define __KVM_X86_VMX_HYPERV_EVMCS_H
+
+#include <asm/hyperv-tlfs.h>
+
+#include "capabilities.h"
+#include "vmcs12.h"
+
+#define KVM_EVMCS_VERSION 1
+
+/*
+ * Enlightened VMCSv1 doesn't support these:
+ *
+ * POSTED_INTR_NV = 0x00000002,
+ * GUEST_INTR_STATUS = 0x00000810,
+ * APIC_ACCESS_ADDR = 0x00002014,
+ * POSTED_INTR_DESC_ADDR = 0x00002016,
+ * EOI_EXIT_BITMAP0 = 0x0000201c,
+ * EOI_EXIT_BITMAP1 = 0x0000201e,
+ * EOI_EXIT_BITMAP2 = 0x00002020,
+ * EOI_EXIT_BITMAP3 = 0x00002022,
+ * GUEST_PML_INDEX = 0x00000812,
+ * PML_ADDRESS = 0x0000200e,
+ * VM_FUNCTION_CONTROL = 0x00002018,
+ * EPTP_LIST_ADDRESS = 0x00002024,
+ * VMREAD_BITMAP = 0x00002026,
+ * VMWRITE_BITMAP = 0x00002028,
+ *
+ * TSC_MULTIPLIER = 0x00002032,
+ * PLE_GAP = 0x00004020,
+ * PLE_WINDOW = 0x00004022,
+ * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ *
+ * Currently unsupported in KVM:
+ * GUEST_IA32_RTIT_CTL = 0x00002814,
+ */
+#define EVMCS1_SUPPORTED_PINCTRL \
+ (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ PIN_BASED_EXT_INTR_MASK | \
+ PIN_BASED_NMI_EXITING | \
+ PIN_BASED_VIRTUAL_NMIS)
+
+#define EVMCS1_SUPPORTED_EXEC_CTRL \
+ (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ CPU_BASED_HLT_EXITING | \
+ CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING | \
+ CPU_BASED_UNCOND_IO_EXITING | \
+ CPU_BASED_MOV_DR_EXITING | \
+ CPU_BASED_USE_TSC_OFFSETTING | \
+ CPU_BASED_MWAIT_EXITING | \
+ CPU_BASED_MONITOR_EXITING | \
+ CPU_BASED_INVLPG_EXITING | \
+ CPU_BASED_RDPMC_EXITING | \
+ CPU_BASED_INTR_WINDOW_EXITING | \
+ CPU_BASED_CR8_LOAD_EXITING | \
+ CPU_BASED_CR8_STORE_EXITING | \
+ CPU_BASED_RDTSC_EXITING | \
+ CPU_BASED_TPR_SHADOW | \
+ CPU_BASED_USE_IO_BITMAPS | \
+ CPU_BASED_MONITOR_TRAP_FLAG | \
+ CPU_BASED_USE_MSR_BITMAPS | \
+ CPU_BASED_NMI_WINDOW_EXITING | \
+ CPU_BASED_PAUSE_EXITING | \
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+
+#define EVMCS1_SUPPORTED_2NDEXEC \
+ (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \
+ SECONDARY_EXEC_WBINVD_EXITING | \
+ SECONDARY_EXEC_ENABLE_VPID | \
+ SECONDARY_EXEC_ENABLE_EPT | \
+ SECONDARY_EXEC_UNRESTRICTED_GUEST | \
+ SECONDARY_EXEC_DESC | \
+ SECONDARY_EXEC_ENABLE_RDTSCP | \
+ SECONDARY_EXEC_ENABLE_INVPCID | \
+ SECONDARY_EXEC_ENABLE_XSAVES | \
+ SECONDARY_EXEC_RDSEED_EXITING | \
+ SECONDARY_EXEC_RDRAND_EXITING | \
+ SECONDARY_EXEC_TSC_SCALING | \
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \
+ SECONDARY_EXEC_PT_USE_GPA | \
+ SECONDARY_EXEC_PT_CONCEAL_VMX | \
+ SECONDARY_EXEC_BUS_LOCK_DETECTION | \
+ SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_ENCLS_EXITING)
+
+#define EVMCS1_SUPPORTED_3RDEXEC (0ULL)
+
+#define EVMCS1_SUPPORTED_VMEXIT_CTRL \
+ (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_EXIT_SAVE_DEBUG_CONTROLS | \
+ VM_EXIT_ACK_INTR_ON_EXIT | \
+ VM_EXIT_HOST_ADDR_SPACE_SIZE | \
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_IA32_PAT | \
+ VM_EXIT_LOAD_IA32_PAT | \
+ VM_EXIT_SAVE_IA32_EFER | \
+ VM_EXIT_LOAD_IA32_EFER | \
+ VM_EXIT_CLEAR_BNDCFGS | \
+ VM_EXIT_PT_CONCEAL_PIP | \
+ VM_EXIT_CLEAR_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMENTRY_CTRL \
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_IA32E_MODE | \
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_ENTRY_LOAD_IA32_PAT | \
+ VM_ENTRY_LOAD_IA32_EFER | \
+ VM_ENTRY_LOAD_BNDCFGS | \
+ VM_ENTRY_PT_CONCEAL_PIP | \
+ VM_ENTRY_LOAD_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMFUNC (0)
+
+struct evmcs_field {
+ u16 offset;
+ u16 clean_field;
+};
+
+extern const struct evmcs_field vmcs_field_to_evmcs_1[];
+extern const unsigned int nr_evmcs_1_fields;
+
+static __always_inline int evmcs_field_offset(unsigned long field,
+ u16 *clean_field)
+{
+ const struct evmcs_field *evmcs_field;
+ unsigned int index = ROL16(field, 6);
+
+ if (unlikely(index >= nr_evmcs_1_fields))
+ return -ENOENT;
+
+ evmcs_field = &vmcs_field_to_evmcs_1[index];
+
+ /*
+ * Use offset=0 to detect holes in eVMCS. This offset belongs to
+ * 'revision_id' but this field has no encoding and is supposed to
+ * be accessed directly.
+ */
+ if (unlikely(!evmcs_field->offset))
+ return -ENOENT;
+
+ if (clean_field)
+ *clean_field = evmcs_field->clean_field;
+
+ return evmcs_field->offset;
+}
+
+static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs,
+ unsigned long field, u16 offset)
+{
+ /*
+ * vmcs12_read_any() doesn't care whether the supplied structure
+ * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes
+ * the exact offset of the required field, use it for convenience
+ * here.
+ */
+ return vmcs12_read_any((void *)evmcs, field, offset);
+}
+
+#endif /* __KVM_X86_VMX_HYPERV_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index c5ec0ef51..6329a3068 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -179,7 +179,7 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
* VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
* fields and thus must be synced.
*/
- if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
+ if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
return kvm_skip_emulated_instruction(vcpu);
@@ -194,7 +194,7 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
* can't be done if there isn't a current VMCS.
*/
if (vmx->nested.current_vmptr == INVALID_GPA &&
- !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ !nested_vmx_is_evmptr12_valid(vmx))
return nested_vmx_failInvalid(vcpu);
return nested_vmx_failValid(vcpu, vm_instruction_error);
@@ -226,10 +226,11 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
+#ifdef CONFIG_KVM_HYPERV
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ if (nested_vmx_is_evmptr12_valid(vmx)) {
kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
vmx->nested.hv_evmcs = NULL;
}
@@ -241,6 +242,34 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
hv_vcpu->nested.vm_id = 0;
hv_vcpu->nested.vp_id = 0;
}
+#endif
+}
+
+static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
+{
+#ifdef CONFIG_KVM_HYPERV
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * When Enlightened VMEntry is enabled on the calling CPU we treat
+ * memory area pointer by vmptr as Enlightened VMCS (as there's no good
+ * way to distinguish it from VMCS12) and we must not corrupt it by
+ * writing to the non-existent 'launch_state' field. The area doesn't
+ * have to be the currently active EVMCS on the calling CPU and there's
+ * nothing KVM has to do to transition it from 'active' to 'non-active'
+ * state. It is possible that the area will stay mapped as
+ * vmx->nested.hv_evmcs but this shouldn't be a problem.
+ */
+ if (!guest_cpuid_has_evmcs(vcpu) ||
+ !evmptr_is_valid(nested_get_evmptr(vcpu)))
+ return false;
+
+ if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
+ nested_release_evmcs(vcpu);
+
+ return true;
+#else
+ return false;
+#endif
}
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
@@ -572,7 +601,6 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
int msr;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
- struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
/* Nothing to do if the MSR bitmap is not in use. */
@@ -588,10 +616,13 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
* - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
* and tells KVM (L0) there were no changes in MSR bitmap for L2.
*/
- if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
- evmcs->hv_enlightenments_control.msr_bitmap &&
- evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
- return true;
+ if (!vmx->nested.force_msr_bitmap_recalc) {
+ struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
+
+ if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
+ evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
+ return true;
+ }
if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
return false;
@@ -1085,7 +1116,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
bool nested_ept, bool reload_pdptrs,
enum vm_entry_failure_code *entry_failure_code)
{
- if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
+ if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
}
@@ -1139,14 +1170,8 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- /*
- * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
- * L2's VP_ID upon request from the guest. Make sure we check for
- * pending entries in the right FIFO upon L1/L2 transition as these
- * requests are put by other vCPUs asynchronously.
- */
- if (to_hv_vcpu(vcpu) && enable_ept)
- kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+ /* Handle pending Hyper-V TLB flush requests */
+ kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);
/*
* If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
@@ -1578,8 +1603,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
+#ifdef CONFIG_KVM_HYPERV
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
- struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
@@ -1818,12 +1844,16 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields
*/
return;
+#else /* CONFIG_KVM_HYPERV */
+ KVM_BUG_ON(1, vmx->vcpu.kvm);
+#endif /* CONFIG_KVM_HYPERV */
}
static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
+#ifdef CONFIG_KVM_HYPERV
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
- struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
/*
* Should not be changed by KVM:
@@ -1992,6 +2022,9 @@ static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
return;
+#else /* CONFIG_KVM_HYPERV */
+ KVM_BUG_ON(1, vmx->vcpu.kvm);
+#endif /* CONFIG_KVM_HYPERV */
}
/*
@@ -2001,6 +2034,7 @@ static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
struct kvm_vcpu *vcpu, bool from_launch)
{
+#ifdef CONFIG_KVM_HYPERV
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool evmcs_gpa_changed = false;
u64 evmcs_gpa;
@@ -2082,13 +2116,16 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
}
return EVMPTRLD_SUCCEEDED;
+#else
+ return EVMPTRLD_DISABLED;
+#endif
}
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ if (nested_vmx_is_evmptr12_valid(vmx))
copy_vmcs12_to_enlightened(vmx);
else
copy_vmcs12_to_shadow(vmx);
@@ -2242,7 +2279,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
u32 exec_control;
u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
- if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx))
prepare_vmcs02_early_rare(vmx, vmcs12);
/*
@@ -2403,7 +2440,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
- struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
+ struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx);
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
@@ -2535,15 +2572,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
enum vm_entry_failure_code *entry_failure_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
bool load_guest_pdptrs_vmcs12 = false;
- if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
prepare_vmcs02_rare(vmx, vmcs12);
vmx->nested.dirty_vmcs12 = false;
- load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
- !(vmx->nested.hv_evmcs->hv_clean_fields &
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
+ !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
}
if (vmx->nested.nested_run_pending &&
@@ -2664,9 +2701,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* bits when it changes a field in eVMCS. Mark all fields as clean
* here.
*/
- if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
- vmx->nested.hv_evmcs->hv_clean_fields |=
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+ if (nested_vmx_is_evmptr12_valid(vmx))
+ evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
return 0;
}
@@ -2717,7 +2753,7 @@ static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
}
/* Reserved bits should not be set */
- if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
+ if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
return false;
/* AD, if set, should be supported */
@@ -2888,8 +2924,10 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
nested_check_vm_entry_controls(vcpu, vmcs12))
return -EINVAL;
+#ifdef CONFIG_KVM_HYPERV
if (guest_cpuid_has_evmcs(vcpu))
return nested_evmcs_check_controls(vmcs12);
+#endif
return 0;
}
@@ -2912,7 +2950,7 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
- CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
+ CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
return -EINVAL;
if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
@@ -3161,6 +3199,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
return 0;
}
+#ifdef CONFIG_KVM_HYPERV
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3188,6 +3227,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
return true;
}
+#endif
static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
@@ -3279,6 +3319,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
+#ifdef CONFIG_KVM_HYPERV
/*
* Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
* in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory
@@ -3295,6 +3336,7 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
return false;
}
+#endif
if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
return false;
@@ -3538,7 +3580,7 @@ vmentry_fail_vmexit:
load_vmcs12_host_state(vcpu, vmcs12);
vmcs12->vm_exit_reason = exit_reason.full;
- if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))
vmx->nested.need_vmcs12_to_shadow_sync = true;
return NVMX_VMENTRY_VMEXIT;
}
@@ -3569,7 +3611,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
return nested_vmx_failInvalid(vcpu);
- if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
+ if (CC(!nested_vmx_is_evmptr12_valid(vmx) &&
vmx->nested.current_vmptr == INVALID_GPA))
return nested_vmx_failInvalid(vcpu);
@@ -3584,8 +3626,10 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (CC(vmcs12->hdr.shadow_vmcs))
return nested_vmx_failInvalid(vcpu);
- if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
- copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
+ if (nested_vmx_is_evmptr12_valid(vmx)) {
+ struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
+
+ copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields);
/* Enlightened VMCS doesn't have launch state */
vmcs12->launch_state = !launch;
} else if (enable_shadow_vmcs) {
@@ -4329,11 +4373,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ if (nested_vmx_is_evmptr12_valid(vmx))
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
- !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr);
+ !nested_vmx_is_evmptr12_valid(vmx);
vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
@@ -4732,6 +4776,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
+#ifdef CONFIG_KVM_HYPERV
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
/*
* KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
@@ -4741,6 +4786,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
*/
(void)nested_get_evmcs_page(vcpu);
}
+#endif
/* Service pending TLB flush requests for L2 before switching to L1. */
kvm_service_local_tlb_flush_requests(vcpu);
@@ -4854,7 +4900,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
}
if ((vm_exit_reason != -1) &&
- (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
+ (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)))
vmx->nested.need_vmcs12_to_shadow_sync = true;
/* in case we halted in L2 */
@@ -4980,6 +5026,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
else
*ret = off;
+ *ret = vmx_get_untagged_addr(vcpu, *ret, 0);
/* Long mode: #GP(0)/#SS(0) if the memory address is in a
* non-canonical form. This is the only check on the memory
* destination for long mode!
@@ -5292,18 +5339,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
if (vmptr == vmx->nested.vmxon_ptr)
return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
- /*
- * When Enlightened VMEntry is enabled on the calling CPU we treat
- * memory area pointer by vmptr as Enlightened VMCS (as there's no good
- * way to distinguish it from VMCS12) and we must not corrupt it by
- * writing to the non-existent 'launch_state' field. The area doesn't
- * have to be the currently active EVMCS on the calling CPU and there's
- * nothing KVM has to do to transition it from 'active' to 'non-active'
- * state. It is possible that the area will stay mapped as
- * vmx->nested.hv_evmcs but this shouldn't be a problem.
- */
- if (likely(!guest_cpuid_has_evmcs(vcpu) ||
- !evmptr_is_valid(nested_get_evmptr(vcpu)))) {
+ if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) {
if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vcpu);
@@ -5320,8 +5356,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
vmptr + offsetof(struct vmcs12,
launch_state),
&zero, sizeof(zero));
- } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
- nested_release_evmcs(vcpu);
}
return nested_vmx_succeed(vcpu);
@@ -5360,7 +5394,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
/* Decode instruction info and find the field to read */
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
- if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ if (!nested_vmx_is_evmptr12_valid(vmx)) {
/*
* In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
* any VMREAD sets the ALU flags for VMfailInvalid.
@@ -5398,7 +5432,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
/* Read the field, zero-extended to a u64 value */
- value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
+ value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset);
}
/*
@@ -5586,7 +5620,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
/* Forbid normal VMPTRLD if Enlightened version was used */
- if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ if (nested_vmx_is_evmptr12_valid(vmx))
return 1;
if (vmx->nested.current_vmptr != vmptr) {
@@ -5649,7 +5683,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr)))
+ if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu))))
return 1;
if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
@@ -5797,6 +5831,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
vpid02 = nested_get_vpid02(vcpu);
switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
+ /*
+ * LAM doesn't apply to addresses that are inputs to TLB
+ * invalidation.
+ */
if (!operand.vpid ||
is_noncanonical_address(operand.gla, vcpu))
return nested_vmx_fail(vcpu,
@@ -6208,11 +6246,13 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
* Handle L2's bus locks in L0 directly.
*/
return true;
+#ifdef CONFIG_KVM_HYPERV
case EXIT_REASON_VMCALL:
/* Hyper-V L2 TLB flush hypercall is handled by L0 */
return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
kvm_hv_is_tlb_flush_hcall(vcpu);
+#endif
default:
break;
}
@@ -6435,7 +6475,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
/* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
- if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
+ if (nested_vmx_is_evmptr12_set(vmx))
kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
if (is_guest_mode(vcpu) &&
@@ -6491,7 +6531,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
} else {
copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
if (!vmx->nested.need_vmcs12_to_shadow_sync) {
- if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ if (nested_vmx_is_evmptr12_valid(vmx))
/*
* L1 hypervisor is not obliged to keep eVMCS
* clean fields data always up-to-date while
@@ -6561,7 +6601,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
* code was changed such that flag signals vmcs12 should
* be copied into eVMCS in guest memory.
*
- * To preserve backwards compatability, allow user
+ * To preserve backwards compatibility, allow user
* to set this flag even when there is no VMXON region.
*/
if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
@@ -6632,6 +6672,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
+#ifdef CONFIG_KVM_HYPERV
} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
/*
* nested_vmx_handle_enlightened_vmptrld() cannot be called
@@ -6641,6 +6682,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
*/
vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+#endif
} else {
return -EINVAL;
}
@@ -7096,7 +7138,9 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
.set_state = vmx_set_nested_state,
.get_nested_state_pages = vmx_get_nested_state_pages,
.write_log_dirty = nested_vmx_write_pml_buffer,
+#ifdef CONFIG_KVM_HYPERV
.enable_evmcs = nested_enable_evmcs,
.get_evmcs_version = nested_get_evmcs_version,
.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
+#endif
};
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index b4b9d5143..cce4e2aa3 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -3,6 +3,7 @@
#define __KVM_X86_VMX_NESTED_H
#include "kvm_cache_regs.h"
+#include "hyperv.h"
#include "vmcs12.h"
#include "vmx.h"
@@ -57,7 +58,7 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
/* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
return vmx->nested.current_vmptr != -1ull ||
- vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID;
+ nested_vmx_is_evmptr12_set(vmx);
}
static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 90c1f7f07..600a021ae 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -71,7 +71,7 @@ static int fixed_pmc_events[] = {
static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
{
struct kvm_pmc *pmc;
- u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl;
+ u64 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl;
int i;
pmu->fixed_ctr_ctrl = data;
@@ -437,11 +437,9 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
pmc_write_counter(pmc, data);
- pmc_update_sample_period(pmc);
break;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
pmc_write_counter(pmc, data);
- pmc_update_sample_period(pmc);
break;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
reserved_bits = pmu->reserved_bits;
@@ -493,19 +491,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
u64 counter_mask;
int i;
- pmu->nr_arch_gp_counters = 0;
- pmu->nr_arch_fixed_counters = 0;
- pmu->counter_bitmask[KVM_PMC_GP] = 0;
- pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
- pmu->version = 0;
- pmu->reserved_bits = 0xffffffff00200000ull;
- pmu->raw_event_mask = X86_RAW_EVENT_MASK;
- pmu->global_ctrl_mask = ~0ull;
- pmu->global_status_mask = ~0ull;
- pmu->fixed_ctr_ctrl_mask = ~0ull;
- pmu->pebs_enable_mask = ~0ull;
- pmu->pebs_data_cfg_mask = ~0ull;
-
memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
/*
@@ -517,8 +502,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
return;
entry = kvm_find_cpuid_entry(vcpu, 0xa);
- if (!entry || !vcpu->kvm->arch.enable_pmu)
+ if (!entry)
return;
+
eax.full = entry->eax;
edx.full = entry->edx;
diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
index edc3f16cc..6a9bfdfbb 100644
--- a/arch/x86/kvm/vmx/run_flags.h
+++ b/arch/x86/kvm/vmx/run_flags.h
@@ -2,7 +2,10 @@
#ifndef __KVM_X86_VMX_RUN_FLAGS_H
#define __KVM_X86_VMX_RUN_FLAGS_H
-#define VMX_RUN_VMRESUME (1 << 0)
-#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1)
+#define VMX_RUN_VMRESUME_SHIFT 0
+#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
+
+#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT)
+#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 3e822e582..6fef01e05 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -37,6 +37,7 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
if (!IS_ALIGNED(*gva, alignment)) {
fault = true;
} else if (likely(is_64_bit_mode(vcpu))) {
+ *gva = vmx_get_untagged_addr(vcpu, *gva, 0);
fault = is_noncanonical_address(*gva, vcpu);
} else {
*gva &= 0xffffffff;
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index be275a041..f6986dee6 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -139,7 +139,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
mov (%_ASM_SP), %_ASM_AX
/* Check if vmlaunch or vmresume is needed */
- test $VMX_RUN_VMRESUME, %ebx
+ bt $VMX_RUN_VMRESUME_SHIFT, %ebx
/* Load guest registers. Don't clobber flags. */
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
@@ -161,8 +161,11 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Load guest RAX. This kills the @regs pointer! */
mov VCPU_RAX(%_ASM_AX), %_ASM_AX
- /* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */
- jz .Lvmlaunch
+ /* Clobbers EFLAGS.ZF */
+ CLEAR_CPU_BUFFERS
+
+ /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */
+ jnc .Lvmlaunch
/*
* After a successful VMRESUME/VMLAUNCH, control flow "magically"
@@ -272,6 +275,8 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL)
call vmx_spec_ctrl_restore_host
+ CLEAR_BRANCH_HISTORY_VMEXIT
+
/* Put return value in AX */
mov %_ASM_BX, %_ASM_AX
@@ -289,7 +294,7 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL)
RET
.Lfixup:
- cmpb $0, kvm_rebooting
+ cmpb $0, _ASM_RIP(kvm_rebooting)
jne .Lvmfail
ud2
.Lvmfail:
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 9bba53525..784f2ecca 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -66,6 +66,7 @@
#include "vmx.h"
#include "x86.h"
#include "smm.h"
+#include "vmx_onhyperv.h"
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -82,28 +83,28 @@ bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
static bool __read_mostly enable_vnmi = 1;
-module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
+module_param_named(vnmi, enable_vnmi, bool, 0444);
bool __read_mostly flexpriority_enabled = 1;
-module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
+module_param_named(flexpriority, flexpriority_enabled, bool, 0444);
bool __read_mostly enable_ept = 1;
-module_param_named(ept, enable_ept, bool, S_IRUGO);
+module_param_named(ept, enable_ept, bool, 0444);
bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
- enable_unrestricted_guest, bool, S_IRUGO);
+ enable_unrestricted_guest, bool, 0444);
bool __read_mostly enable_ept_ad_bits = 1;
-module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+module_param_named(eptad, enable_ept_ad_bits, bool, 0444);
static bool __read_mostly emulate_invalid_guest_state = true;
-module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+module_param(emulate_invalid_guest_state, bool, 0444);
static bool __read_mostly fasteoi = 1;
-module_param(fasteoi, bool, S_IRUGO);
+module_param(fasteoi, bool, 0444);
-module_param(enable_apicv, bool, S_IRUGO);
+module_param(enable_apicv, bool, 0444);
bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);
@@ -114,10 +115,10 @@ module_param(enable_ipiv, bool, 0444);
* use VMX instructions.
*/
static bool __read_mostly nested = 1;
-module_param(nested, bool, S_IRUGO);
+module_param(nested, bool, 0444);
bool __read_mostly enable_pml = 1;
-module_param_named(pml, enable_pml, bool, S_IRUGO);
+module_param_named(pml, enable_pml, bool, 0444);
static bool __read_mostly error_on_inconsistent_vmcs_config = true;
module_param(error_on_inconsistent_vmcs_config, bool, 0444);
@@ -387,7 +388,16 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
- vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ /*
+ * Disable VERW's behavior of clearing CPU buffers for the guest if the
+ * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
+ * the mitigation. Disabling the clearing behavior provides a
+ * performance boost for guests that aren't aware that manually clearing
+ * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
+ * and VM-Exit.
+ */
+ vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
+ (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
!boot_cpu_has_bug(X86_BUG_MDS) &&
!boot_cpu_has_bug(X86_BUG_TAA);
@@ -523,22 +533,14 @@ module_param(enlightened_vmcs, bool, 0444);
static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
struct hv_enlightened_vmcs *evmcs;
- struct hv_partition_assist_pg **p_hv_pa_pg =
- &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
- /*
- * Synthetic VM-Exit is not enabled in current code and so All
- * evmcs in singe VM shares same assist page.
- */
- if (!*p_hv_pa_pg)
- *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
+ hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);
- if (!*p_hv_pa_pg)
+ if (partition_assist_page == INVALID_PAGE)
return -ENOMEM;
evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
- evmcs->partition_assist_page =
- __pa(*p_hv_pa_pg);
+ evmcs->partition_assist_page = partition_assist_page;
evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
@@ -745,7 +747,7 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
*/
static int kvm_cpu_vmxoff(void)
{
- asm_volatile_goto("1: vmxoff\n\t"
+ asm goto("1: vmxoff\n\t"
_ASM_EXTABLE(1b, %l[fault])
::: "cc", "memory" : fault);
@@ -1657,8 +1659,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len)
+static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
/*
* Emulation of instructions in SGX enclaves is impossible as RIP does
@@ -1669,9 +1671,9 @@ static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
*/
if (to_vmx(vcpu)->exit_reason.enclave_mode) {
kvm_queue_exception(vcpu, UD_VECTOR);
- return false;
+ return X86EMUL_PROPAGATE_FAULT;
}
- return true;
+ return X86EMUL_CONTINUE;
}
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
@@ -1809,7 +1811,7 @@ static void vmx_inject_exception(struct kvm_vcpu *vcpu)
* do generate error codes with bits 31:16 set, and so KVM's
* ABI lets userspace shove in arbitrary 32-bit values. Drop
* the upper bits to avoid VM-Fail, losing information that
- * does't really exist is preferable to killing the VM.
+ * doesn't really exist is preferable to killing the VM.
*/
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -2055,6 +2057,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data))
return 1;
+#ifdef CONFIG_KVM_HYPERV
/*
* Enlightened VMCS v1 doesn't have certain VMCS fields but
* instead of just ignoring the features, different Hyper-V
@@ -2065,6 +2068,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
nested_evmcs_filter_control_msr(vcpu, msr_info->index,
&msr_info->data);
+#endif
break;
case MSR_IA32_RTIT_CTL:
if (!vmx_pt_mode_is_host_guest())
@@ -2789,7 +2793,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
cr4_set_bits(X86_CR4_VMXE);
- asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+ asm goto("1: vmxon %[vmxon_pointer]\n\t"
_ASM_EXTABLE(1b, %l[fault])
: : [vmxon_pointer] "m"(vmxon_pointer)
: : fault);
@@ -3400,7 +3404,8 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
update_guest_cr3 = false;
vmx_ept_load_pdptrs(vcpu);
} else {
- guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
+ guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
+ kvm_get_active_cr3_lam_bits(vcpu);
}
if (update_guest_cr3)
@@ -4833,7 +4838,10 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->nested.posted_intr_nv = -1;
vmx->nested.vmxon_ptr = INVALID_GPA;
vmx->nested.current_vmptr = INVALID_GPA;
+
+#ifdef CONFIG_KVM_HYPERV
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+#endif
vcpu->arch.microcode_version = 0x100000000ULL;
vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
@@ -5782,7 +5790,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
* would also use advanced VM-exit information for EPT violations to
* reconstruct the page fault error code.
*/
- if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
+ if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
return kvm_emulate_instruction(vcpu, 0);
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
@@ -5792,7 +5800,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
+ if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
return 1;
/*
@@ -6757,10 +6765,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
return;
/*
- * Grab the memslot so that the hva lookup for the mmu_notifier retry
- * is guaranteed to use the same memslot as the pfn lookup, i.e. rely
- * on the pfn lookup's validation of the memslot to ensure a valid hva
- * is used for the retry check.
+ * Explicitly grab the memslot using KVM's internal slot ID to ensure
+ * KVM doesn't unintentionally grab a userspace memslot. It _should_
+ * be impossible for userspace to create a memslot for the APIC when
+ * APICv is enabled, but paranoia won't hurt in this case.
*/
slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
@@ -6785,8 +6793,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
return;
read_lock(&vcpu->kvm->mmu_lock);
- if (mmu_invalidate_retry_hva(kvm, mmu_seq,
- gfn_to_hva_memslot(slot, gfn))) {
+ if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
read_unlock(&vcpu->kvm->mmu_lock);
goto out;
@@ -7226,11 +7233,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
guest_state_enter_irqoff();
- /* L1D Flush includes CPU buffer clear to mitigate MDS */
+ /*
+ * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW
+ * mitigation for MDS is done late in VMentry and is still
+ * executed in spite of L1D Flush. This is because an extra VERW
+ * should not matter much after the big hammer L1D Flush.
+ */
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
- else if (static_branch_unlikely(&mds_user_clear))
- mds_clear_cpu_buffers();
else if (static_branch_unlikely(&mmio_stale_data_clear) &&
kvm_arch_has_assigned_device(vcpu->kvm))
mds_clear_cpu_buffers();
@@ -7579,8 +7589,6 @@ static int vmx_vm_init(struct kvm *kvm)
static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
- u8 cache;
-
/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
* memory aliases with conflicting memory types and sometimes MCEs.
* We have to be careful as to what are honored and when.
@@ -7607,11 +7615,10 @@ static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- cache = MTRR_TYPE_WRBACK;
+ return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
else
- cache = MTRR_TYPE_UNCACHABLE;
-
- return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
+ return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
+ VMX_EPT_IPAT_BIT;
}
return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
@@ -7677,6 +7684,9 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
+ entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
+ cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM));
+
#undef cr4_fixed1_update
}
@@ -7763,6 +7773,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM);
vmx_setup_uret_msrs(vmx);
@@ -7846,8 +7857,28 @@ static u64 vmx_get_perf_capabilities(void)
if (vmx_pebs_supported()) {
perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
- if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
- perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+
+ /*
+ * Disallow adaptive PEBS as it is functionally broken, can be
+ * used by the guest to read *host* LBRs, and can be used to
+ * bypass userspace event filters. To correctly and safely
+ * support adaptive PEBS, KVM needs to:
+ *
+ * 1. Account for the ADAPTIVE flag when (re)programming fixed
+ * counters.
+ *
+ * 2. Gain support from perf (or take direct control of counter
+ * programming) to support events without adaptive PEBS
+ * enabled for the hardware counter.
+ *
+ * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
+ * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
+ *
+ * 4. Document which PMU events are effectively exposed to the
+ * guest via adaptive PEBS, and make adaptive PEBS mutually
+ * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
+ */
+ perf_cap &= ~PERF_CAP_PEBS_BASELINE;
}
return perf_cap;
@@ -8209,6 +8240,50 @@ static void vmx_vm_destroy(struct kvm *kvm)
free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
}
+/*
+ * Note, the SDM states that the linear address is masked *after* the modified
+ * canonicality check, whereas KVM masks (untags) the address and then performs
+ * a "normal" canonicality check. Functionally, the two methods are identical,
+ * and when the masking occurs relative to the canonicality check isn't visible
+ * to software, i.e. KVM's behavior doesn't violate the SDM.
+ */
+gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags)
+{
+ int lam_bit;
+ unsigned long cr3_bits;
+
+ if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG))
+ return gva;
+
+ if (!is_64_bit_mode(vcpu))
+ return gva;
+
+ /*
+ * Bit 63 determines if the address should be treated as user address
+ * or a supervisor address.
+ */
+ if (!(gva & BIT_ULL(63))) {
+ cr3_bits = kvm_get_active_cr3_lam_bits(vcpu);
+ if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
+ return gva;
+
+ /* LAM_U48 is ignored if LAM_U57 is set. */
+ lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
+ } else {
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
+ return gva;
+
+ lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
+ }
+
+ /*
+ * Untag the address by sign-extending the lam_bit, but NOT to bit 63.
+ * Bit 63 is retained from the raw virtual address so that untagging
+ * doesn't change a user access to a supervisor access, and vice versa.
+ */
+ return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
+}
+
static struct kvm_x86_ops vmx_x86_ops __initdata = {
.name = KBUILD_MODNAME,
@@ -8341,7 +8416,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.enable_smi_window = vmx_enable_smi_window,
#endif
- .can_emulate_instruction = vmx_can_emulate_instruction,
+ .check_emulate_instruction = vmx_check_emulate_instruction,
.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
.migrate_timers = vmx_migrate_timers,
@@ -8349,6 +8424,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.complete_emulated_msr = kvm_complete_insn_gp,
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
+
+ .get_untagged_addr = vmx_get_untagged_addr,
};
static unsigned int vmx_handle_intel_pt_intr(void)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index c2130d2c8..e3b0985bb 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -241,9 +241,11 @@ struct nested_vmx {
bool guest_mode;
} smm;
+#ifdef CONFIG_KVM_HYPERV
gpa_t hv_evmcs_vmptr;
struct kvm_host_map hv_evmcs_map;
struct hv_enlightened_vmcs *hv_evmcs;
+#endif
};
struct vcpu_vmx {
@@ -420,6 +422,8 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
+gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
+
static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
int type, bool value)
{
@@ -745,14 +749,4 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && enable_ipiv;
}
-static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu)
-{
- /*
- * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and
- * eVMCS has been explicitly enabled by userspace.
- */
- return vcpu->arch.hyperv_enabled &&
- to_vmx(vcpu)->nested.enlightened_vmcs_enabled;
-}
-
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.c b/arch/x86/kvm/vmx/vmx_onhyperv.c
new file mode 100644
index 000000000..b9a8b9116
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmx_onhyperv.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "capabilities.h"
+#include "vmx_onhyperv.h"
+
+DEFINE_STATIC_KEY_FALSE(__kvm_is_using_evmcs);
+
+/*
+ * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption
+ * is: in case a feature has corresponding fields in eVMCS described and it was
+ * exposed in VMX feature MSRs, KVM is free to use it. Warn if KVM meets a
+ * feature which has no corresponding eVMCS field, this likely means that KVM
+ * needs to be updated.
+ */
+#define evmcs_check_vmcs_conf(field, ctrl) \
+ do { \
+ typeof(vmcs_conf->field) unsupported; \
+ \
+ unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \
+ if (unsupported) { \
+ pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\
+ (u64)unsupported); \
+ vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \
+ } \
+ } \
+ while (0)
+
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+{
+ evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL);
+ evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL);
+ evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC);
+ evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC);
+ evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL);
+ evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL);
+}
diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h
new file mode 100644
index 000000000..eb48153bf
--- /dev/null
+++ b/arch/x86/kvm/vmx/vmx_onhyperv.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ARCH_X86_KVM_VMX_ONHYPERV_H__
+#define __ARCH_X86_KVM_VMX_ONHYPERV_H__
+
+#include <asm/hyperv-tlfs.h>
+#include <asm/mshyperv.h>
+
+#include <linux/jump_label.h>
+
+#include "capabilities.h"
+#include "hyperv_evmcs.h"
+#include "vmcs12.h"
+
+#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+DECLARE_STATIC_KEY_FALSE(__kvm_is_using_evmcs);
+
+static __always_inline bool kvm_is_using_evmcs(void)
+{
+ return static_branch_unlikely(&__kvm_is_using_evmcs);
+}
+
+static __always_inline int get_evmcs_offset(unsigned long field,
+ u16 *clean_field)
+{
+ int offset = evmcs_field_offset(field, clean_field);
+
+ WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field);
+ return offset;
+}
+
+static __always_inline void evmcs_write64(unsigned long field, u64 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u64 *)((char *)current_evmcs + offset) = value;
+
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static __always_inline void evmcs_write32(unsigned long field, u32 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u32 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static __always_inline void evmcs_write16(unsigned long field, u16 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u16 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static __always_inline u64 evmcs_read64(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u64 *)((char *)current_evmcs + offset);
+}
+
+static __always_inline u32 evmcs_read32(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u32 *)((char *)current_evmcs + offset);
+}
+
+static __always_inline u16 evmcs_read16(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u16 *)((char *)current_evmcs + offset);
+}
+
+static inline void evmcs_load(u64 phys_addr)
+{
+ struct hv_vp_assist_page *vp_ap =
+ hv_get_vp_assist_page(smp_processor_id());
+
+ if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
+ vp_ap->nested_control.features.directhypercall = 1;
+ vp_ap->current_nested_vmcs = phys_addr;
+ vp_ap->enlighten_vmentry = 1;
+}
+
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
+#else /* !IS_ENABLED(CONFIG_HYPERV) */
+static __always_inline bool kvm_is_using_evmcs(void) { return false; }
+static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
+static __always_inline void evmcs_write32(unsigned long field, u32 value) {}
+static __always_inline void evmcs_write16(unsigned long field, u16 value) {}
+static __always_inline u64 evmcs_read64(unsigned long field) { return 0; }
+static __always_inline u32 evmcs_read32(unsigned long field) { return 0; }
+static __always_inline u16 evmcs_read16(unsigned long field) { return 0; }
+static inline void evmcs_load(u64 phys_addr) {}
+#endif /* IS_ENABLED(CONFIG_HYPERV) */
+
+#endif /* __ARCH_X86_KVM_VMX_ONHYPERV_H__ */
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 33af7b4c6..8060e5fc6 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -6,7 +6,7 @@
#include <asm/vmx.h>
-#include "hyperv.h"
+#include "vmx_onhyperv.h"
#include "vmcs.h"
#include "../x86.h"
@@ -94,7 +94,7 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
- asm_volatile_goto("1: vmread %[field], %[output]\n\t"
+ asm_goto_output("1: vmread %[field], %[output]\n\t"
"jna %l[do_fail]\n\t"
_ASM_EXTABLE(1b, %l[do_exception])
@@ -188,7 +188,7 @@ static __always_inline unsigned long vmcs_readl(unsigned long field)
#define vmx_asm1(insn, op1, error_args...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ asm goto("1: " __stringify(insn) " %0\n\t" \
".byte 0x2e\n\t" /* branch not taken hint */ \
"jna %l[error]\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
@@ -205,7 +205,7 @@ fault: \
#define vmx_asm2(insn, op1, op2, error_args...) \
do { \
- asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ asm goto("1: " __stringify(insn) " %1, %0\n\t" \
".byte 0x2e\n\t" /* branch not taken hint */ \
"jna %l[error]\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e179db7c1..c84927216 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -145,21 +145,21 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
static bool __read_mostly ignore_msrs = 0;
-module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(ignore_msrs, bool, 0644);
bool __read_mostly report_ignored_msrs = true;
-module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(report_ignored_msrs, bool, 0644);
EXPORT_SYMBOL_GPL(report_ignored_msrs);
unsigned int min_timer_period_us = 200;
-module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+module_param(min_timer_period_us, uint, 0644);
static bool __read_mostly kvmclock_periodic_sync = true;
-module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+module_param(kvmclock_periodic_sync, bool, 0444);
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
-module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+module_param(tsc_tolerance_ppm, uint, 0644);
/*
* lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
@@ -168,13 +168,13 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
* tuning, i.e. allows privileged userspace to set an exact advancement time.
*/
static int __read_mostly lapic_timer_advance_ns = -1;
-module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
+module_param(lapic_timer_advance_ns, int, 0644);
static bool __read_mostly vector_hashing = true;
-module_param(vector_hashing, bool, S_IRUGO);
+module_param(vector_hashing, bool, 0444);
bool __read_mostly enable_vmware_backdoor = false;
-module_param(enable_vmware_backdoor, bool, S_IRUGO);
+module_param(enable_vmware_backdoor, bool, 0444);
EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
/*
@@ -186,7 +186,7 @@ static int __read_mostly force_emulation_prefix;
module_param(force_emulation_prefix, int, 0644);
int __read_mostly pi_inject_timer = -1;
-module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+module_param(pi_inject_timer, bint, 0644);
/* Enable/disable PMU virtualization */
bool __read_mostly enable_pmu = true;
@@ -962,7 +962,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
kvm_mmu_reset_context(vcpu);
if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
- kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+ kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
}
@@ -1284,7 +1284,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
* stuff CR3, e.g. for RSM emulation, and there is no guarantee that
* the current vCPU mode is accurate.
*/
- if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+ if (!kvm_vcpu_is_legal_cr3(vcpu, cr3))
return 1;
if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
@@ -1504,6 +1504,8 @@ static unsigned num_msrs_to_save;
static const u32 emulated_msrs_all[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+
+#ifdef CONFIG_KVM_HYPERV
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
@@ -1521,6 +1523,7 @@ static const u32 emulated_msrs_all[] = {
HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
HV_X64_MSR_SYNDBG_PENDING_BUFFER,
+#endif
MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
@@ -1620,7 +1623,8 @@ static bool kvm_is_immutable_feature_msr(u32 msr)
ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
- ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
+ ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO)
static u64 kvm_get_arch_capabilities(void)
{
@@ -1652,6 +1656,8 @@ static u64 kvm_get_arch_capabilities(void)
data |= ARCH_CAP_SSB_NO;
if (!boot_cpu_has_bug(X86_BUG_MDS))
data |= ARCH_CAP_MDS_NO;
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ data |= ARCH_CAP_RFDS_NO;
if (!boot_cpu_has(X86_FEATURE_RTM)) {
/*
@@ -1701,22 +1707,17 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
struct kvm_msr_entry msr;
int r;
+ /* Unconditionally clear the output for simplicity */
+ msr.data = 0;
msr.index = index;
r = kvm_get_msr_feature(&msr);
- if (r == KVM_MSR_RET_INVALID) {
- /* Unconditionally clear the output for simplicity */
- *data = 0;
- if (kvm_msr_ignored_check(index, 0, false))
- r = 0;
- }
-
- if (r)
- return r;
+ if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false))
+ r = 0;
*data = msr.data;
- return 0;
+ return r;
}
static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -1779,6 +1780,10 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
+ if (!static_cpu_has(X86_FEATURE_XSAVES) &&
+ (efer & EFER_SVME))
+ kvm_hv_xsaves_xsavec_maybe_warn(vcpu);
+
return 0;
}
@@ -2331,14 +2336,9 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_o
if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
return;
- /*
- * The guest calculates current wall clock time by adding
- * system time (updated by kvm_guest_time_update below) to the
- * wall clock specified here. We do the reverse here.
- */
- wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+ wall_nsec = kvm_get_wall_clock_epoch(kvm);
- wc.nsec = do_div(wall_nsec, 1000000000);
+ wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
wc.version = version;
@@ -2509,32 +2509,35 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
}
#ifdef CONFIG_X86_64
-static inline int gtod_is_based_on_tsc(int mode)
+static inline bool gtod_is_based_on_tsc(int mode)
{
return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
}
#endif
-static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu, bool new_generation)
{
#ifdef CONFIG_X86_64
- bool vcpus_matched;
struct kvm_arch *ka = &vcpu->kvm->arch;
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
- vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
- atomic_read(&vcpu->kvm->online_vcpus));
+ /*
+ * To use the masterclock, the host clocksource must be based on TSC
+ * and all vCPUs must have matching TSCs. Note, the count for matching
+ * vCPUs doesn't include the reference vCPU, hence "+1".
+ */
+ bool use_master_clock = (ka->nr_vcpus_matched_tsc + 1 ==
+ atomic_read(&vcpu->kvm->online_vcpus)) &&
+ gtod_is_based_on_tsc(gtod->clock.vclock_mode);
/*
- * Once the masterclock is enabled, always perform request in
- * order to update it.
- *
- * In order to enable masterclock, the host clocksource must be TSC
- * and the vcpus need to have matched TSCs. When that happens,
- * perform request to enable masterclock.
+ * Request a masterclock update if the masterclock needs to be toggled
+ * on/off, or when starting a new generation and the masterclock is
+ * enabled (compute_guest_tsc() requires the masterclock snapshot to be
+ * taken _after_ the new generation is created).
*/
- if (ka->use_master_clock ||
- (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
+ if ((ka->use_master_clock && new_generation) ||
+ (ka->use_master_clock != use_master_clock))
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
@@ -2711,11 +2714,12 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
- kvm_track_tsc_matching(vcpu);
+ kvm_track_tsc_matching(vcpu, !matched);
}
-static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
{
+ u64 data = user_value ? *user_value : 0;
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
@@ -2730,25 +2734,37 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
if (vcpu->arch.virtual_tsc_khz) {
if (data == 0) {
/*
- * detection of vcpu initialization -- need to sync
- * with other vCPUs. This particularly helps to keep
- * kvm_clock stable after CPU hotplug
+ * Force synchronization when creating a vCPU, or when
+ * userspace explicitly writes a zero value.
*/
synchronizing = true;
- } else {
+ } else if (kvm->arch.user_set_tsc) {
u64 tsc_exp = kvm->arch.last_tsc_write +
nsec_to_cycles(vcpu, elapsed);
u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
/*
- * Special case: TSC write with a small delta (1 second)
- * of virtual cycle time against real time is
- * interpreted as an attempt to synchronize the CPU.
+ * Here lies UAPI baggage: when a user-initiated TSC write has
+ * a small delta (1 second) of virtual cycle time against the
+ * previously set vCPU, we assume that they were intended to be
+ * in sync and the delta was only due to the racy nature of the
+ * legacy API.
+ *
+ * This trick falls down when restoring a guest which genuinely
+ * has been running for less time than the 1 second of imprecision
+ * which we allow for in the legacy API. In this case, the first
+ * value written by userspace (on any vCPU) should not be subject
+ * to this 'correction' to make it sync up with values that only
+ * come from the kernel's default vCPU creation. Make the 1-second
+ * slop hack only trigger if the user_set_tsc flag is already set.
*/
synchronizing = data < tsc_exp + tsc_hz &&
data + tsc_hz > tsc_exp;
}
}
+ if (user_value)
+ kvm->arch.user_set_tsc = true;
+
/*
* For a reliable TSC, we can match TSC offsets, and for an unstable
* TSC, we add elapsed time in this computation. We could let the
@@ -3096,7 +3112,8 @@ u64 get_kvmclock_ns(struct kvm *kvm)
static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
struct gfn_to_pfn_cache *gpc,
- unsigned int offset)
+ unsigned int offset,
+ bool force_tsc_unstable)
{
struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info *guest_hv_clock;
@@ -3133,6 +3150,10 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
}
memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
+
+ if (force_tsc_unstable)
+ guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT;
+
smp_wmb();
guest_hv_clock->version = ++vcpu->hv_clock.version;
@@ -3153,6 +3174,16 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
u64 tsc_timestamp, host_tsc;
u8 pvclock_flags;
bool use_master_clock;
+#ifdef CONFIG_KVM_XEN
+ /*
+ * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
+ * explicitly told to use TSC as its clocksource Xen will not set this bit.
+ * This default behaviour led to bugs in some guest kernels which cause
+ * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
+ */
+ bool xen_pvclock_tsc_unstable =
+ ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+#endif
kernel_ns = 0;
host_tsc = 0;
@@ -3231,17 +3262,97 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.flags = pvclock_flags;
if (vcpu->pv_time.active)
- kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+ kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
+#ifdef CONFIG_KVM_XEN
if (vcpu->xen.vcpu_info_cache.active)
kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
- offsetof(struct compat_vcpu_info, time));
+ offsetof(struct compat_vcpu_info, time),
+ xen_pvclock_tsc_unstable);
if (vcpu->xen.vcpu_time_info_cache.active)
- kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
+ kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
+ xen_pvclock_tsc_unstable);
+#endif
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
/*
+ * The pvclock_wall_clock ABI tells the guest the wall clock time at
+ * which it started (i.e. its epoch, when its kvmclock was zero).
+ *
+ * In fact those clocks are subtly different; wall clock frequency is
+ * adjusted by NTP and has leap seconds, while the kvmclock is a
+ * simple function of the TSC without any such adjustment.
+ *
+ * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between
+ * that and kvmclock, but even that would be subject to change over
+ * time.
+ *
+ * Attempt to calculate the epoch at a given moment using the *same*
+ * TSC reading via kvm_get_walltime_and_clockread() to obtain both
+ * wallclock and kvmclock times, and subtracting one from the other.
+ *
+ * Fall back to using their values at slightly different moments by
+ * calling ktime_get_real_ns() and get_kvmclock_ns() separately.
+ */
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ struct pvclock_vcpu_time_info hv_clock;
+ struct kvm_arch *ka = &kvm->arch;
+ unsigned long seq, local_tsc_khz;
+ struct timespec64 ts;
+ uint64_t host_tsc;
+
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+
+ local_tsc_khz = 0;
+ if (!ka->use_master_clock)
+ break;
+
+ /*
+ * The TSC read and the call to get_cpu_tsc_khz() must happen
+ * on the same CPU.
+ */
+ get_cpu();
+
+ local_tsc_khz = get_cpu_tsc_khz();
+
+ if (local_tsc_khz &&
+ !kvm_get_walltime_and_clockread(&ts, &host_tsc))
+ local_tsc_khz = 0; /* Fall back to old method */
+
+ put_cpu();
+
+ /*
+ * These values must be snapshotted within the seqcount loop.
+ * After that, it's just mathematics which can happen on any
+ * CPU at any time.
+ */
+ hv_clock.tsc_timestamp = ka->master_cycle_now;
+ hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+
+ /*
+ * If the conditions were right, and obtaining the wallclock+TSC was
+ * successful, calculate the KVM clock at the corresponding time and
+ * subtract one from the other to get the guest's epoch in nanoseconds
+ * since 1970-01-01.
+ */
+ if (local_tsc_khz) {
+ kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
+ &hv_clock.tsc_shift,
+ &hv_clock.tsc_to_system_mul);
+ return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
+ __pvclock_read_cycles(&hv_clock, host_tsc);
+ }
+#endif
+ return ktime_get_real_ns() - get_kvmclock_ns(kvm);
+}
+
+/*
* kvmclock updates which are isolated to a given vcpu, such as
* vcpu->cpu migration, should not allow system_timestamp from
* the rest of the vcpus to remain static. Otherwise ntp frequency
@@ -3290,9 +3401,6 @@ static void kvmclock_sync_fn(struct work_struct *work)
kvmclock_sync_work);
struct kvm *kvm = container_of(ka, struct kvm, arch);
- if (!kvmclock_periodic_sync)
- return;
-
schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
@@ -3314,7 +3422,7 @@ static bool is_mci_status_msr(u32 msr)
static bool can_set_mci_status(struct kvm_vcpu *vcpu)
{
/* McStatusWrEn enabled? */
- if (guest_cpuid_is_amd_or_hygon(vcpu))
+ if (guest_cpuid_is_amd_compatible(vcpu))
return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
return false;
@@ -3671,17 +3779,36 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.perf_capabilities = data;
kvm_pmu_refresh(vcpu);
break;
- case MSR_IA32_PRED_CMD:
- if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
- return 1;
+ case MSR_IA32_PRED_CMD: {
+ u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);
+
+ if (!msr_info->host_initiated) {
+ if ((!guest_has_pred_cmd_msr(vcpu)))
+ return 1;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+ reserved_bits |= PRED_CMD_IBPB;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB))
+ reserved_bits |= PRED_CMD_SBPB;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ reserved_bits |= PRED_CMD_IBPB;
- if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
+ if (!boot_cpu_has(X86_FEATURE_SBPB))
+ reserved_bits |= PRED_CMD_SBPB;
+
+ if (data & reserved_bits)
return 1;
+
if (!data)
break;
- wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+ wrmsrl(MSR_IA32_PRED_CMD, data);
break;
+ }
case MSR_IA32_FLUSH_CMD:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
@@ -3701,13 +3828,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
data &= ~(u64)0x100; /* ignore ignne emulation enable */
data &= ~(u64)0x8; /* ignore TLB cache disable */
- /* Handle McStatusWrEn */
- if (data == BIT_ULL(18)) {
- vcpu->arch.msr_hwcr = data;
- } else if (data != 0) {
+ /*
+ * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
+ * through at least v6.6 whine if TscFreqSel is clear,
+ * depending on F/M/S.
+ */
+ if (data & ~(BIT_ULL(18) | BIT_ULL(24))) {
kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
+ vcpu->arch.msr_hwcr = data;
break;
case MSR_FAM10H_MMIO_CONF_BASE:
if (data != 0) {
@@ -3778,7 +3908,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_TSC:
if (msr_info->host_initiated) {
- kvm_synchronize_tsc(vcpu, data);
+ kvm_synchronize_tsc(vcpu, &data);
} else {
u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
adjust_tsc_offset_guest(vcpu, adj);
@@ -3915,6 +4045,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* the need to ignore the workaround.
*/
break;
+#ifdef CONFIG_KVM_HYPERV
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
case HV_X64_MSR_SYNDBG_OPTIONS:
@@ -3927,6 +4058,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_TSC_INVARIANT_CONTROL:
return kvm_hv_set_msr_common(vcpu, msr, data,
msr_info->host_initiated);
+#endif
case MSR_IA32_BBL_CR_CTL3:
/* Drop writes to this legacy MSR -- see rdmsr
* counterpart for further detail.
@@ -4272,6 +4404,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
*/
msr_info->data = 0x20000000;
break;
+#ifdef CONFIG_KVM_HYPERV
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
case HV_X64_MSR_SYNDBG_OPTIONS:
@@ -4285,6 +4418,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return kvm_hv_get_msr_common(vcpu,
msr_info->index, &msr_info->data,
msr_info->host_initiated);
+#endif
case MSR_IA32_BBL_CR_CTL3:
/* This legacy MSR exists but isn't fully documented in current
* silicon. It is however accessed by winxp in very narrow
@@ -4422,6 +4556,7 @@ static inline bool kvm_can_mwait_in_guest(void)
boot_cpu_has(X86_FEATURE_ARAT);
}
+#ifdef CONFIG_KVM_HYPERV
static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid2 __user *cpuid_arg)
{
@@ -4442,6 +4577,14 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
return 0;
}
+#endif
+
+static bool kvm_is_vm_type_supported(unsigned long type)
+{
+ return type == KVM_X86_DEFAULT_VM ||
+ (type == KVM_X86_SW_PROTECTED_VM &&
+ IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled);
+}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
@@ -4468,9 +4611,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_PIT_STATE2:
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
case KVM_CAP_VCPU_EVENTS:
+#ifdef CONFIG_KVM_HYPERV
case KVM_CAP_HYPERV:
case KVM_CAP_HYPERV_VAPIC:
case KVM_CAP_HYPERV_SPIN:
+ case KVM_CAP_HYPERV_TIME:
case KVM_CAP_HYPERV_SYNIC:
case KVM_CAP_HYPERV_SYNIC2:
case KVM_CAP_HYPERV_VP_INDEX:
@@ -4480,6 +4625,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_CPUID:
case KVM_CAP_HYPERV_ENFORCE_CPUID:
case KVM_CAP_SYS_HYPERV_CPUID:
+#endif
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -4489,7 +4635,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_KVMCLOCK_CTRL:
case KVM_CAP_READONLY_MEM:
- case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
case KVM_CAP_TSC_DEADLINE_TIMER:
case KVM_CAP_DISABLE_QUIRKS:
@@ -4520,6 +4665,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ENABLE_CAP:
case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
case KVM_CAP_IRQFD_RESAMPLE:
+ case KVM_CAP_MEMORY_FAULT_INFO:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -4533,7 +4679,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
KVM_XEN_HVM_CONFIG_SHARED_INFO |
KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
- KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
@@ -4599,12 +4746,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = kvm_x86_ops.nested_ops->get_state ?
kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
break;
+#ifdef CONFIG_KVM_HYPERV
case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
r = kvm_x86_ops.enable_l2_tlb_flush != NULL;
break;
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
break;
+#endif
case KVM_CAP_SMALLER_MAXPHYADDR:
r = (int) allow_smaller_maxphyaddr;
break;
@@ -4633,6 +4782,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_X86_NOTIFY_VMEXIT:
r = kvm_caps.has_notify_vmexit;
break;
+ case KVM_CAP_VM_TYPES:
+ r = BIT(KVM_X86_DEFAULT_VM);
+ if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM))
+ r |= BIT(KVM_X86_SW_PROTECTED_VM);
+ break;
default:
break;
}
@@ -4766,9 +4920,11 @@ long kvm_arch_dev_ioctl(struct file *filp,
case KVM_GET_MSRS:
r = msr_io(NULL, argp, do_get_msr_feature, 1);
break;
+#ifdef CONFIG_KVM_HYPERV
case KVM_GET_SUPPORTED_HV_CPUID:
r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
break;
+#endif
case KVM_GET_DEVICE_ATTR: {
struct kvm_device_attr attr;
r = -EFAULT;
@@ -5300,7 +5456,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
vcpu->arch.nmi_pending = 0;
atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
- kvm_make_request(KVM_REQ_NMI, vcpu);
+ if (events->nmi.pending)
+ kvm_make_request(KVM_REQ_NMI, vcpu);
}
static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
@@ -5413,8 +5570,8 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
- return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
- sizeof(guest_xsave->region));
+ kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
+ sizeof(guest_xsave->region));
}
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
@@ -5549,6 +5706,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
ns = get_kvmclock_base_ns();
+ kvm->arch.user_set_tsc = true;
__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
@@ -5593,14 +5751,11 @@ static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu,
static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
struct kvm_enable_cap *cap)
{
- int r;
- uint16_t vmcs_version;
- void __user *user_ptr;
-
if (cap->flags)
return -EINVAL;
switch (cap->cap) {
+#ifdef CONFIG_KVM_HYPERV
case KVM_CAP_HYPERV_SYNIC2:
if (cap->args[0])
return -EINVAL;
@@ -5612,16 +5767,22 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return kvm_hv_activate_synic(vcpu, cap->cap ==
KVM_CAP_HYPERV_SYNIC2);
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
- if (!kvm_x86_ops.nested_ops->enable_evmcs)
- return -ENOTTY;
- r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
- if (!r) {
- user_ptr = (void __user *)(uintptr_t)cap->args[0];
- if (copy_to_user(user_ptr, &vmcs_version,
- sizeof(vmcs_version)))
- r = -EFAULT;
+ {
+ int r;
+ uint16_t vmcs_version;
+ void __user *user_ptr;
+
+ if (!kvm_x86_ops.nested_ops->enable_evmcs)
+ return -ENOTTY;
+ r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
+ if (!r) {
+ user_ptr = (void __user *)(uintptr_t)cap->args[0];
+ if (copy_to_user(user_ptr, &vmcs_version,
+ sizeof(vmcs_version)))
+ r = -EFAULT;
+ }
+ return r;
}
- return r;
case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
if (!kvm_x86_ops.enable_l2_tlb_flush)
return -ENOTTY;
@@ -5630,6 +5791,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
case KVM_CAP_HYPERV_ENFORCE_CPUID:
return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
+#endif
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
vcpu->arch.pv_cpuid.enforce = cap->args[0];
@@ -6022,9 +6184,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
+#ifdef CONFIG_KVM_HYPERV
case KVM_GET_SUPPORTED_HV_CPUID:
r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
break;
+#endif
#ifdef CONFIG_KVM_XEN
case KVM_XEN_VCPU_GET_ATTR: {
struct kvm_xen_vcpu_attr xva;
@@ -6261,6 +6425,9 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
struct kvm_vcpu *vcpu;
unsigned long i;
+ if (!kvm_x86_ops.cpu_dirty_log_size)
+ return;
+
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vcpu_kick(vcpu);
}
@@ -6852,6 +7019,9 @@ set_identity_unlock:
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
+ r = -ENOENT;
+ if (!pic_in_kernel(kvm))
+ goto create_pit_unlock;
r = -ENOMEM;
kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
if (kvm->arch.vpit)
@@ -7079,6 +7249,7 @@ set_pit2_out:
r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
break;
}
+#ifdef CONFIG_KVM_HYPERV
case KVM_HYPERV_EVENTFD: {
struct kvm_hyperv_eventfd hvevfd;
@@ -7088,6 +7259,7 @@ set_pit2_out:
r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
break;
}
+#endif
case KVM_SET_PMU_EVENT_FILTER:
r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
break;
@@ -7487,11 +7659,11 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
-static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len)
+static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
- return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
- insn, insn_len);
+ return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type,
+ insn, insn_len);
}
int handle_ud(struct kvm_vcpu *vcpu)
@@ -7501,8 +7673,10 @@ int handle_ud(struct kvm_vcpu *vcpu)
int emul_type = EMULTYPE_TRAP_UD;
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
+ int r;
- if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
+ r = kvm_check_emulate_insn(vcpu, emul_type, NULL, 0);
+ if (r != X86EMUL_CONTINUE)
return 1;
if (fep_flags &&
@@ -7836,6 +8010,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (r < 0)
return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * Mark the page dirty _before_ checking whether or not the CMPXCHG was
+ * successful, as the old value is written back on failure. Note, for
+ * live migration, this is unnecessarily conservative as CMPXCHG writes
+ * back the original value and the access is atomic, but KVM's ABI is
+ * that all writes are dirty logged, regardless of the value written.
+ */
+ kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa));
+
if (r)
return X86EMUL_CMPXCHG_FAILED;
@@ -8321,6 +8505,15 @@ static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt)
kvm_vm_bugged(kvm);
}
+static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, unsigned int flags)
+{
+ if (!kvm_x86_ops.get_untagged_addr)
+ return addr;
+
+ return static_call(kvm_x86_get_untagged_addr)(emul_to_vcpu(ctxt), addr, flags);
+}
+
static const struct x86_emulate_ops emulate_ops = {
.vm_bugged = emulator_vm_bugged,
.read_gpr = emulator_read_gpr,
@@ -8365,6 +8558,7 @@ static const struct x86_emulate_ops emulate_ops = {
.leave_smm = emulator_leave_smm,
.triple_fault = emulator_triple_fault,
.set_xcr = emulator_set_xcr,
+ .get_untagged_addr = emulator_get_untagged_addr,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -8884,8 +9078,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
bool writeback = true;
- if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
- return 1;
+ r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
+ if (r != X86EMUL_CONTINUE) {
+ if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
+ return 1;
+
+ WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE);
+ return handle_emulation_failure(vcpu, emulation_type);
+ }
vcpu->arch.l1tf_flush_l1d = true;
@@ -10048,7 +10248,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
*
* But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
* the instruction or inject an exception, then KVM can incorrecty inject a new
- * asynchrounous event if the event became pending after the CPU fetched the
+ * asynchronous event if the event became pending after the CPU fetched the
* instruction (in the guest). E.g. if a page fault (#PF, #NPF, EPT violation)
* occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be
* injected on the restarted instruction instead of being deferred until the
@@ -10069,7 +10269,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
int r;
/*
- * Process nested events first, as nested VM-Exit supercedes event
+ * Process nested events first, as nested VM-Exit supersedes event
* re-injection. If there's an event queued for re-injection, it will
* be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
*/
@@ -10458,19 +10658,20 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
{
- u64 eoi_exit_bitmap[4];
-
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
+#ifdef CONFIG_KVM_HYPERV
if (to_hv_vcpu(vcpu)) {
+ u64 eoi_exit_bitmap[4];
+
bitmap_or((ulong *)eoi_exit_bitmap,
vcpu->arch.ioapic_handled_vectors,
to_hv_synic(vcpu)->vec_bitmap, 256);
static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
return;
}
-
+#endif
static_call_cond(kvm_x86_load_eoi_exitmap)(
vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
}
@@ -10561,9 +10762,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* the flushes are considered "remote" and not "local" because
* the requests can be initiated from other vCPUs.
*/
+#ifdef CONFIG_KVM_HYPERV
if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) &&
kvm_hv_vcpu_flush_tlb(vcpu))
kvm_vcpu_flush_tlb_guest(vcpu);
+#endif
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -10589,16 +10792,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
record_steal_time(vcpu);
+ if (kvm_check_request(KVM_REQ_PMU, vcpu))
+ kvm_pmu_handle_event(vcpu);
+ if (kvm_check_request(KVM_REQ_PMI, vcpu))
+ kvm_pmu_deliver_pmi(vcpu);
#ifdef CONFIG_KVM_SMM
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
#endif
if (kvm_check_request(KVM_REQ_NMI, vcpu))
process_nmi(vcpu);
- if (kvm_check_request(KVM_REQ_PMU, vcpu))
- kvm_pmu_handle_event(vcpu);
- if (kvm_check_request(KVM_REQ_PMI, vcpu))
- kvm_pmu_deliver_pmi(vcpu);
if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
if (test_bit(vcpu->arch.pending_ioapic_eoi,
@@ -10616,6 +10819,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu_load_eoi_exitmap(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
kvm_vcpu_reload_apic_access_page(vcpu);
+#ifdef CONFIG_KVM_HYPERV
if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
@@ -10646,6 +10850,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
kvm_hv_process_stimers(vcpu);
+#endif
if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
kvm_vcpu_update_apicv(vcpu);
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
@@ -10767,7 +10972,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/*
* Assert that vCPU vs. VM APICv state is consistent. An APICv
* update must kick and wait for all vCPUs before toggling the
- * per-VM state, and responsing vCPUs must wait for the update
+ * per-VM state, and responding vCPUs must wait for the update
* to complete before servicing KVM_REQ_APICV_UPDATE.
*/
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
@@ -10964,6 +11169,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
@@ -11481,7 +11687,7 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
*/
if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
return false;
- if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3))
+ if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3))
return false;
} else {
/*
@@ -11534,7 +11740,6 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
*mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
- vcpu->arch.cr0 = sregs->cr0;
*mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
@@ -11578,8 +11783,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
if (ret)
return ret;
- if (mmu_reset_needed)
+ if (mmu_reset_needed) {
kvm_mmu_reset_context(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ }
max_bits = KVM_NR_INTERRUPTS;
pending_vec = find_first_bit(
@@ -11620,8 +11827,10 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
mmu_reset_needed = 1;
vcpu->arch.pdptrs_from_userspace = true;
}
- if (mmu_reset_needed)
+ if (mmu_reset_needed) {
kvm_mmu_reset_context(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ }
return 0;
}
@@ -11972,7 +12181,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
if (mutex_lock_killable(&vcpu->mutex))
return;
vcpu_load(vcpu);
- kvm_synchronize_tsc(vcpu, 0);
+ kvm_synchronize_tsc(vcpu, NULL);
vcpu_put(vcpu);
/* poll control enabled by default */
@@ -12087,7 +12296,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
}
if (!init_event) {
- kvm_pmu_reset(vcpu);
vcpu->arch.smbase = 0x30000;
vcpu->arch.msr_misc_features_enables = 0;
@@ -12304,7 +12512,9 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
void kvm_arch_free_vm(struct kvm *kvm)
{
- kfree(to_kvm_hv(kvm)->hv_pa_pg);
+#if IS_ENABLED(CONFIG_HYPERV)
+ kfree(kvm->arch.hv_pa_pg);
+#endif
__kvm_arch_free_vm(kvm);
}
@@ -12314,9 +12524,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
int ret;
unsigned long flags;
- if (type)
+ if (!kvm_is_vm_type_supported(type))
return -EINVAL;
+ kvm->arch.vm_type = type;
+
ret = kvm_page_track_init(kvm);
if (ret)
goto out;
@@ -12328,7 +12540,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
goto out_uninit_mmu;
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
- INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -12456,8 +12667,8 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
hva = slot->userspace_addr;
}
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- struct kvm_userspace_memory_region m;
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ struct kvm_userspace_memory_region2 m;
m.slot = id | (i << 16);
m.flags = 0;
@@ -12607,6 +12818,10 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
}
}
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+ kvm_mmu_init_memslot_memory_attributes(kvm, slot);
+#endif
+
if (kvm_page_track_create_memslot(kvm, slot, npages))
goto out_free;
@@ -12912,7 +13127,10 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return true;
- return vcpu->arch.preempted_in_kernel;
+ if (vcpu != kvm_get_running_vcpu())
+ return vcpu->arch.preempted_in_kernel;
+
+ return static_call(kvm_x86_get_cpl)(vcpu) == 0;
}
unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
@@ -13204,15 +13422,30 @@ bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
+static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
+{
+ /*
+ * Non-coherent DMA assignment and de-assignment will affect
+ * whether KVM honors guest MTRRs and cause changes in memtypes
+ * in TDP.
+ * So, pass %true unconditionally to indicate non-coherent DMA was,
+ * or will be involved, and that zapping SPTEs might be necessary.
+ */
+ if (__kvm_mmu_honors_guest_mtrrs(true))
+ kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
+}
+
void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
{
- atomic_inc(&kvm->arch.noncoherent_dma_count);
+ if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
+ kvm_noncoherent_dma_assignment_start_or_stop(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
{
- atomic_dec(&kvm->arch.noncoherent_dma_count);
+ if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
+ kvm_noncoherent_dma_assignment_start_or_stop(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
@@ -13399,6 +13632,10 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
switch (type) {
case INVPCID_TYPE_INDIV_ADDR:
+ /*
+ * LAM doesn't apply to addresses that are inputs to TLB
+ * invalidation.
+ */
if ((!pcid_enabled && (operand.pcid != 0)) ||
is_noncanonical_address(operand.gla, vcpu)) {
kvm_inject_gp(vcpu, 0);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 1e7be1f6a..2f7e19166 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -293,6 +293,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
@@ -529,6 +530,8 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
__reserved_bits |= X86_CR4_VMXE; \
if (!__cpu_has(__c, X86_FEATURE_PCID)) \
__reserved_bits |= X86_CR4_PCIDE; \
+ if (!__cpu_has(__c, X86_FEATURE_LAM)) \
+ __reserved_bits |= X86_CR4_LAM_SUP; \
__reserved_bits; \
})
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 40edf4d19..b0212ba2d 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -59,7 +59,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
* This code mirrors kvm_write_wall_clock() except that it writes
* directly through the pfn cache and doesn't mark the page dirty.
*/
- wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+ wall_nsec = kvm_get_wall_clock_epoch(kvm);
/* It could be invalid again already, so we need to check */
read_lock_irq(&gpc->lock);
@@ -98,7 +98,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
wc_version = wc->version = (wc->version + 1) | 1;
smp_wmb();
- wc->nsec = do_div(wall_nsec, 1000000000);
+ wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
wc->sec = (u32)wall_nsec;
*wc_sec_hi = wall_nsec >> 32;
smp_wmb();
@@ -134,9 +134,23 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
{
struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
arch.xen.timer);
+ struct kvm_xen_evtchn e;
+ int rc;
+
if (atomic_read(&vcpu->arch.xen.timer_pending))
return HRTIMER_NORESTART;
+ e.vcpu_id = vcpu->vcpu_id;
+ e.vcpu_idx = vcpu->vcpu_idx;
+ e.port = vcpu->arch.xen.timer_virq;
+ e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
+ if (rc != -EWOULDBLOCK) {
+ vcpu->arch.xen.timer_expires = 0;
+ return HRTIMER_NORESTART;
+ }
+
atomic_inc(&vcpu->arch.xen.timer_pending);
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
kvm_vcpu_kick(vcpu);
@@ -146,6 +160,14 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
{
+ /*
+ * Avoid races with the old timer firing. Checking timer_expires
+ * to avoid calling hrtimer_cancel() will only have false positives
+ * so is fine.
+ */
+ if (vcpu->arch.xen.timer_expires)
+ hrtimer_cancel(&vcpu->arch.xen.timer);
+
atomic_set(&vcpu->arch.xen.timer_pending, 0);
vcpu->arch.xen.timer_expires = guest_abs;
@@ -471,7 +493,7 @@ void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
}
-static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
{
struct kvm_lapic_irq irq = { };
int r;
@@ -1019,9 +1041,36 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+ /*
+ * Ensure a consistent snapshot of state is captured, with a
+ * timer either being pending, or the event channel delivered
+ * to the corresponding bit in the shared_info. Not still
+ * lurking in the timer_pending flag for deferred delivery.
+ * Purely as an optimisation, if the timer_expires field is
+ * zero, that means the timer isn't active (or even in the
+ * timer_pending flag) and there is no need to cancel it.
+ */
+ if (vcpu->arch.xen.timer_expires) {
+ hrtimer_cancel(&vcpu->arch.xen.timer);
+ kvm_xen_inject_timer_irqs(vcpu);
+ }
+
data->u.timer.port = vcpu->arch.xen.timer_virq;
data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
+
+ /*
+ * The hrtimer may trigger and raise the IRQ immediately,
+ * while the returned state causes it to be set up and
+ * raised again on the destination system after migration.
+ * That's fine, as the guest won't even have had a chance
+ * to run and handle the interrupt. Asserting an already
+ * pending event channel is idempotent.
+ */
+ if (vcpu->arch.xen.timer_expires)
+ hrtimer_start_expires(&vcpu->arch.xen.timer,
+ HRTIMER_MODE_ABS_HARD);
+
r = 0;
break;
@@ -1113,7 +1162,9 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
{
/* Only some feature flags need to be *enabled* by userspace */
u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
- KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+ u32 old_flags;
if (xhc->flags & ~permitted_flags)
return -EINVAL;
@@ -1134,9 +1185,14 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
static_branch_slow_dec_deferred(&kvm_xen_enabled);
+ old_flags = kvm->arch.xen_hvm_config.flags;
memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
mutex_unlock(&kvm->arch.xen.xen_lock);
+
+ if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
+
return 0;
}
@@ -1374,12 +1430,8 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
return true;
}
+ /* A delta <= 0 results in an immediate callback, which is what we want */
delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
- if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) {
- *r = -ETIME;
- return true;
- }
-
kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
*r = 0;
return true;
@@ -2043,7 +2095,7 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
if (ret < 0 && ret != -ENOTCONN)
return false;
} else {
- eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1);
+ eventfd_signal(evtchnfd->deliver.eventfd.ctx);
}
*r = 0;
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index f8f1fe22d..f5841d900 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -18,6 +18,7 @@ extern struct static_key_false_deferred kvm_xen_enabled;
int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *vcpu);
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
@@ -36,6 +37,19 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
const struct kvm_irq_routing_entry *ue);
void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu);
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+ /*
+ * The local APIC is being enabled. If the per-vCPU upcall vector is
+ * set and the vCPU's evtchn_upcall_pending flag is set, inject the
+ * interrupt.
+ */
+ if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.vcpu_info_cache.active &&
+ vcpu->arch.xen.upcall_vector && __kvm_xen_has_interrupt(vcpu))
+ kvm_xen_inject_vcpu_vector(vcpu);
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return static_branch_unlikely(&kvm_xen_enabled.key) &&
@@ -101,6 +115,10 @@ static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
{
}
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return false;
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index ea3a28e7b..f0dae4fb6 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -14,19 +14,6 @@ ifdef CONFIG_KCSAN
CFLAGS_REMOVE_delay.o = $(CC_FLAGS_FTRACE)
endif
-# Early boot use of cmdline; don't instrument it
-ifdef CONFIG_AMD_MEM_ENCRYPT
-KCOV_INSTRUMENT_cmdline.o := n
-KASAN_SANITIZE_cmdline.o := n
-KCSAN_SANITIZE_cmdline.o := n
-
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_cmdline.o = -pg
-endif
-
-CFLAGS_cmdline.o := -fno-stack-protector -fno-jump-tables
-endif
-
inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
quiet_cmd_inat_tables = GEN $@
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
index 7c48ff4ae..7af743bd3 100644
--- a/arch/x86/lib/cache-smp.c
+++ b/arch/x86/lib/cache-smp.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#include <asm/paravirt.h>
#include <linux/smp.h>
#include <linux/export.h>
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 23318c338..68f7fa3e1 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -21,10 +21,10 @@
* converted to pure assembler
*/
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/asm.h>
-#include <asm/export.h>
#include <asm/nospec-branch.h>
/*
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index f74a3e704..2760a15fb 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-only */
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/asm.h>
-#include <asm/export.h>
/*
* Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
index 49805257b..873e4ef23 100644
--- a/arch/x86/lib/cmpxchg8b_emu.S
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-only */
+#include <linux/export.h>
#include <linux/linkage.h>
-#include <asm/export.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 30ea644bf..d6ae793d0 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -1,10 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
-#include <asm/export.h>
/*
* Some CPUs run faster using the string copy instructions (sane microcode).
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 0a81aafed..fc9fb5d06 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -6,11 +6,11 @@
* Functions to copy from and to user space.
*/
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/asm.h>
-#include <asm/export.h>
/*
* rep_movs_alternative - memory copy with exception handling.
diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S
index 5c5f38d32..2918e36ee 100644
--- a/arch/x86/lib/copy_user_uncached_64.S
+++ b/arch/x86/lib/copy_user_uncached_64.S
@@ -3,9 +3,9 @@
* Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
*/
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/asm.h>
-#include <asm/export.h>
/*
* copy_user_nocache - Uncached memory copy with exception handling
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 145f9a0bd..f4df4d241 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -14,8 +14,6 @@
* @src: source address (user space)
* @dst: destination address
* @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
- * @errp: set to -EFAULT for an bad source address.
*
* Returns an 32bit unfolded checksum of the buffer.
* src and dst are best aligned to 64bits.
@@ -38,8 +36,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len)
* @src: source address
* @dst: destination address (user space)
* @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
- * @errp: set to -EFAULT for an bad destination address.
*
* Returns an 32bit unfolded checksum of the buffer.
* src and dst are best aligned to 64bits.
@@ -62,7 +58,6 @@ csum_and_copy_to_user(const void *src, void __user *dst, int len)
* @src: source address
* @dst: destination address
* @len: number of bytes to be copied.
- * @sum: initial sum that is added into the result (32bit unfolded)
*
* Returns an 32bit unfolded checksum of the buffer.
*/
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 0e65d00e2..23f81ca3f 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -128,7 +128,7 @@ static void delay_halt_mwaitx(u64 unused, u64 cycles)
delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles);
/*
- * Use cpu_tss_rw as a cacheline-aligned, seldomly accessed per-cpu
+ * Use cpu_tss_rw as a cacheline-aligned, seldom accessed per-cpu
* variable as the monitor target.
*/
__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 9c6371347..10d5ed8b5 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -26,6 +26,7 @@
* as they get called from within inline assembly.
*/
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/page_types.h>
#include <asm/errno.h>
@@ -33,7 +34,6 @@
#include <asm/thread_info.h>
#include <asm/asm.h>
#include <asm/smap.h>
-#include <asm/export.h>
#define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC
@@ -163,23 +163,23 @@ SYM_CODE_END(__get_user_8_handle_exception)
#endif
/* get_user */
- _ASM_EXTABLE(1b, __get_user_handle_exception)
- _ASM_EXTABLE(2b, __get_user_handle_exception)
- _ASM_EXTABLE(3b, __get_user_handle_exception)
+ _ASM_EXTABLE_UA(1b, __get_user_handle_exception)
+ _ASM_EXTABLE_UA(2b, __get_user_handle_exception)
+ _ASM_EXTABLE_UA(3b, __get_user_handle_exception)
#ifdef CONFIG_X86_64
- _ASM_EXTABLE(4b, __get_user_handle_exception)
+ _ASM_EXTABLE_UA(4b, __get_user_handle_exception)
#else
- _ASM_EXTABLE(4b, __get_user_8_handle_exception)
- _ASM_EXTABLE(5b, __get_user_8_handle_exception)
+ _ASM_EXTABLE_UA(4b, __get_user_8_handle_exception)
+ _ASM_EXTABLE_UA(5b, __get_user_8_handle_exception)
#endif
/* __get_user */
- _ASM_EXTABLE(6b, __get_user_handle_exception)
- _ASM_EXTABLE(7b, __get_user_handle_exception)
- _ASM_EXTABLE(8b, __get_user_handle_exception)
+ _ASM_EXTABLE_UA(6b, __get_user_handle_exception)
+ _ASM_EXTABLE_UA(7b, __get_user_handle_exception)
+ _ASM_EXTABLE_UA(8b, __get_user_handle_exception)
#ifdef CONFIG_X86_64
- _ASM_EXTABLE(9b, __get_user_handle_exception)
+ _ASM_EXTABLE_UA(9b, __get_user_handle_exception)
#else
- _ASM_EXTABLE(9b, __get_user_8_handle_exception)
- _ASM_EXTABLE(10b, __get_user_8_handle_exception)
+ _ASM_EXTABLE_UA(9b, __get_user_8_handle_exception)
+ _ASM_EXTABLE_UA(10b, __get_user_8_handle_exception)
#endif
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
index 12c16c6aa..774bdf3e6 100644
--- a/arch/x86/lib/hweight.S
+++ b/arch/x86/lib/hweight.S
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/export.h>
#include <linux/linkage.h>
-#include <asm/export.h>
#include <asm/asm.h>
@@ -36,8 +36,12 @@ SYM_FUNC_START(__sw_hweight32)
SYM_FUNC_END(__sw_hweight32)
EXPORT_SYMBOL(__sw_hweight32)
-SYM_FUNC_START(__sw_hweight64)
+/*
+ * No 32-bit variant, because it's implemented as an inline wrapper
+ * on top of __arch_hweight32():
+ */
#ifdef CONFIG_X86_64
+SYM_FUNC_START(__sw_hweight64)
pushq %rdi
pushq %rdx
@@ -66,18 +70,6 @@ SYM_FUNC_START(__sw_hweight64)
popq %rdx
popq %rdi
RET
-#else /* CONFIG_X86_32 */
- /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
- pushl %ecx
-
- call __sw_hweight32
- movl %eax, %ecx # stash away result
- movl %edx, %eax # second part of input
- call __sw_hweight32
- addl %ecx, %eax # result
-
- popl %ecx
- RET
-#endif
SYM_FUNC_END(__sw_hweight64)
EXPORT_SYMBOL(__sw_hweight64)
+#endif
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 76697df8d..0ae2e1712 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,12 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */
+#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
-#include <asm/export.h>
.section .noinstr.text, "ax"
diff --git a/arch/x86/lib/memmove_32.S b/arch/x86/lib/memmove_32.S
index 0588b2c0f..35010ba3d 100644
--- a/arch/x86/lib/memmove_32.S
+++ b/arch/x86/lib/memmove_32.S
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/export.h>
#include <linux/linkage.h>
-#include <asm/export.h>
SYM_FUNC_START(memmove)
/*
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index ccdf3a597..1b60ae81e 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -6,10 +6,10 @@
* This assembly file is re-written from memmove_64.c file.
* - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
*/
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
-#include <asm/export.h>
#undef memmove
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 3d818b849..0199d56cb 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -1,10 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
-#include <asm/export.h>
.section .noinstr.text, "ax"
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 235bbda6f..975c9c182 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -11,13 +11,12 @@
* return an error value in addition to the "real"
* return value.
*/
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/thread_info.h>
#include <asm/errno.h>
#include <asm/asm.h>
#include <asm/smap.h>
-#include <asm/export.h>
-
/*
* __put_user_X
@@ -134,15 +133,15 @@ SYM_CODE_START_LOCAL(__put_user_handle_exception)
RET
SYM_CODE_END(__put_user_handle_exception)
- _ASM_EXTABLE(1b, __put_user_handle_exception)
- _ASM_EXTABLE(2b, __put_user_handle_exception)
- _ASM_EXTABLE(3b, __put_user_handle_exception)
- _ASM_EXTABLE(4b, __put_user_handle_exception)
- _ASM_EXTABLE(5b, __put_user_handle_exception)
- _ASM_EXTABLE(6b, __put_user_handle_exception)
- _ASM_EXTABLE(7b, __put_user_handle_exception)
- _ASM_EXTABLE(9b, __put_user_handle_exception)
+ _ASM_EXTABLE_UA(1b, __put_user_handle_exception)
+ _ASM_EXTABLE_UA(2b, __put_user_handle_exception)
+ _ASM_EXTABLE_UA(3b, __put_user_handle_exception)
+ _ASM_EXTABLE_UA(4b, __put_user_handle_exception)
+ _ASM_EXTABLE_UA(5b, __put_user_handle_exception)
+ _ASM_EXTABLE_UA(6b, __put_user_handle_exception)
+ _ASM_EXTABLE_UA(7b, __put_user_handle_exception)
+ _ASM_EXTABLE_UA(9b, __put_user_handle_exception)
#ifdef CONFIG_X86_32
- _ASM_EXTABLE(8b, __put_user_handle_exception)
- _ASM_EXTABLE(10b, __put_user_handle_exception)
+ _ASM_EXTABLE_UA(8b, __put_user_handle_exception)
+ _ASM_EXTABLE_UA(10b, __put_user_handle_exception)
#endif
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index cd86aeb5f..5b26a952e 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -1,12 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/export.h>
#include <linux/stringify.h>
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/asm-offsets.h>
-#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
#include <asm/percpu.h>
@@ -126,11 +126,19 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array)
#include <asm/GEN-for-each-reg.h>
#undef GEN
#endif
+
+#ifdef CONFIG_RETHUNK
+
/*
- * This function name is magical and is used by -mfunction-return=thunk-extern
- * for the compiler to generate JMPs to it.
+ * Be careful here: that label cannot really be removed because in
+ * some configurations and toolchains, the JMP __x86_return_thunk the
+ * compiler issues is either a short one or the compiler doesn't use
+ * relocations for same-section JMPs and that breaks the returns
+ * detection logic in apply_returns() and in objtool.
*/
-#ifdef CONFIG_RETHUNK
+ .section .text..__x86.return_thunk
+
+#ifdef CONFIG_CPU_SRSO
/*
* srso_alias_untrain_ret() and srso_alias_safe_ret() are placed at
@@ -147,10 +155,8 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array)
*
* As a result, srso_alias_safe_ret() becomes a safe return.
*/
-#ifdef CONFIG_CPU_SRSO
- .section .text..__x86.rethunk_untrain
-
-SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+ .pushsection .text..__x86.rethunk_untrain
+SYM_CODE_START_NOALIGN(srso_alias_untrain_ret)
UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
ASM_NOP2
@@ -158,18 +164,10 @@ SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
jmp srso_alias_return_thunk
SYM_FUNC_END(srso_alias_untrain_ret)
__EXPORT_THUNK(srso_alias_untrain_ret)
+ .popsection
- .section .text..__x86.rethunk_safe
-#else
-/* dummy definition for alternatives */
-SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
- ANNOTATE_UNRET_SAFE
- ret
- int3
-SYM_FUNC_END(srso_alias_untrain_ret)
-#endif
-
-SYM_START(srso_alias_safe_ret, SYM_L_GLOBAL, SYM_A_NONE)
+ .pushsection .text..__x86.rethunk_safe
+SYM_CODE_START_NOALIGN(srso_alias_safe_ret)
lea 8(%_ASM_SP), %_ASM_SP
UNWIND_HINT_FUNC
ANNOTATE_UNRET_SAFE
@@ -177,14 +175,69 @@ SYM_START(srso_alias_safe_ret, SYM_L_GLOBAL, SYM_A_NONE)
int3
SYM_FUNC_END(srso_alias_safe_ret)
- .section .text..__x86.return_thunk
-
-SYM_CODE_START(srso_alias_return_thunk)
+SYM_CODE_START_NOALIGN(srso_alias_return_thunk)
UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
call srso_alias_safe_ret
ud2
SYM_CODE_END(srso_alias_return_thunk)
+ .popsection
+
+/*
+ * SRSO untraining sequence for Zen1/2, similar to retbleed_untrain_ret()
+ * above. On kernel entry, srso_untrain_ret() is executed which is a
+ *
+ * movabs $0xccccc30824648d48,%rax
+ *
+ * and when the return thunk executes the inner label srso_safe_ret()
+ * later, it is a stack manipulation and a RET which is mispredicted and
+ * thus a "safe" one to use.
+ */
+ .align 64
+ .skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc
+SYM_CODE_START_LOCAL_NOALIGN(srso_untrain_ret)
+ ANNOTATE_NOENDBR
+ .byte 0x48, 0xb8
+
+/*
+ * This forces the function return instruction to speculate into a trap
+ * (UD2 in srso_return_thunk() below). This RET will then mispredict
+ * and execution will continue at the return site read from the top of
+ * the stack.
+ */
+SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
+ lea 8(%_ASM_SP), %_ASM_SP
+ ret
+ int3
+ int3
+ /* end of movabs */
+ lfence
+ call srso_safe_ret
+ ud2
+SYM_CODE_END(srso_safe_ret)
+SYM_FUNC_END(srso_untrain_ret)
+
+SYM_CODE_START(srso_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ call srso_safe_ret
+ ud2
+SYM_CODE_END(srso_return_thunk)
+
+#define JMP_SRSO_UNTRAIN_RET "jmp srso_untrain_ret"
+#else /* !CONFIG_CPU_SRSO */
+#define JMP_SRSO_UNTRAIN_RET "ud2"
+/* Dummy for the alternative in CALL_UNTRAIN_RET. */
+SYM_CODE_START(srso_alias_untrain_ret)
+ ANNOTATE_UNRET_SAFE
+ ANNOTATE_NOENDBR
+ ret
+ int3
+SYM_FUNC_END(srso_alias_untrain_ret)
+__EXPORT_THUNK(srso_alias_untrain_ret)
+#endif /* CONFIG_CPU_SRSO */
+
+#ifdef CONFIG_CPU_UNRET_ENTRY
/*
* Some generic notes on the untraining sequences:
@@ -216,7 +269,7 @@ SYM_CODE_END(srso_alias_return_thunk)
*/
.align 64
.skip 64 - (retbleed_return_thunk - retbleed_untrain_ret), 0xcc
-SYM_START(retbleed_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+SYM_CODE_START_LOCAL_NOALIGN(retbleed_untrain_ret)
ANNOTATE_NOENDBR
/*
* As executed from retbleed_untrain_ret, this is:
@@ -264,72 +317,25 @@ SYM_CODE_END(retbleed_return_thunk)
jmp retbleed_return_thunk
int3
SYM_FUNC_END(retbleed_untrain_ret)
-__EXPORT_THUNK(retbleed_untrain_ret)
-/*
- * SRSO untraining sequence for Zen1/2, similar to retbleed_untrain_ret()
- * above. On kernel entry, srso_untrain_ret() is executed which is a
- *
- * movabs $0xccccc30824648d48,%rax
- *
- * and when the return thunk executes the inner label srso_safe_ret()
- * later, it is a stack manipulation and a RET which is mispredicted and
- * thus a "safe" one to use.
- */
- .align 64
- .skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc
-SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
- ANNOTATE_NOENDBR
- .byte 0x48, 0xb8
+#define JMP_RETBLEED_UNTRAIN_RET "jmp retbleed_untrain_ret"
+#else /* !CONFIG_CPU_UNRET_ENTRY */
+#define JMP_RETBLEED_UNTRAIN_RET "ud2"
+#endif /* CONFIG_CPU_UNRET_ENTRY */
-/*
- * This forces the function return instruction to speculate into a trap
- * (UD2 in srso_return_thunk() below). This RET will then mispredict
- * and execution will continue at the return site read from the top of
- * the stack.
- */
-SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
- lea 8(%_ASM_SP), %_ASM_SP
- ret
- int3
- int3
- /* end of movabs */
- lfence
- call srso_safe_ret
- ud2
-SYM_CODE_END(srso_safe_ret)
-SYM_FUNC_END(srso_untrain_ret)
-__EXPORT_THUNK(srso_untrain_ret)
-
-SYM_CODE_START(srso_return_thunk)
- UNWIND_HINT_FUNC
- ANNOTATE_NOENDBR
- call srso_safe_ret
- ud2
-SYM_CODE_END(srso_return_thunk)
+#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)
SYM_FUNC_START(entry_untrain_ret)
- ALTERNATIVE_2 "jmp retbleed_untrain_ret", \
- "jmp srso_untrain_ret", X86_FEATURE_SRSO, \
- "jmp srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS
+ ALTERNATIVE JMP_RETBLEED_UNTRAIN_RET, JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO
SYM_FUNC_END(entry_untrain_ret)
__EXPORT_THUNK(entry_untrain_ret)
-SYM_CODE_START(__x86_return_thunk)
- UNWIND_HINT_FUNC
- ANNOTATE_NOENDBR
- ANNOTATE_UNRET_SAFE
- ret
- int3
-SYM_CODE_END(__x86_return_thunk)
-EXPORT_SYMBOL(__x86_return_thunk)
-
-#endif /* CONFIG_RETHUNK */
+#endif /* CONFIG_CPU_UNRET_ENTRY || CONFIG_CPU_SRSO */
#ifdef CONFIG_CALL_DEPTH_TRACKING
.align 64
-SYM_FUNC_START(__x86_return_skl)
+SYM_FUNC_START(call_depth_return_thunk)
ANNOTATE_NOENDBR
/*
* Keep the hotpath in a 16byte I-fetch for the non-debug
@@ -356,6 +362,33 @@ SYM_FUNC_START(__x86_return_skl)
ANNOTATE_UNRET_SAFE
ret
int3
-SYM_FUNC_END(__x86_return_skl)
+SYM_FUNC_END(call_depth_return_thunk)
#endif /* CONFIG_CALL_DEPTH_TRACKING */
+
+/*
+ * This function name is magical and is used by -mfunction-return=thunk-extern
+ * for the compiler to generate JMPs to it.
+ *
+ * This code is only used during kernel boot or module init. All
+ * 'JMP __x86_return_thunk' sites are changed to something else by
+ * apply_returns().
+ *
+ * This should be converted eventually to call a warning function which
+ * should scream loudly when the default return thunk is called after
+ * alternatives have been applied.
+ *
+ * That warning function cannot BUG() because the bug splat cannot be
+ * displayed in all possible configurations, leading to users not really
+ * knowing why the machine froze.
+ */
+SYM_CODE_START(__x86_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+SYM_CODE_END(__x86_return_thunk)
+EXPORT_SYMBOL(__x86_return_thunk)
+
+#endif /* CONFIG_RETHUNK */
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 5168ee036..d1ccd06c5 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -148,7 +148,7 @@ AVXcode:
65: SEG=GS (Prefix)
66: Operand-Size (Prefix)
67: Address-Size (Prefix)
-68: PUSH Iz (d64)
+68: PUSH Iz
69: IMUL Gv,Ev,Iz
6a: PUSH Ib (d64)
6b: IMUL Gv,Ev,Ib
@@ -698,10 +698,10 @@ AVXcode: 2
4d: vrcp14ss/d Vsd,Hpd,Wsd (66),(ev)
4e: vrsqrt14ps/d Vpd,Wpd (66),(ev)
4f: vrsqrt14ss/d Vsd,Hsd,Wsd (66),(ev)
-50: vpdpbusd Vx,Hx,Wx (66),(ev)
-51: vpdpbusds Vx,Hx,Wx (66),(ev)
-52: vdpbf16ps Vx,Hx,Wx (F3),(ev) | vpdpwssd Vx,Hx,Wx (66),(ev) | vp4dpwssd Vdqq,Hdqq,Wdq (F2),(ev)
-53: vpdpwssds Vx,Hx,Wx (66),(ev) | vp4dpwssds Vdqq,Hdqq,Wdq (F2),(ev)
+50: vpdpbusd Vx,Hx,Wx (66)
+51: vpdpbusds Vx,Hx,Wx (66)
+52: vdpbf16ps Vx,Hx,Wx (F3),(ev) | vpdpwssd Vx,Hx,Wx (66) | vp4dpwssd Vdqq,Hdqq,Wdq (F2),(ev)
+53: vpdpwssds Vx,Hx,Wx (66) | vp4dpwssds Vdqq,Hdqq,Wdq (F2),(ev)
54: vpopcntb/w Vx,Wx (66),(ev)
55: vpopcntd/q Vx,Wx (66),(ev)
58: vpbroadcastd Vx,Wx (66),(v)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ab778eac1..e604d2d6c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -376,7 +376,7 @@ static void dump_pagetable(unsigned long address)
goto bad;
pr_cont("PUD %lx ", pud_val(*pud));
- if (!pud_present(*pud) || pud_large(*pud))
+ if (!pud_present(*pud) || pud_leaf(*pud))
goto out;
pmd = pmd_offset(pud, address);
@@ -717,39 +717,8 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
WARN_ON_ONCE(user_mode(regs));
/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
- /*
- * Any interrupt that takes a fault gets the fixup. This makes
- * the below recursive fault logic only apply to a faults from
- * task context.
- */
- if (in_interrupt())
- return;
-
- /*
- * Per the above we're !in_interrupt(), aka. task context.
- *
- * In this case we need to make sure we're not recursively
- * faulting through the emulate_vsyscall() logic.
- */
- if (current->thread.sig_on_uaccess_err && signal) {
- sanitize_error_code(address, &error_code);
-
- set_signal_archinfo(address, error_code);
-
- if (si_code == SEGV_PKUERR) {
- force_sig_pkuerr((void __user *)address, pkey);
- } else {
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
- }
- }
-
- /*
- * Barring that, we can do the fixup and be happy.
- */
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address))
return;
- }
/*
* AMD erratum #91 manifests as a spurious page fault on a PREFETCH
@@ -798,15 +767,6 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
show_opcodes(regs, loglvl);
}
-/*
- * The (legacy) vsyscall page is the long page in the kernel portion
- * of the address space that has user-accessible permissions.
- */
-static bool is_vsyscall_vaddr(unsigned long vaddr)
-{
- return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
-}
-
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address, u32 pkey, int si_code)
@@ -1046,7 +1006,7 @@ spurious_kernel_fault(unsigned long error_code, unsigned long address)
if (!pud_present(*pud))
return 0;
- if (pud_large(*pud))
+ if (pud_leaf(*pud))
return spurious_kernel_fault_check(error_code, (pte_t *) pud);
pmd = pmd_offset(pud, address);
@@ -1370,6 +1330,8 @@ void do_user_addr_fault(struct pt_regs *regs,
goto done;
}
count_vm_vma_lock_event(VMA_LOCK_RETRY);
+ if (fault & VM_FAULT_MAJOR)
+ flags |= FAULT_FLAG_TRIED;
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a190aae8c..534436c9d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -617,7 +617,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
}
if (!pud_none(*pud)) {
- if (!pud_large(*pud)) {
+ if (!pud_leaf(*pud)) {
pmd = pmd_offset(pud, 0);
paddr_last = phys_pmd_init(pmd, paddr,
paddr_end,
@@ -1013,7 +1013,7 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
return;
}
- /* free a pte talbe */
+ /* free a pte table */
free_pagetable(pmd_page(*pmd), 0);
spin_lock(&init_mm.page_table_lock);
pmd_clear(pmd);
@@ -1031,7 +1031,7 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
return;
}
- /* free a pmd talbe */
+ /* free a pmd table */
free_pagetable(pud_page(*pud), 0);
spin_lock(&init_mm.page_table_lock);
pud_clear(pud);
@@ -1049,7 +1049,7 @@ static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
return;
}
- /* free a pud talbe */
+ /* free a pud table */
free_pagetable(p4d_page(*p4d), 0);
spin_lock(&init_mm.page_table_lock);
p4d_clear(p4d);
@@ -1163,7 +1163,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
if (!pud_present(*pud))
continue;
- if (pud_large(*pud) &&
+ if (pud_leaf(*pud) &&
IS_ALIGNED(addr, PUD_SIZE) &&
IS_ALIGNED(next, PUD_SIZE)) {
spin_lock(&init_mm.page_table_lock);
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 0302491d7..fcf508c52 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -115,7 +115,7 @@ static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (!pud_large(*pud))
+ if (!pud_leaf(*pud))
kasan_populate_pud(pud, addr, next, nid);
} while (pud++, addr = next, addr != end);
}
diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c
index 6993f026a..42115ac07 100644
--- a/arch/x86/mm/maccess.c
+++ b/arch/x86/mm/maccess.c
@@ -3,6 +3,8 @@
#include <linux/uaccess.h>
#include <linux/kernel.h>
+#include <asm/vsyscall.h>
+
#ifdef CONFIG_X86_64
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
@@ -16,6 +18,14 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
return false;
/*
+ * Reading from the vsyscall page may cause an unhandled fault in
+ * certain cases. Though it is at an address above TASK_SIZE_MAX, it is
+ * usually considered as a user space address.
+ */
+ if (is_vsyscall_vaddr(vaddr))
+ return false;
+
+ /*
* Allow everything during early boot before 'x86_virt_bits'
* is initialized. Needed for instruction decoding in early
* exception handlers.
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 9f27e14e1..c290c55b6 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -12,6 +12,7 @@
#include <linux/swiotlb.h>
#include <linux/cc_platform.h>
#include <linux/mem_encrypt.h>
+#include <linux/virtio_anchor.h>
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
bool force_dma_unencrypted(struct device *dev)
@@ -86,3 +87,36 @@ void __init mem_encrypt_init(void)
print_mem_encrypt_feature_info();
}
+
+void __init mem_encrypt_setup_arch(void)
+{
+ phys_addr_t total_mem = memblock_phys_mem_size();
+ unsigned long size;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ return;
+
+ /*
+ * For SEV and TDX, all DMA has to occur via shared/unencrypted pages.
+ * Kernel uses SWIOTLB to make this happen without changing device
+ * drivers. However, depending on the workload being run, the
+ * default 64MB of SWIOTLB may not be enough and SWIOTLB may
+ * run out of buffers for DMA, resulting in I/O errors and/or
+ * performance degradation especially with high I/O workloads.
+ *
+ * Adjust the default size of SWIOTLB using a percentage of guest
+ * memory for SWIOTLB buffers. Also, as the SWIOTLB bounce buffer
+ * memory is allocated from low memory, ensure that the adjusted size
+ * is within the limits of low available memory.
+ *
+ * The percentage of guest memory used here for SWIOTLB buffers
+ * is more of an approximation of the static adjustment which
+ * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
+ */
+ size = total_mem * 6 / 100;
+ size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
+ swiotlb_adjust_size(size);
+
+ /* Set restricted memory access for virtio. */
+ virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
+}
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 45ff95264..94cd06d4b 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -19,8 +19,6 @@
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/dma-mapping.h>
-#include <linux/virtio_config.h>
-#include <linux/virtio_anchor.h>
#include <linux/cc_platform.h>
#include <asm/tlbflush.h>
@@ -216,40 +214,6 @@ void __init sme_map_bootdata(char *real_mode_data)
__sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
}
-void __init sev_setup_arch(void)
-{
- phys_addr_t total_mem = memblock_phys_mem_size();
- unsigned long size;
-
- if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
- return;
-
- /*
- * For SEV, all DMA has to occur via shared/unencrypted pages.
- * SEV uses SWIOTLB to make this happen without changing device
- * drivers. However, depending on the workload being run, the
- * default 64MB of SWIOTLB may not be enough and SWIOTLB may
- * run out of buffers for DMA, resulting in I/O errors and/or
- * performance degradation especially with high I/O workloads.
- *
- * Adjust the default size of SWIOTLB for SEV guests using
- * a percentage of guest memory for SWIOTLB buffers.
- * Also, as the SWIOTLB bounce buffer memory is allocated
- * from low memory, ensure that the adjusted size is within
- * the limits of low available memory.
- *
- * The percentage of guest memory used here for SWIOTLB buffers
- * is more of an approximation of the static adjustment which
- * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
- */
- size = total_mem * 6 / 100;
- size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
- swiotlb_adjust_size(size);
-
- /* Set restricted memory access for virtio. */
- virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
-}
-
static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
{
unsigned long pfn = 0;
@@ -528,6 +492,24 @@ void __init sme_early_init(void)
*/
if (sev_status & MSR_AMD64_SEV_ENABLED)
ia32_disable();
+
+ /*
+ * Override init functions that scan the ROM region in SEV-SNP guests,
+ * as this memory is not pre-validated and would thus cause a crash.
+ */
+ if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) {
+ x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.pci.init_irq = x86_init_noop;
+ x86_init.resources.probe_roms = x86_init_noop;
+
+ /*
+ * DMI setup behavior for SEV-SNP guests depends on
+ * efi_enabled(EFI_CONFIG_TABLES), which hasn't been
+ * parsed yet. snp_dmi_setup() will run after that
+ * parsing has happened.
+ */
+ x86_init.resources.dmi_setup = snp_dmi_setup;
+ }
}
void __init mem_encrypt_free_decrypted_mem(void)
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index d73aeb164..cc47a818a 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -41,9 +41,9 @@
#include <linux/mem_encrypt.h>
#include <linux/cc_platform.h>
+#include <asm/init.h>
#include <asm/setup.h>
#include <asm/sections.h>
-#include <asm/cmdline.h>
#include <asm/coco.h>
#include <asm/sev.h>
@@ -95,11 +95,7 @@ struct sme_populate_pgd_data {
*/
static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch");
-static char sme_cmdline_arg[] __initdata = "mem_encrypt";
-static char sme_cmdline_on[] __initdata = "on";
-static char sme_cmdline_off[] __initdata = "off";
-
-static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
+static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd)
{
unsigned long pgd_start, pgd_end, pgd_size;
pgd_t *pgd_p;
@@ -114,7 +110,7 @@ static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
memset(pgd_p, 0, pgd_size);
}
-static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
+static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -145,13 +141,13 @@ static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
set_pud(pud, __pud(PUD_FLAGS | __pa(pmd)));
}
- if (pud_large(*pud))
+ if (pud_leaf(*pud))
return NULL;
return pud;
}
-static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
+static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
{
pud_t *pud;
pmd_t *pmd;
@@ -167,7 +163,7 @@ static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
}
-static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
+static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd)
{
pud_t *pud;
pmd_t *pmd;
@@ -193,7 +189,7 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
}
-static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
+static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
{
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd_large(ppd);
@@ -203,7 +199,7 @@ static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
}
}
-static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
+static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
{
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd(ppd);
@@ -213,7 +209,7 @@ static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
}
}
-static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
+static void __head __sme_map_range(struct sme_populate_pgd_data *ppd,
pmdval_t pmd_flags, pteval_t pte_flags)
{
unsigned long vaddr_end;
@@ -237,22 +233,22 @@ static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
__sme_map_range_pte(ppd);
}
-static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
}
-static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
}
-static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
}
-static unsigned long __init sme_pgtable_calc(unsigned long len)
+static unsigned long __head sme_pgtable_calc(unsigned long len)
{
unsigned long entries = 0, tables = 0;
@@ -289,7 +285,7 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
return entries + tables;
}
-void __init sme_encrypt_kernel(struct boot_params *bp)
+void __head sme_encrypt_kernel(struct boot_params *bp)
{
unsigned long workarea_start, workarea_end, workarea_len;
unsigned long execute_start, execute_end, execute_len;
@@ -305,7 +301,8 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* instrumentation or checking boot_cpu_data in the cc_platform_has()
* function.
*/
- if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED)
+ if (!sme_get_me_mask() ||
+ RIP_REL_REF(sev_status) & MSR_AMD64_SEV_ENABLED)
return;
/*
@@ -323,9 +320,8 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* memory from being cached.
*/
- /* Physical addresses gives us the identity mapped virtual addresses */
- kernel_start = __pa_symbol(_text);
- kernel_end = ALIGN(__pa_symbol(_end), PMD_SIZE);
+ kernel_start = (unsigned long)RIP_REL_REF(_text);
+ kernel_end = ALIGN((unsigned long)RIP_REL_REF(_end), PMD_SIZE);
kernel_len = kernel_end - kernel_start;
initrd_start = 0;
@@ -343,14 +339,6 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
#endif
/*
- * We're running identity mapped, so we must obtain the address to the
- * SME encryption workarea using rip-relative addressing.
- */
- asm ("lea sme_workarea(%%rip), %0"
- : "=r" (workarea_start)
- : "p" (sme_workarea));
-
- /*
* Calculate required number of workarea bytes needed:
* executable encryption area size:
* stack page (PAGE_SIZE)
@@ -359,7 +347,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* pagetable structures for the encryption of the kernel
* pagetable structures for workarea (in case not currently mapped)
*/
- execute_start = workarea_start;
+ execute_start = workarea_start = (unsigned long)RIP_REL_REF(sme_workarea);
execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE;
execute_len = execute_end - execute_start;
@@ -502,14 +490,11 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
native_write_cr3(__native_read_cr3());
}
-void __init sme_enable(struct boot_params *bp)
+void __head sme_enable(struct boot_params *bp)
{
- const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
unsigned int eax, ebx, ecx, edx;
unsigned long feature_mask;
- bool active_by_default;
unsigned long me_mask;
- char buffer[16];
bool snp;
u64 msr;
@@ -543,15 +528,18 @@ void __init sme_enable(struct boot_params *bp)
me_mask = 1UL << (ebx & 0x3f);
/* Check the SEV MSR whether SEV or SME is enabled */
- sev_status = __rdmsr(MSR_AMD64_SEV);
- feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
+ RIP_REL_REF(sev_status) = msr = __rdmsr(MSR_AMD64_SEV);
+ feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
/* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */
- if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ if (snp && !(msr & MSR_AMD64_SEV_SNP_ENABLED))
snp_abort();
/* Check if memory encryption is enabled */
if (feature_mask == AMD_SME_BIT) {
+ if (!(bp->hdr.xloadflags & XLF_MEM_ENCRYPTION))
+ return;
+
/*
* No SME if Hypervisor bit is set. This check is here to
* prevent a guest from trying to enable SME. For running as a
@@ -571,48 +559,10 @@ void __init sme_enable(struct boot_params *bp)
msr = __rdmsr(MSR_AMD64_SYSCFG);
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
- } else {
- /* SEV state cannot be controlled by a command line option */
- sme_me_mask = me_mask;
- goto out;
}
- /*
- * Fixups have not been applied to phys_base yet and we're running
- * identity mapped, so we must obtain the address to the SME command
- * line argument data using rip-relative addressing.
- */
- asm ("lea sme_cmdline_arg(%%rip), %0"
- : "=r" (cmdline_arg)
- : "p" (sme_cmdline_arg));
- asm ("lea sme_cmdline_on(%%rip), %0"
- : "=r" (cmdline_on)
- : "p" (sme_cmdline_on));
- asm ("lea sme_cmdline_off(%%rip), %0"
- : "=r" (cmdline_off)
- : "p" (sme_cmdline_off));
-
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
- active_by_default = true;
- else
- active_by_default = false;
-
- cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
- ((u64)bp->ext_cmd_line_ptr << 32));
-
- if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0)
- return;
-
- if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
- sme_me_mask = me_mask;
- else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
- sme_me_mask = 0;
- else
- sme_me_mask = active_by_default ? me_mask : 0;
-out:
- if (sme_me_mask) {
- physical_mask &= ~sme_me_mask;
- cc_vendor = CC_VENDOR_AMD;
- cc_set_mask(sme_me_mask);
- }
+ RIP_REL_REF(sme_me_mask) = me_mask;
+ physical_mask &= ~me_mask;
+ cc_vendor = CC_VENDOR_AMD;
+ cc_set_mask(me_mask);
}
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index aa39d678f..ce84ba86e 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -3,6 +3,7 @@
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
@@ -57,7 +58,7 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] = {
int numa_cpu_node(int cpu)
{
- int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+ u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
if (apicid != BAD_APICID)
return __apicid_to_node[apicid];
@@ -449,37 +450,6 @@ int __node_distance(int from, int to)
EXPORT_SYMBOL(__node_distance);
/*
- * Sanity check to catch more bad NUMA configurations (they are amazingly
- * common). Make sure the nodes cover all memory.
- */
-static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
-{
- u64 numaram, e820ram;
- int i;
-
- numaram = 0;
- for (i = 0; i < mi->nr_blks; i++) {
- u64 s = mi->blk[i].start >> PAGE_SHIFT;
- u64 e = mi->blk[i].end >> PAGE_SHIFT;
- numaram += e - s;
- numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
- if ((s64)numaram < 0)
- numaram = 0;
- }
-
- e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
-
- /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
- if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
- printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
- (numaram << PAGE_SHIFT) >> 20,
- (e820ram << PAGE_SHIFT) >> 20);
- return false;
- }
- return true;
-}
-
-/*
* Mark all currently memblock-reserved physical memory (which covers the
* kernel's own memory ranges) as hot-unswappable.
*/
@@ -584,7 +554,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
return -EINVAL;
}
}
- if (!numa_meminfo_cover_memory(mi))
+
+ if (!memblock_validate_numa_coverage(SZ_1M))
return -EINVAL;
/* Finally register nodes. */
@@ -727,6 +698,8 @@ void __init x86_numa_init(void)
if (!numa_init(amd_numa_init))
return;
#endif
+ if (acpi_disabled && !numa_init(of_numa_init))
+ return;
}
numa_init(dummy_numa_init);
@@ -780,7 +753,7 @@ void __init init_gi_nodes(void)
void __init init_cpu_to_node(void)
{
int cpu;
- u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+ u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
BUG_ON(cpu_to_apicid == NULL);
@@ -956,12 +929,14 @@ int memory_add_physaddr_to_nid(u64 start)
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
static int __init cmp_memblk(const void *a, const void *b)
{
const struct numa_memblk *ma = *(const struct numa_memblk **)a;
const struct numa_memblk *mb = *(const struct numa_memblk **)b;
- return ma->start - mb->start;
+ return (ma->start > mb->start) - (ma->start < mb->start);
}
static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
@@ -971,14 +946,12 @@ static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
* @start: address to begin fill
* @end: address to end fill
*
- * Find and extend numa_meminfo memblks to cover the @start-@end
- * physical address range, such that the first memblk includes
- * @start, the last memblk includes @end, and any gaps in between
- * are filled.
+ * Find and extend numa_meminfo memblks to cover the physical
+ * address range @start-@end
*
* RETURNS:
* 0 : Success
- * NUMA_NO_MEMBLK : No memblk exists in @start-@end range
+ * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
*/
int __init numa_fill_memblks(u64 start, u64 end)
@@ -990,17 +963,14 @@ int __init numa_fill_memblks(u64 start, u64 end)
/*
* Create a list of pointers to numa_meminfo memblks that
- * overlap start, end. Exclude (start == bi->end) since
- * end addresses in both a CFMWS range and a memblk range
- * are exclusive.
- *
- * This list of pointers is used to make in-place changes
- * that fill out the numa_meminfo memblks.
+ * overlap start, end. The list is used to make in-place
+ * changes that fill out the numa_meminfo memblks.
*/
for (int i = 0; i < mi->nr_blks; i++) {
struct numa_memblk *bi = &mi->blk[i];
- if (start < bi->end && end >= bi->start) {
+ if (memblock_addrs_overlap(start, end - start, bi->start,
+ bi->end - bi->start)) {
blk[count] = &mi->blk[i];
count++;
}
@@ -1033,5 +1003,3 @@ int __init numa_fill_memblks(u64 start, u64 end)
}
return 0;
}
-
-#endif
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index de10800cd..37d51cfd7 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -14,7 +14,7 @@
* memory ranges: uncached, write-combining, write-through, write-protected,
* and the most commonly used and default attribute: write-back caching.
*
- * PAT support supercedes and augments MTRR support in a compatible fashion: MTRR is
+ * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is
* a hardware interface to enumerate a limited number of physical memory ranges
* and set their caching attributes explicitly, programmed into the CPU via MSRs.
* Even modern CPUs have MTRRs enabled - but these are typically not touched
@@ -950,6 +950,38 @@ static void free_pfn_range(u64 paddr, unsigned long size)
memtype_free(paddr, paddr + size);
}
+static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
+ pgprot_t *pgprot)
+{
+ unsigned long prot;
+
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT));
+
+ /*
+ * We need the starting PFN and cachemode used for track_pfn_remap()
+ * that covered the whole VMA. For most mappings, we can obtain that
+ * information from the page tables. For COW mappings, we might now
+ * suddenly have anon folios mapped and follow_phys() will fail.
+ *
+ * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to
+ * detect the PFN. If we need the cachemode as well, we're out of luck
+ * for now and have to fail fork().
+ */
+ if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) {
+ if (pgprot)
+ *pgprot = __pgprot(prot);
+ return 0;
+ }
+ if (is_cow_mapping(vma->vm_flags)) {
+ if (pgprot)
+ return -EINVAL;
+ *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
+ return 0;
+ }
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+}
+
/*
* track_pfn_copy is called when vma that is covering the pfnmap gets
* copied through copy_page_range().
@@ -960,20 +992,13 @@ static void free_pfn_range(u64 paddr, unsigned long size)
int track_pfn_copy(struct vm_area_struct *vma)
{
resource_size_t paddr;
- unsigned long prot;
unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
if (vma->vm_flags & VM_PAT) {
- /*
- * reserve the whole chunk covered by vma. We need the
- * starting address and protection from pte.
- */
- if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
- WARN_ON_ONCE(1);
+ if (get_pat_info(vma, &paddr, &pgprot))
return -EINVAL;
- }
- pgprot = __pgprot(prot);
+ /* reserve the whole chunk covered by vma. */
return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
}
@@ -1048,7 +1073,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size, bool mm_wr_locked)
{
resource_size_t paddr;
- unsigned long prot;
if (vma && !(vma->vm_flags & VM_PAT))
return;
@@ -1056,11 +1080,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
/* free the chunk starting from pfn or the whole chunk */
paddr = (resource_size_t)pfn << PAGE_SHIFT;
if (!paddr && !size) {
- if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
- WARN_ON_ONCE(1);
+ if (get_pat_info(vma, &paddr, NULL))
return;
- }
-
size = vma->vm_end - vma->vm_start;
}
free_pfn_range(paddr, size);
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index bda9f1298..b4073fb45 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -619,7 +619,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
* Validate strict W^X semantics.
*/
static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
- unsigned long pfn, unsigned long npg)
+ unsigned long pfn, unsigned long npg,
+ bool nx, bool rw)
{
unsigned long end;
@@ -641,6 +642,10 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
return new;
+ /* Non-leaf translation entries can disable writing or execution. */
+ if (!rw || nx)
+ return new;
+
end = start + npg * PAGE_SIZE - 1;
WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
(unsigned long long)pgprot_val(old),
@@ -657,20 +662,26 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
/*
* Lookup the page table entry for a virtual address in a specific pgd.
- * Return a pointer to the entry and the level of the mapping.
+ * Return a pointer to the entry, the level of the mapping, and the effective
+ * NX and RW bits of all page table levels.
*/
-pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
- unsigned int *level)
+pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
+ unsigned int *level, bool *nx, bool *rw)
{
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
*level = PG_LEVEL_NONE;
+ *nx = false;
+ *rw = true;
if (pgd_none(*pgd))
return NULL;
+ *nx |= pgd_flags(*pgd) & _PAGE_NX;
+ *rw &= pgd_flags(*pgd) & _PAGE_RW;
+
p4d = p4d_offset(pgd, address);
if (p4d_none(*p4d))
return NULL;
@@ -679,14 +690,20 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
if (p4d_large(*p4d) || !p4d_present(*p4d))
return (pte_t *)p4d;
+ *nx |= p4d_flags(*p4d) & _PAGE_NX;
+ *rw &= p4d_flags(*p4d) & _PAGE_RW;
+
pud = pud_offset(p4d, address);
if (pud_none(*pud))
return NULL;
*level = PG_LEVEL_1G;
- if (pud_large(*pud) || !pud_present(*pud))
+ if (pud_leaf(*pud) || !pud_present(*pud))
return (pte_t *)pud;
+ *nx |= pud_flags(*pud) & _PAGE_NX;
+ *rw &= pud_flags(*pud) & _PAGE_RW;
+
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
return NULL;
@@ -695,12 +712,27 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
if (pmd_large(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
+ *nx |= pmd_flags(*pmd) & _PAGE_NX;
+ *rw &= pmd_flags(*pmd) & _PAGE_RW;
+
*level = PG_LEVEL_4K;
return pte_offset_kernel(pmd, address);
}
/*
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level)
+{
+ bool nx, rw;
+
+ return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw);
+}
+
+/*
* Lookup the page table entry for a virtual address. Return a pointer
* to the entry and the level of the mapping.
*
@@ -715,13 +747,16 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
EXPORT_SYMBOL_GPL(lookup_address);
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
- unsigned int *level)
+ unsigned int *level, bool *nx, bool *rw)
{
- if (cpa->pgd)
- return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
- address, level);
+ pgd_t *pgd;
+
+ if (!cpa->pgd)
+ pgd = pgd_offset_k(address);
+ else
+ pgd = cpa->pgd + pgd_index(address);
- return lookup_address(address, level);
+ return lookup_address_in_pgd_attr(pgd, address, level, nx, rw);
}
/*
@@ -743,7 +778,7 @@ pmd_t *lookup_pmd_address(unsigned long address)
return NULL;
pud = pud_offset(p4d, address);
- if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
+ if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud))
return NULL;
return pmd_offset(pud, address);
@@ -755,10 +790,14 @@ pmd_t *lookup_pmd_address(unsigned long address)
* areas on 32-bit NUMA systems. The percpu areas can
* end up in this kind of memory, for instance.
*
- * This could be optimized, but it is only intended to be
- * used at initialization time, and keeping it
- * unoptimized should increase the testing coverage for
- * the more obscure platforms.
+ * Note that as long as the PTEs are well-formed with correct PFNs, this
+ * works without checking the PRESENT bit in the leaf PTE. This is unlike
+ * the similar vmalloc_to_page() and derivatives. Callers may depend on
+ * this behavior.
+ *
+ * This could be optimized, but it is only used in paths that are not perf
+ * sensitive, and keeping it unoptimized should increase the testing coverage
+ * for the more obscure platforms.
*/
phys_addr_t slow_virt_to_phys(void *__virt_addr)
{
@@ -845,12 +884,13 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
pgprot_t old_prot, new_prot, req_prot, chk_prot;
pte_t new_pte, *tmp;
enum pg_level level;
+ bool nx, rw;
/*
* Check for races, another CPU might have split this page
* up already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte)
return 1;
@@ -961,7 +1001,8 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
psize, CPA_DETECT);
- new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages);
+ new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages,
+ nx, rw);
/*
* If there is a conflict, split the large page.
@@ -1042,6 +1083,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
pte_t *pbase = (pte_t *)page_address(base);
unsigned int i, level;
pgprot_t ref_prot;
+ bool nx, rw;
pte_t *tmp;
spin_lock(&pgd_lock);
@@ -1049,7 +1091,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* Check for races, another CPU might have split this page
* up for us already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte) {
spin_unlock(&pgd_lock);
return 1;
@@ -1274,7 +1316,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
*/
while (end - start >= PUD_SIZE) {
- if (pud_large(*pud))
+ if (pud_leaf(*pud))
pud_clear(pud);
else
unmap_pmd_range(pud, start, start + PUD_SIZE);
@@ -1590,10 +1632,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
int do_split, err;
unsigned int level;
pte_t *kpte, old_pte;
+ bool nx, rw;
address = __cpa_addr(cpa, cpa->curpage);
repeat:
- kpte = _lookup_address_cpa(cpa, address, &level);
+ kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (!kpte)
return __cpa_process_fault(cpa, address, primary);
@@ -1615,13 +1658,14 @@ repeat:
new_prot = static_protections(new_prot, address, pfn, 1, 0,
CPA_PROTECT);
- new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1);
+ new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1,
+ nx, rw);
new_prot = pgprot_clear_protnone_bits(new_prot);
/*
* We need to keep the pfn from the existing PTE,
- * after all we're only going to change it's attributes
+ * after all we're only going to change its attributes
* not the memory it points to
*/
new_pte = pfn_pte(pfn, new_prot);
@@ -2041,17 +2085,12 @@ int set_mce_nospec(unsigned long pfn)
return rc;
}
-static int set_memory_p(unsigned long *addr, int numpages)
-{
- return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0);
-}
-
/* Restore full speculative operation to the pfn. */
int clear_mce_nospec(unsigned long pfn)
{
unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
- return set_memory_p(&addr, 1);
+ return set_memory_p(addr, 1);
}
EXPORT_SYMBOL_GPL(clear_mce_nospec);
#endif /* CONFIG_X86_64 */
@@ -2104,6 +2143,11 @@ int set_memory_np_noalias(unsigned long addr, int numpages)
CPA_NO_CHECK_ALIAS, NULL);
}
+int set_memory_p(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
+}
+
int set_memory_4k(unsigned long addr, int numpages)
{
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
@@ -2447,7 +2491,7 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
/*
* The typical sequence for unmapping is to find a pte through
* lookup_address_in_pgd() (ideally, it should never return NULL because
- * the address is already mapped) and change it's protections. As pfn is
+ * the address is already mapped) and change its protections. As pfn is
* the *target* of a mapping, it's not useful while unmapping.
*/
struct cpa_data cpa = {
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 9deadf517..a67bb8f98 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -76,6 +76,9 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
+ struct ptdesc *ptdesc = virt_to_ptdesc(pud);
+
+ pagetable_pud_dtor(ptdesc);
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
paravirt_tlb_remove_table(tlb, virt_to_page(pud));
}
@@ -774,7 +777,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
*/
int pud_clear_huge(pud_t *pud)
{
- if (pud_large(*pud)) {
+ if (pud_leaf(*pud)) {
pud_clear(pud);
return 1;
}
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 78414c6d1..912b1da7e 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -6,7 +6,7 @@
*
* https://github.com/IAIK/KAISER
*
- * The original work was written by and and signed off by for the Linux
+ * The original work was written by and signed off by for the Linux
* kernel by:
*
* Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
@@ -69,6 +69,7 @@ static void __init pti_print_if_secure(const char *reason)
pr_info("%s\n", reason);
}
+/* Assume mode is auto unless overridden via cmdline below. */
static enum pti_mode {
PTI_AUTO = 0,
PTI_FORCE_OFF,
@@ -77,50 +78,49 @@ static enum pti_mode {
void __init pti_check_boottime_disable(void)
{
- char arg[5];
- int ret;
-
- /* Assume mode is auto unless overridden. */
- pti_mode = PTI_AUTO;
-
if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
pti_mode = PTI_FORCE_OFF;
pti_print_if_insecure("disabled on XEN PV.");
return;
}
- ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
- if (ret > 0) {
- if (ret == 3 && !strncmp(arg, "off", 3)) {
- pti_mode = PTI_FORCE_OFF;
- pti_print_if_insecure("disabled on command line.");
- return;
- }
- if (ret == 2 && !strncmp(arg, "on", 2)) {
- pti_mode = PTI_FORCE_ON;
- pti_print_if_secure("force enabled on command line.");
- goto enable;
- }
- if (ret == 4 && !strncmp(arg, "auto", 4)) {
- pti_mode = PTI_AUTO;
- goto autosel;
- }
- }
-
- if (cmdline_find_option_bool(boot_command_line, "nopti") ||
- cpu_mitigations_off()) {
+ if (cpu_mitigations_off())
pti_mode = PTI_FORCE_OFF;
+ if (pti_mode == PTI_FORCE_OFF) {
pti_print_if_insecure("disabled on command line.");
return;
}
-autosel:
- if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ if (pti_mode == PTI_FORCE_ON)
+ pti_print_if_secure("force enabled on command line.");
+
+ if (pti_mode == PTI_AUTO && !boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
return;
-enable:
+
setup_force_cpu_cap(X86_FEATURE_PTI);
}
+static int __init pti_parse_cmdline(char *arg)
+{
+ if (!strcmp(arg, "off"))
+ pti_mode = PTI_FORCE_OFF;
+ else if (!strcmp(arg, "on"))
+ pti_mode = PTI_FORCE_ON;
+ else if (!strcmp(arg, "auto"))
+ pti_mode = PTI_AUTO;
+ else
+ return -EINVAL;
+ return 0;
+}
+early_param("pti", pti_parse_cmdline);
+
+static int __init pti_parse_cmdline_nopti(char *arg)
+{
+ pti_mode = PTI_FORCE_OFF;
+ return 0;
+}
+early_param("nopti", pti_parse_cmdline_nopti);
+
pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
/*
@@ -217,7 +217,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
pud = pud_offset(p4d, address);
/* The user page tables do not use large mappings: */
- if (pud_large(*pud)) {
+ if (pud_leaf(*pud)) {
WARN_ON(1);
return NULL;
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 453ea95b6..5768d386e 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -355,7 +355,7 @@ static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm,
/*
* Validate that it is not running on an SMT sibling as this would
- * make the excercise pointless because the siblings share L1D. If
+ * make the exercise pointless because the siblings share L1D. If
* it runs on a SMT sibling, notify it with SIGBUS on return to
* user/guest
*/
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 955133077..f415c2cf5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -16,6 +16,10 @@
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>
+#include <asm/unwind.h>
+#include <asm/cfi.h>
+
+static bool all_callee_regs_used[4] = {true, true, true, true};
static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{
@@ -48,9 +52,11 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
#ifdef CONFIG_X86_KERNEL_IBT
-#define EMIT_ENDBR() EMIT(gen_endbr(), 4)
+#define EMIT_ENDBR() EMIT(gen_endbr(), 4)
+#define EMIT_ENDBR_POISON() EMIT(gen_endbr_poison(), 4)
#else
#define EMIT_ENDBR()
+#define EMIT_ENDBR_POISON()
#endif
static bool is_imm8(int value)
@@ -255,6 +261,14 @@ struct jit_context {
/* Number of bytes that will be skipped on tailcall */
#define X86_TAIL_CALL_OFFSET (11 + ENDBR_INSN_SIZE)
+static void push_r12(u8 **pprog)
+{
+ u8 *prog = *pprog;
+
+ EMIT2(0x41, 0x54); /* push r12 */
+ *pprog = prog;
+}
+
static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
{
u8 *prog = *pprog;
@@ -270,6 +284,14 @@ static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
*pprog = prog;
}
+static void pop_r12(u8 **pprog)
+{
+ u8 *prog = *pprog;
+
+ EMIT2(0x41, 0x5C); /* pop r12 */
+ *pprog = prog;
+}
+
static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
{
u8 *prog = *pprog;
@@ -285,30 +307,130 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
*pprog = prog;
}
+static void emit_nops(u8 **pprog, int len)
+{
+ u8 *prog = *pprog;
+ int i, noplen;
+
+ while (len > 0) {
+ noplen = len;
+
+ if (noplen > ASM_NOP_MAX)
+ noplen = ASM_NOP_MAX;
+
+ for (i = 0; i < noplen; i++)
+ EMIT1(x86_nops[noplen][i]);
+ len -= noplen;
+ }
+
+ *pprog = prog;
+}
+
+/*
+ * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT
+ * in arch/x86/kernel/alternative.c
+ */
+
+static void emit_fineibt(u8 **pprog, u32 hash)
+{
+ u8 *prog = *pprog;
+
+ EMIT_ENDBR();
+ EMIT3_off32(0x41, 0x81, 0xea, hash); /* subl $hash, %r10d */
+ EMIT2(0x74, 0x07); /* jz.d8 +7 */
+ EMIT2(0x0f, 0x0b); /* ud2 */
+ EMIT1(0x90); /* nop */
+ EMIT_ENDBR_POISON();
+
+ *pprog = prog;
+}
+
+static void emit_kcfi(u8 **pprog, u32 hash)
+{
+ u8 *prog = *pprog;
+
+ EMIT1_off32(0xb8, hash); /* movl $hash, %eax */
+#ifdef CONFIG_CALL_PADDING
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+#endif
+ EMIT_ENDBR();
+
+ *pprog = prog;
+}
+
+static void emit_cfi(u8 **pprog, u32 hash)
+{
+ u8 *prog = *pprog;
+
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ emit_fineibt(&prog, hash);
+ break;
+
+ case CFI_KCFI:
+ emit_kcfi(&prog, hash);
+ break;
+
+ default:
+ EMIT_ENDBR();
+ break;
+ }
+
+ *pprog = prog;
+}
+
/*
* Emit x86-64 prologue code for BPF program.
* bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
* while jumping to another program
*/
static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
- bool tail_call_reachable, bool is_subprog)
+ bool tail_call_reachable, bool is_subprog,
+ bool is_exception_cb)
{
u8 *prog = *pprog;
+ emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash);
/* BPF trampoline can be made to work without these nops,
* but let's waste 5 bytes for now and optimize later
*/
- EMIT_ENDBR();
- memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
- prog += X86_PATCH_SIZE;
+ emit_nops(&prog, X86_PATCH_SIZE);
if (!ebpf_from_cbpf) {
if (tail_call_reachable && !is_subprog)
+ /* When it's the entry of the whole tailcall context,
+ * zeroing rax means initialising tail_call_cnt.
+ */
EMIT2(0x31, 0xC0); /* xor eax, eax */
else
+ /* Keep the same instruction layout. */
EMIT2(0x66, 0x90); /* nop2 */
}
- EMIT1(0x55); /* push rbp */
- EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+ /* Exception callback receives FP as third parameter */
+ if (is_exception_cb) {
+ EMIT3(0x48, 0x89, 0xF4); /* mov rsp, rsi */
+ EMIT3(0x48, 0x89, 0xD5); /* mov rbp, rdx */
+ /* The main frame must have exception_boundary as true, so we
+ * first restore those callee-saved regs from stack, before
+ * reusing the stack frame.
+ */
+ pop_callee_regs(&prog, all_callee_regs_used);
+ pop_r12(&prog);
+ /* Reset the stack frame. */
+ EMIT3(0x48, 0x89, 0xEC); /* mov rsp, rbp */
+ } else {
+ EMIT1(0x55); /* push rbp */
+ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+ }
/* X86_TAIL_CALL_OFFSET is here */
EMIT_ENDBR();
@@ -344,7 +466,7 @@ static int emit_call(u8 **pprog, void *func, void *ip)
static int emit_rsb_call(u8 **pprog, void *func, void *ip)
{
OPTIMIZER_HIDE_VAR(func);
- x86_call_depth_emit_accounting(pprog, func);
+ ip += x86_call_depth_emit_accounting(pprog, func);
return emit_patch(pprog, func, ip, 0xE8);
}
@@ -467,7 +589,8 @@ static void emit_return(u8 **pprog, u8 *ip)
* goto *(prog->bpf_func + prologue_size);
* out:
*/
-static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
+static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
+ u8 **pprog, bool *callee_regs_used,
u32 stack_depth, u8 *ip,
struct jit_context *ctx)
{
@@ -517,7 +640,12 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
offset = ctx->tail_call_indirect_label - (prog + 2 - start);
EMIT2(X86_JE, offset); /* je out */
- pop_callee_regs(&prog, callee_regs_used);
+ if (bpf_prog->aux->exception_boundary) {
+ pop_callee_regs(&prog, all_callee_regs_used);
+ pop_r12(&prog);
+ } else {
+ pop_callee_regs(&prog, callee_regs_used);
+ }
EMIT1(0x58); /* pop rax */
if (stack_depth)
@@ -541,7 +669,8 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
*pprog = prog;
}
-static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
+static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
+ struct bpf_jit_poke_descriptor *poke,
u8 **pprog, u8 *ip,
bool *callee_regs_used, u32 stack_depth,
struct jit_context *ctx)
@@ -570,13 +699,18 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE,
poke->tailcall_bypass);
- pop_callee_regs(&prog, callee_regs_used);
+ if (bpf_prog->aux->exception_boundary) {
+ pop_callee_regs(&prog, all_callee_regs_used);
+ pop_r12(&prog);
+ } else {
+ pop_callee_regs(&prog, callee_regs_used);
+ }
+
EMIT1(0x58); /* pop rax */
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
- memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
- prog += X86_PATCH_SIZE;
+ emit_nops(&prog, X86_PATCH_SIZE);
/* out: */
ctx->tail_call_direct_label = prog - start;
@@ -938,25 +1072,6 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
}
}
-static void emit_nops(u8 **pprog, int len)
-{
- u8 *prog = *pprog;
- int i, noplen;
-
- while (len > 0) {
- noplen = len;
-
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
-
- for (i = 0; i < noplen; i++)
- EMIT1(x86_nops[noplen][i]);
- len -= noplen;
- }
-
- *pprog = prog;
-}
-
/* emit the 3-byte VEX prefix
*
* r: same as rex.r, extra bit for ModRM reg field
@@ -1045,8 +1160,20 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
emit_prologue(&prog, bpf_prog->aux->stack_depth,
bpf_prog_was_classic(bpf_prog), tail_call_reachable,
- bpf_prog->aux->func_idx != 0);
- push_callee_regs(&prog, callee_regs_used);
+ bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
+ /* Exception callback will clobber callee regs for its own use, and
+ * restore the original callee regs from main prog's stack frame.
+ */
+ if (bpf_prog->aux->exception_boundary) {
+ /* We also need to save r12, which is not mapped to any BPF
+ * register, as we throw after entry into the kernel, which may
+ * overwrite r12.
+ */
+ push_r12(&prog);
+ push_callee_regs(&prog, all_callee_regs_used);
+ } else {
+ push_callee_regs(&prog, callee_regs_used);
+ }
ilen = prog - temp;
if (rw_image)
@@ -1458,36 +1585,41 @@ st: if (is_imm8(insn->off))
if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
/* Conservatively check that src_reg + insn->off is a kernel address:
- * src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE
- * src_reg is used as scratch for src_reg += insn->off and restored
- * after emit_ldx if necessary
+ * src_reg + insn->off > TASK_SIZE_MAX + PAGE_SIZE
+ * and
+ * src_reg + insn->off < VSYSCALL_ADDR
*/
- u64 limit = TASK_SIZE_MAX + PAGE_SIZE;
+ u64 limit = TASK_SIZE_MAX + PAGE_SIZE - VSYSCALL_ADDR;
u8 *end_of_jmp;
- /* At end of these emitted checks, insn->off will have been added
- * to src_reg, so no need to do relative load with insn->off offset
- */
- insn_off = 0;
+ /* movabsq r10, VSYSCALL_ADDR */
+ emit_mov_imm64(&prog, BPF_REG_AX, (long)VSYSCALL_ADDR >> 32,
+ (u32)(long)VSYSCALL_ADDR);
- /* movabsq r11, limit */
- EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
- EMIT((u32)limit, 4);
- EMIT(limit >> 32, 4);
+ /* mov src_reg, r11 */
+ EMIT_mov(AUX_REG, src_reg);
if (insn->off) {
- /* add src_reg, insn->off */
- maybe_emit_1mod(&prog, src_reg, true);
- EMIT2_off32(0x81, add_1reg(0xC0, src_reg), insn->off);
+ /* add r11, insn->off */
+ maybe_emit_1mod(&prog, AUX_REG, true);
+ EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
}
- /* cmp src_reg, r11 */
- maybe_emit_mod(&prog, src_reg, AUX_REG, true);
- EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
+ /* sub r11, r10 */
+ maybe_emit_mod(&prog, AUX_REG, BPF_REG_AX, true);
+ EMIT2(0x29, add_2reg(0xC0, AUX_REG, BPF_REG_AX));
+
+ /* movabsq r10, limit */
+ emit_mov_imm64(&prog, BPF_REG_AX, (long)limit >> 32,
+ (u32)(long)limit);
- /* if unsigned '>=', goto load */
- EMIT2(X86_JAE, 0);
+ /* cmp r10, r11 */
+ maybe_emit_mod(&prog, AUX_REG, BPF_REG_AX, true);
+ EMIT2(0x39, add_2reg(0xC0, AUX_REG, BPF_REG_AX));
+
+ /* if unsigned '>', goto load */
+ EMIT2(X86_JA, 0);
end_of_jmp = prog;
/* xor dst_reg, dst_reg */
@@ -1513,18 +1645,6 @@ st: if (is_imm8(insn->off))
/* populate jmp_offset for JMP above */
start_of_ldx[-1] = prog - start_of_ldx;
- if (insn->off && src_reg != dst_reg) {
- /* sub src_reg, insn->off
- * Restore src_reg after "add src_reg, insn->off" in prev
- * if statement. But if src_reg == dst_reg, emit_ldx
- * above already clobbered src_reg, so no need to restore.
- * If add src_reg, insn->off was unnecessary, no need to
- * restore either.
- */
- maybe_emit_1mod(&prog, src_reg, true);
- EMIT2_off32(0x81, add_1reg(0xE8, src_reg), insn->off);
- }
-
if (!bpf_prog->aux->extable)
break;
@@ -1643,13 +1763,15 @@ st: if (is_imm8(insn->off))
case BPF_JMP | BPF_TAIL_CALL:
if (imm32)
- emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
+ emit_bpf_tail_call_direct(bpf_prog,
+ &bpf_prog->aux->poke_tab[imm32 - 1],
&prog, image + addrs[i - 1],
callee_regs_used,
bpf_prog->aux->stack_depth,
ctx);
else
- emit_bpf_tail_call_indirect(&prog,
+ emit_bpf_tail_call_indirect(bpf_prog,
+ &prog,
callee_regs_used,
bpf_prog->aux->stack_depth,
image + addrs[i - 1],
@@ -1902,7 +2024,12 @@ emit_jmp:
seen_exit = true;
/* Update cleanup_addr */
ctx->cleanup_addr = proglen;
- pop_callee_regs(&prog, callee_regs_used);
+ if (bpf_prog->aux->exception_boundary) {
+ pop_callee_regs(&prog, all_callee_regs_used);
+ pop_r12(&prog);
+ } else {
+ pop_callee_regs(&prog, callee_regs_used);
+ }
EMIT1(0xC9); /* leave */
emit_return(&prog, image + addrs[i - 1] + (prog - temp));
break;
@@ -2073,7 +2200,7 @@ static void save_args(const struct btf_func_model *m, u8 **prog,
} else {
/* Only copy the arguments on-stack to current
* 'stack_size' and ignore the regs, used to
- * prepare the arguments on-stack for orign call.
+ * prepare the arguments on-stack for origin call.
*/
if (for_call_origin) {
nr_regs += arg_regs;
@@ -2128,7 +2255,8 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog,
static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_link *l, int stack_size,
- int run_ctx_off, bool save_ret)
+ int run_ctx_off, bool save_ret,
+ void *image, void *rw_image)
{
u8 *prog = *pprog;
u8 *jmp_insn;
@@ -2156,7 +2284,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
else
EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
- if (emit_rsb_call(&prog, bpf_trampoline_enter(p), prog))
+ if (emit_rsb_call(&prog, bpf_trampoline_enter(p), image + (prog - (u8 *)rw_image)))
return -EINVAL;
/* remember prog start time returned by __bpf_prog_enter */
emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
@@ -2180,7 +2308,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
(long) p->insnsi >> 32,
(u32) (long) p->insnsi);
/* call JITed bpf program or interpreter */
- if (emit_rsb_call(&prog, p->bpf_func, prog))
+ if (emit_rsb_call(&prog, p->bpf_func, image + (prog - (u8 *)rw_image)))
return -EINVAL;
/*
@@ -2207,7 +2335,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
EMIT3_off32(0x48, 0x8D, 0x95, -run_ctx_off);
else
EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
- if (emit_rsb_call(&prog, bpf_trampoline_exit(p), prog))
+ if (emit_rsb_call(&prog, bpf_trampoline_exit(p), image + (prog - (u8 *)rw_image)))
return -EINVAL;
*pprog = prog;
@@ -2242,14 +2370,15 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_links *tl, int stack_size,
- int run_ctx_off, bool save_ret)
+ int run_ctx_off, bool save_ret,
+ void *image, void *rw_image)
{
int i;
u8 *prog = *pprog;
for (i = 0; i < tl->nr_links; i++) {
if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size,
- run_ctx_off, save_ret))
+ run_ctx_off, save_ret, image, rw_image))
return -EINVAL;
}
*pprog = prog;
@@ -2258,7 +2387,8 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_links *tl, int stack_size,
- int run_ctx_off, u8 **branches)
+ int run_ctx_off, u8 **branches,
+ void *image, void *rw_image)
{
u8 *prog = *pprog;
int i;
@@ -2269,7 +2399,8 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
emit_mov_imm32(&prog, false, BPF_REG_0, 0);
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
for (i = 0; i < tl->nr_links; i++) {
- if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true))
+ if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true,
+ image, rw_image))
return -EINVAL;
/* mod_ret prog stored return value into [rbp - 8]. Emit:
@@ -2352,10 +2483,11 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
* add rsp, 8 // skip eth_type_trans's frame
* ret // return to its caller
*/
-int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
- const struct btf_func_model *m, u32 flags,
- struct bpf_tramp_links *tlinks,
- void *func_addr)
+static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image,
+ void *rw_image_end, void *image,
+ const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks,
+ void *func_addr)
{
int i, ret, nr_regs = m->nr_args, stack_size = 0;
int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
@@ -2367,10 +2499,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
u8 *prog;
bool save_ret;
+ /*
+ * F_INDIRECT is only compatible with F_RET_FENTRY_RET, it is
+ * explicitly incompatible with F_CALL_ORIG | F_SKIP_FRAME | F_IP_ARG
+ * because @func_addr.
+ */
+ WARN_ON_ONCE((flags & BPF_TRAMP_F_INDIRECT) &&
+ (flags & ~(BPF_TRAMP_F_INDIRECT | BPF_TRAMP_F_RET_FENTRY_RET)));
+
/* extra registers for struct arguments */
- for (i = 0; i < m->nr_args; i++)
+ for (i = 0; i < m->nr_args; i++) {
if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
nr_regs += (m->arg_size[i] + 7) / 8 - 1;
+ }
/* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6
* are passed through regs, the remains are through stack.
@@ -2451,22 +2592,29 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
orig_call += X86_PATCH_SIZE;
}
- prog = image;
+ prog = rw_image;
- EMIT_ENDBR();
- /*
- * This is the direct-call trampoline, as such it needs accounting
- * for the __fentry__ call.
- */
- x86_call_depth_emit_accounting(&prog, NULL);
+ if (flags & BPF_TRAMP_F_INDIRECT) {
+ /*
+ * Indirect call for bpf_struct_ops
+ */
+ emit_cfi(&prog, cfi_get_func_hash(func_addr));
+ } else {
+ /*
+ * Direct-call fentry stub, as such it needs accounting for the
+ * __fentry__ call.
+ */
+ x86_call_depth_emit_accounting(&prog, NULL);
+ }
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
- if (!is_imm8(stack_size))
+ if (!is_imm8(stack_size)) {
/* sub rsp, stack_size */
EMIT3_off32(0x48, 0x81, 0xEC, stack_size);
- else
+ } else {
/* sub rsp, stack_size */
EMIT4(0x48, 0x83, 0xEC, stack_size);
+ }
if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
EMIT1(0x50); /* push rax */
/* mov QWORD PTR [rbp - rbx_off], rbx */
@@ -2493,16 +2641,18 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
- if (emit_rsb_call(&prog, __bpf_tramp_enter, prog)) {
+ if (emit_rsb_call(&prog, __bpf_tramp_enter,
+ image + (prog - (u8 *)rw_image))) {
ret = -EINVAL;
goto cleanup;
}
}
- if (fentry->nr_links)
+ if (fentry->nr_links) {
if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off,
- flags & BPF_TRAMP_F_RET_FENTRY_RET))
+ flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image))
return -EINVAL;
+ }
if (fmod_ret->nr_links) {
branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *),
@@ -2511,7 +2661,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
return -ENOMEM;
if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off,
- run_ctx_off, branches)) {
+ run_ctx_off, branches, image, rw_image)) {
ret = -EINVAL;
goto cleanup;
}
@@ -2521,27 +2671,27 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
restore_regs(m, &prog, regs_off);
save_args(m, &prog, arg_stack_off, true);
- if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+ if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
/* Before calling the original function, restore the
* tail_call_cnt from stack to rax.
*/
RESTORE_TAIL_CALL_CNT(stack_size);
+ }
if (flags & BPF_TRAMP_F_ORIG_STACK) {
emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8);
EMIT2(0xff, 0xd3); /* call *rbx */
} else {
/* call original function */
- if (emit_rsb_call(&prog, orig_call, prog)) {
+ if (emit_rsb_call(&prog, orig_call, image + (prog - (u8 *)rw_image))) {
ret = -EINVAL;
goto cleanup;
}
}
/* remember return value in a stack for bpf prog to access */
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
- im->ip_after_call = prog;
- memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
- prog += X86_PATCH_SIZE;
+ im->ip_after_call = image + (prog - (u8 *)rw_image);
+ emit_nops(&prog, X86_PATCH_SIZE);
}
if (fmod_ret->nr_links) {
@@ -2554,16 +2704,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
/* Update the branches saved in invoke_bpf_mod_ret with the
* aligned address of do_fexit.
*/
- for (i = 0; i < fmod_ret->nr_links; i++)
- emit_cond_near_jump(&branches[i], prog, branches[i],
- X86_JNE);
+ for (i = 0; i < fmod_ret->nr_links; i++) {
+ emit_cond_near_jump(&branches[i], image + (prog - (u8 *)rw_image),
+ image + (branches[i] - (u8 *)rw_image), X86_JNE);
+ }
}
- if (fexit->nr_links)
- if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, false)) {
+ if (fexit->nr_links) {
+ if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off,
+ false, image, rw_image)) {
ret = -EINVAL;
goto cleanup;
}
+ }
if (flags & BPF_TRAMP_F_RESTORE_REGS)
restore_regs(m, &prog, regs_off);
@@ -2573,18 +2726,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
* restored to R0.
*/
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- im->ip_epilogue = prog;
+ im->ip_epilogue = image + (prog - (u8 *)rw_image);
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
- if (emit_rsb_call(&prog, __bpf_tramp_exit, prog)) {
+ if (emit_rsb_call(&prog, __bpf_tramp_exit, image + (prog - (u8 *)rw_image))) {
ret = -EINVAL;
goto cleanup;
}
- } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+ } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
/* Before running the original function, restore the
* tail_call_cnt from stack to rax.
*/
RESTORE_TAIL_CALL_CNT(stack_size);
+ }
/* restore return value of orig_call or fentry prog back into RAX */
if (save_ret)
@@ -2592,22 +2746,94 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off);
EMIT1(0xC9); /* leave */
- if (flags & BPF_TRAMP_F_SKIP_FRAME)
+ if (flags & BPF_TRAMP_F_SKIP_FRAME) {
/* skip our return address and return to parent */
EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
- emit_return(&prog, prog);
+ }
+ emit_return(&prog, image + (prog - (u8 *)rw_image));
/* Make sure the trampoline generation logic doesn't overflow */
- if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) {
+ if (WARN_ON_ONCE(prog > (u8 *)rw_image_end - BPF_INSN_SAFETY)) {
ret = -EFAULT;
goto cleanup;
}
- ret = prog - (u8 *)image;
+ ret = prog - (u8 *)rw_image + BPF_INSN_SAFETY;
cleanup:
kfree(branches);
return ret;
}
+void *arch_alloc_bpf_trampoline(unsigned int size)
+{
+ return bpf_prog_pack_alloc(size, jit_fill_hole);
+}
+
+void arch_free_bpf_trampoline(void *image, unsigned int size)
+{
+ bpf_prog_pack_free(image, size);
+}
+
+void arch_protect_bpf_trampoline(void *image, unsigned int size)
+{
+}
+
+void arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+{
+}
+
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
+ const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks,
+ void *func_addr)
+{
+ void *rw_image, *tmp;
+ int ret;
+ u32 size = image_end - image;
+
+ /* rw_image doesn't need to be in module memory range, so we can
+ * use kvmalloc.
+ */
+ rw_image = kvmalloc(size, GFP_KERNEL);
+ if (!rw_image)
+ return -ENOMEM;
+
+ ret = __arch_prepare_bpf_trampoline(im, rw_image, rw_image + size, image, m,
+ flags, tlinks, func_addr);
+ if (ret < 0)
+ goto out;
+
+ tmp = bpf_arch_text_copy(image, rw_image, size);
+ if (IS_ERR(tmp))
+ ret = PTR_ERR(tmp);
+out:
+ kvfree(rw_image);
+ return ret;
+}
+
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks, void *func_addr)
+{
+ struct bpf_tramp_image im;
+ void *image;
+ int ret;
+
+ /* Allocate a temporary buffer for __arch_prepare_bpf_trampoline().
+ * This will NOT cause fragmentation in direct map, as we do not
+ * call set_memory_*() on this buffer.
+ *
+ * We cannot use kvmalloc here, because we need image to be in
+ * module memory range.
+ */
+ image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!image)
+ return -ENOMEM;
+
+ ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, image,
+ m, flags, tlinks, func_addr);
+ bpf_jit_free_exec(image);
+ return ret;
+}
+
static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
{
u8 *jg_reloc, *prog = *pprog;
@@ -2865,9 +3091,16 @@ out_image:
jit_data->header = header;
jit_data->rw_header = rw_header;
}
- prog->bpf_func = (void *)image;
+ /*
+ * ctx.prog_offset is used when CFI preambles put code *before*
+ * the function. See emit_cfi(). For FineIBT specifically this code
+ * can also be executed and bpf_prog_kallsyms_add() will
+ * generate an additional symbol to cover this, hence also
+ * decrement proglen.
+ */
+ prog->bpf_func = (void *)image + cfi_get_offset();
prog->jited = 1;
- prog->jited_len = proglen;
+ prog->jited_len = proglen - cfi_get_offset();
} else {
prog = orig_prog;
}
@@ -2922,6 +3155,7 @@ void bpf_jit_free(struct bpf_prog *prog)
kvfree(jit_data->addrs);
kfree(jit_data);
}
+ prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
hdr = bpf_jit_binary_pack_hdr(prog);
bpf_jit_binary_pack_free(hdr, NULL);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
@@ -2930,6 +3164,32 @@ void bpf_jit_free(struct bpf_prog *prog)
bpf_prog_unlock_free(prog);
}
+bool bpf_jit_supports_exceptions(void)
+{
+ /* We unwind through both kernel frames (starting from within bpf_throw
+ * call) and BPF frames. Therefore we require ORC unwinder to be enabled
+ * to walk kernel frames and reach BPF frames in the stack trace.
+ */
+ return IS_ENABLED(CONFIG_UNWINDER_ORC);
+}
+
+void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+#if defined(CONFIG_UNWINDER_ORC)
+ struct unwind_state state;
+ unsigned long addr;
+
+ for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr || !consume_fn(cookie, (u64)addr, (u64)state.sp, (u64)state.bp))
+ break;
+ }
+ return;
+#endif
+ WARN(1, "verification of programs using bpf_throw should have failed\n");
+}
+
void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
struct bpf_prog *new, struct bpf_prog *old)
{
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 429a89c54..b18ce1998 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -1194,7 +1194,7 @@ struct jit_context {
#define PROLOGUE_SIZE 35
/*
- * Emit prologue code for BPF program and check it's size.
+ * Emit prologue code for BPF program and check its size.
* bpf_tail_call helper will skip it while jumping into another program.
*/
static void emit_prologue(u8 **pprog, u32 stack_depth)
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index ea2eb2ec9..55c4b07ec 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -283,6 +283,9 @@ static int setup_mcfg_map(struct acpi_pci_root_info *ci)
info->mcfg_added = false;
seg = info->sd.domain;
+ dev_dbg(dev, "%s(%04x %pR ECAM %pa)\n", __func__, seg,
+ &root->secondary, &root->mcfg_addr);
+
/* return success if MMCFG is not in use */
if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg)
return 0;
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index f347c2024..b33afb240 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -907,6 +907,54 @@ static void chromeos_fixup_apl_pci_l1ss_capability(struct pci_dev *dev)
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x5ad6, chromeos_save_apl_pci_l1ss_capability);
DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_INTEL, 0x5ad6, chromeos_fixup_apl_pci_l1ss_capability);
+/*
+ * Disable D3cold on Asus B1400 PCI-NVMe bridge
+ *
+ * On this platform with VMD off, the NVMe device cannot successfully power
+ * back on from D3cold. This appears to be an untested transition by the
+ * vendor: Windows leaves the NVMe and parent bridge in D0 during suspend.
+ *
+ * We disable D3cold on the parent bridge for simplicity, and the fact that
+ * both parent bridge and NVMe device share the same power resource.
+ *
+ * This is only needed on BIOS versions before 308; the newer versions flip
+ * StorageD3Enable from 1 to 0.
+ */
+static const struct dmi_system_id asus_nvme_broken_d3cold_table[] = {
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_BIOS_VERSION, "B1400CEAE.304"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_BIOS_VERSION, "B1400CEAE.305"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_BIOS_VERSION, "B1400CEAE.306"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_BIOS_VERSION, "B1400CEAE.307"),
+ },
+ },
+ {}
+};
+
+static void asus_disable_nvme_d3cold(struct pci_dev *pdev)
+{
+ if (dmi_check_system(asus_nvme_broken_d3cold_table) > 0)
+ pci_d3cold_disable(pdev);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x9a09, asus_disable_nvme_d3cold);
+
#ifdef CONFIG_SUSPEND
/*
* Root Ports on some AMD SoCs advertise PME_Support for D3hot and D3cold, but
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index e9497ee0f..0cc952066 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * mmconfig-shared.c - Low-level direct PCI config space access via
- * MMCONFIG - common code between i386 and x86-64.
+ * Low-level direct PCI config space access via ECAM - common code between
+ * i386 and x86-64.
*
* This code does:
* - known chipset handling
@@ -11,6 +11,8 @@
* themselves.
*/
+#define pr_fmt(fmt) "PCI: " fmt
+
#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/pci.h>
@@ -24,9 +26,7 @@
#include <asm/pci_x86.h>
#include <asm/acpi.h>
-#define PREFIX "PCI: "
-
-/* Indicate if the mmcfg resources have been placed into the resource table. */
+/* Indicate if the ECAM resources have been placed into the resource table */
static bool pci_mmcfg_running_state;
static bool pci_mmcfg_arch_init_failed;
static DEFINE_MUTEX(pci_mmcfg_lock);
@@ -90,7 +90,7 @@ static struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, int start,
res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
- "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
+ "PCI ECAM %04x [bus %02x-%02x]", segment, start, end);
res->name = new->name;
return new;
@@ -102,16 +102,15 @@ struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start,
struct pci_mmcfg_region *new;
new = pci_mmconfig_alloc(segment, start, end, addr);
- if (new) {
- mutex_lock(&pci_mmcfg_lock);
- list_add_sorted(new);
- mutex_unlock(&pci_mmcfg_lock);
+ if (!new)
+ return NULL;
- pr_info(PREFIX
- "MMCONFIG for domain %04x [bus %02x-%02x] at %pR "
- "(base %#lx)\n",
- segment, start, end, &new->res, (unsigned long)addr);
- }
+ mutex_lock(&pci_mmcfg_lock);
+ list_add_sorted(new);
+ mutex_unlock(&pci_mmcfg_lock);
+
+ pr_info("ECAM %pR (base %#lx) for domain %04x [bus %02x-%02x]\n",
+ &new->res, (unsigned long)addr, segment, start, end);
return new;
}
@@ -205,7 +204,7 @@ static const char *__init pci_mmcfg_amd_fam10h(void)
msr <<= 32;
msr |= low;
- /* mmconfig is not enable */
+ /* ECAM is not enabled */
if (!(msr & FAM10H_MMIO_CONF_ENABLE))
return NULL;
@@ -367,7 +366,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
name = pci_mmcfg_probes[i].probe();
if (name)
- pr_info(PREFIX "%s with MMCONFIG support\n", name);
+ pr_info("%s with ECAM support\n", name);
}
/* some end_bus_number is crazy, fix it */
@@ -443,9 +442,11 @@ static bool is_acpi_reserved(u64 start, u64 end, enum e820_type not_used)
return mcfg_res.flags;
}
-static bool is_efi_mmio(u64 start, u64 end, enum e820_type not_used)
+static bool is_efi_mmio(struct resource *res)
{
#ifdef CONFIG_EFI
+ u64 start = res->start;
+ u64 end = res->start + resource_size(res);
efi_memory_desc_t *md;
u64 size, mmio_start, mmio_end;
@@ -455,11 +456,6 @@ static bool is_efi_mmio(u64 start, u64 end, enum e820_type not_used)
mmio_start = md->phys_addr;
mmio_end = mmio_start + size;
- /*
- * N.B. Caller supplies (start, start + size),
- * so to match, mmio_end is the first address
- * *past* the EFI_MEMORY_MAPPED_IO area.
- */
if (mmio_start <= start && end <= mmio_end)
return true;
}
@@ -490,11 +486,10 @@ static bool __ref is_mmconf_reserved(check_reserved_t is_reserved,
return false;
if (dev)
- dev_info(dev, "MMCONFIG at %pR reserved as %s\n",
+ dev_info(dev, "ECAM %pR reserved as %s\n",
&cfg->res, method);
else
- pr_info(PREFIX "MMCONFIG at %pR reserved as %s\n",
- &cfg->res, method);
+ pr_info("ECAM %pR reserved as %s\n", &cfg->res, method);
if (old_size != size) {
/* update end_bus */
@@ -503,27 +498,23 @@ static bool __ref is_mmconf_reserved(check_reserved_t is_reserved,
cfg->res.end = cfg->res.start +
PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN,
- "PCI MMCONFIG %04x [bus %02x-%02x]",
+ "PCI ECAM %04x [bus %02x-%02x]",
cfg->segment, cfg->start_bus, cfg->end_bus);
if (dev)
- dev_info(dev,
- "MMCONFIG "
- "at %pR (base %#lx) (size reduced!)\n",
- &cfg->res, (unsigned long) cfg->address);
+ dev_info(dev, "ECAM %pR (base %#lx) (size reduced!)\n",
+ &cfg->res, (unsigned long) cfg->address);
else
- pr_info(PREFIX
- "MMCONFIG for %04x [bus%02x-%02x] "
- "at %pR (base %#lx) (size reduced!)\n",
- cfg->segment, cfg->start_bus, cfg->end_bus,
- &cfg->res, (unsigned long) cfg->address);
+ pr_info("ECAM %pR (base %#lx) for %04x [bus%02x-%02x] (size reduced!)\n",
+ &cfg->res, (unsigned long) cfg->address,
+ cfg->segment, cfg->start_bus, cfg->end_bus);
}
return true;
}
-static bool __ref
-pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int early)
+static bool __ref pci_mmcfg_reserved(struct device *dev,
+ struct pci_mmcfg_region *cfg, int early)
{
struct resource *conflict;
@@ -533,25 +524,22 @@ pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int e
return true;
if (dev)
- dev_info(dev, FW_INFO
- "MMCONFIG at %pR not reserved in "
- "ACPI motherboard resources\n",
+ dev_info(dev, FW_INFO "ECAM %pR not reserved in ACPI motherboard resources\n",
&cfg->res);
else
- pr_info(FW_INFO PREFIX
- "MMCONFIG at %pR not reserved in "
- "ACPI motherboard resources\n",
- &cfg->res);
+ pr_info(FW_INFO "ECAM %pR not reserved in ACPI motherboard resources\n",
+ &cfg->res);
- if (is_mmconf_reserved(is_efi_mmio, cfg, dev,
- "EfiMemoryMappedIO")) {
+ if (is_efi_mmio(&cfg->res)) {
+ pr_info("ECAM %pR is EfiMemoryMappedIO; assuming valid\n",
+ &cfg->res);
conflict = insert_resource_conflict(&iomem_resource,
&cfg->res);
if (conflict)
- pr_warn("MMCONFIG %pR conflicts with %s %pR\n",
+ pr_warn("ECAM %pR conflicts with %s %pR\n",
&cfg->res, conflict->name, conflict);
else
- pr_info("MMCONFIG %pR reserved to work around lack of ACPI motherboard _CRS\n",
+ pr_info("ECAM %pR reserved to work around lack of ACPI motherboard _CRS\n",
&cfg->res);
return true;
}
@@ -580,30 +568,31 @@ static void __init pci_mmcfg_reject_broken(int early)
struct pci_mmcfg_region *cfg;
list_for_each_entry(cfg, &pci_mmcfg_list, list) {
- if (pci_mmcfg_check_reserved(NULL, cfg, early) == 0) {
- pr_info(PREFIX "not using MMCONFIG\n");
+ if (!pci_mmcfg_reserved(NULL, cfg, early)) {
+ pr_info("not using ECAM (%pR not reserved)\n",
+ &cfg->res);
free_all_mmcfg();
return;
}
}
}
-static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
- struct acpi_mcfg_allocation *cfg)
+static bool __init acpi_mcfg_valid_entry(struct acpi_table_mcfg *mcfg,
+ struct acpi_mcfg_allocation *cfg)
{
if (cfg->address < 0xFFFFFFFF)
- return 0;
+ return true;
if (!strncmp(mcfg->header.oem_id, "SGI", 3))
- return 0;
+ return true;
if ((mcfg->header.revision >= 1) && (dmi_get_bios_year() >= 2010))
- return 0;
+ return true;
- pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
- "is above 4GB, ignored\n", cfg->pci_segment,
- cfg->start_bus_number, cfg->end_bus_number, cfg->address);
- return -EINVAL;
+ pr_err("ECAM at %#llx for %04x [bus %02x-%02x] is above 4GB, ignored\n",
+ cfg->address, cfg->pci_segment, cfg->start_bus_number,
+ cfg->end_bus_number);
+ return false;
}
static int __init pci_parse_mcfg(struct acpi_table_header *header)
@@ -627,21 +616,21 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
i -= sizeof(struct acpi_mcfg_allocation);
}
if (entries == 0) {
- pr_err(PREFIX "MMCONFIG has no entries\n");
+ pr_err("MCFG has no entries\n");
return -ENODEV;
}
cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1];
for (i = 0; i < entries; i++) {
cfg = &cfg_table[i];
- if (acpi_mcfg_check_entry(mcfg, cfg)) {
+ if (!acpi_mcfg_valid_entry(mcfg, cfg)) {
free_all_mmcfg();
return -ENODEV;
}
if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
cfg->end_bus_number, cfg->address) == NULL) {
- pr_warn(PREFIX "no memory for MCFG entries\n");
+ pr_warn("no memory for MCFG entries\n");
free_all_mmcfg();
return -ENOMEM;
}
@@ -678,6 +667,8 @@ static int pci_mmcfg_for_each_region(int (*func)(__u64 start, __u64 size,
static void __init __pci_mmcfg_init(int early)
{
+ pr_debug("%s(%s)\n", __func__, early ? "early" : "late");
+
pci_mmcfg_reject_broken(early);
if (list_empty(&pci_mmcfg_list))
return;
@@ -704,6 +695,8 @@ static int __initdata known_bridge;
void __init pci_mmcfg_early_init(void)
{
+ pr_debug("%s() pci_probe %#x\n", __func__, pci_probe);
+
if (pci_probe & PCI_PROBE_MMCONF) {
if (pci_mmcfg_check_hostbridge())
known_bridge = 1;
@@ -717,14 +710,16 @@ void __init pci_mmcfg_early_init(void)
void __init pci_mmcfg_late_init(void)
{
- /* MMCONFIG disabled */
+ pr_debug("%s() pci_probe %#x\n", __func__, pci_probe);
+
+ /* ECAM disabled */
if ((pci_probe & PCI_PROBE_MMCONF) == 0)
return;
if (known_bridge)
return;
- /* MMCONFIG hasn't been enabled yet, try again */
+ /* ECAM hasn't been enabled yet, try again */
if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) {
acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
__pci_mmcfg_init(0);
@@ -737,7 +732,9 @@ static int __init pci_mmcfg_late_insert_resources(void)
pci_mmcfg_running_state = true;
- /* If we are not using MMCONFIG, don't insert the resources. */
+ pr_debug("%s() pci_probe %#x\n", __func__, pci_probe);
+
+ /* If we are not using ECAM, don't insert the resources. */
if ((pci_probe & PCI_PROBE_MMCONF) == 0)
return 1;
@@ -746,21 +743,24 @@ static int __init pci_mmcfg_late_insert_resources(void)
* marked so it won't cause request errors when __request_region is
* called.
*/
- list_for_each_entry(cfg, &pci_mmcfg_list, list)
- if (!cfg->res.parent)
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+ if (!cfg->res.parent) {
+ pr_debug("%s() insert %pR\n", __func__, &cfg->res);
insert_resource(&iomem_resource, &cfg->res);
+ }
+ }
return 0;
}
/*
- * Perform MMCONFIG resource insertion after PCI initialization to allow for
+ * Perform ECAM resource insertion after PCI initialization to allow for
* misprogrammed MCFG tables that state larger sizes but actually conflict
* with other system resources.
*/
late_initcall(pci_mmcfg_late_insert_resources);
-/* Add MMCFG information for host bridges */
+/* Add ECAM information for host bridges */
int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
phys_addr_t addr)
{
@@ -768,6 +768,8 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
struct resource *tmp = NULL;
struct pci_mmcfg_region *cfg;
+ dev_dbg(dev, "%s(%04x [bus %02x-%02x])\n", __func__, seg, start, end);
+
if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed)
return -ENODEV;
@@ -778,15 +780,17 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
cfg = pci_mmconfig_lookup(seg, start);
if (cfg) {
if (cfg->end_bus < end)
- dev_info(dev, FW_INFO
- "MMCONFIG for "
- "domain %04x [bus %02x-%02x] "
- "only partially covers this bridge\n",
- cfg->segment, cfg->start_bus, cfg->end_bus);
+ dev_info(dev, FW_INFO "ECAM %pR for domain %04x [bus %02x-%02x] only partially covers this bridge\n",
+ &cfg->res, cfg->segment, cfg->start_bus,
+ cfg->end_bus);
mutex_unlock(&pci_mmcfg_lock);
return -EEXIST;
}
+ /*
+ * Don't move earlier; we must return -EEXIST, not -EINVAL, if
+ * pci_mmconfig_lookup() finds something
+ */
if (!addr) {
mutex_unlock(&pci_mmcfg_lock);
return -EINVAL;
@@ -795,10 +799,10 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
rc = -EBUSY;
cfg = pci_mmconfig_alloc(seg, start, end, addr);
if (cfg == NULL) {
- dev_warn(dev, "fail to add MMCONFIG (out of memory)\n");
+ dev_warn(dev, "fail to add ECAM (out of memory)\n");
rc = -ENOMEM;
- } else if (!pci_mmcfg_check_reserved(dev, cfg, 0)) {
- dev_warn(dev, FW_BUG "MMCONFIG %pR isn't reserved\n",
+ } else if (!pci_mmcfg_reserved(dev, cfg, 0)) {
+ dev_warn(dev, FW_BUG "ECAM %pR isn't reserved\n",
&cfg->res);
} else {
/* Insert resource if it's not in boot stage */
@@ -807,16 +811,13 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
&cfg->res);
if (tmp) {
- dev_warn(dev,
- "MMCONFIG %pR conflicts with "
- "%s %pR\n",
+ dev_warn(dev, "ECAM %pR conflicts with %s %pR\n",
&cfg->res, tmp->name, tmp);
} else if (pci_mmcfg_arch_map(cfg)) {
- dev_warn(dev, "fail to map MMCONFIG %pR.\n",
- &cfg->res);
+ dev_warn(dev, "fail to map ECAM %pR\n", &cfg->res);
} else {
list_add_sorted(cfg);
- dev_info(dev, "MMCONFIG at %pR (base %#lx)\n",
+ dev_info(dev, "ECAM %pR (base %#lx)\n",
&cfg->res, (unsigned long)addr);
cfg = NULL;
rc = 0;
@@ -834,7 +835,7 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
return rc;
}
-/* Delete MMCFG information for host bridges */
+/* Delete ECAM information for host bridges */
int pci_mmconfig_delete(u16 seg, u8 start, u8 end)
{
struct pci_mmcfg_region *cfg;
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index bfa789875..f9ef97c59 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -131,7 +131,7 @@ const struct pci_raw_ops pci_mmcfg = {
int __init pci_mmcfg_arch_init(void)
{
- printk(KERN_INFO "PCI: Using MMCONFIG for extended config space\n");
+ printk(KERN_INFO "PCI: Using ECAM for extended config space\n");
raw_pci_ext_ops = &pci_mmcfg;
return 1;
}
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 0c7b6e66c..cb5aa79a7 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -6,6 +6,8 @@
* space mapped. This allows lockless config space operation.
*/
+#define pr_fmt(fmt) "PCI: " fmt
+
#include <linux/pci.h>
#include <linux/init.h>
#include <linux/acpi.h>
@@ -14,8 +16,6 @@
#include <asm/e820/api.h>
#include <asm/pci_x86.h>
-#define PREFIX "PCI: "
-
static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
{
struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
@@ -111,6 +111,25 @@ static void __iomem *mcfg_ioremap(struct pci_mmcfg_region *cfg)
return addr;
}
+int pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg)
+{
+ cfg->virt = mcfg_ioremap(cfg);
+ if (!cfg->virt) {
+ pr_err("can't map ECAM at %pR\n", &cfg->res);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg)
+{
+ if (cfg && cfg->virt) {
+ iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus));
+ cfg->virt = NULL;
+ }
+}
+
int __init pci_mmcfg_arch_init(void)
{
struct pci_mmcfg_region *cfg;
@@ -133,22 +152,3 @@ void __init pci_mmcfg_arch_free(void)
list_for_each_entry(cfg, &pci_mmcfg_list, list)
pci_mmcfg_arch_unmap(cfg);
}
-
-int pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg)
-{
- cfg->virt = mcfg_ioremap(cfg);
- if (!cfg->virt) {
- pr_err(PREFIX "can't map MMCONFIG at %pR\n", &cfg->res);
- return -ENOMEM;
- }
-
- return 0;
-}
-
-void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg)
-{
- if (cfg && cfg->virt) {
- iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus));
- cfg->virt = NULL;
- }
-}
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 4f1528073..244c643bb 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -3,6 +3,8 @@
* BIOS32 and PCI BIOS handling.
*/
+#include <linux/bits.h>
+#include <linux/bitfield.h>
#include <linux/pci.h>
#include <linux/init.h>
#include <linux/slab.h>
@@ -29,8 +31,19 @@
#define PCIBIOS_HW_TYPE1_SPEC 0x10
#define PCIBIOS_HW_TYPE2_SPEC 0x20
+/*
+ * Returned in EAX:
+ * - AH: return code
+ */
+#define PCIBIOS_RETURN_CODE GENMASK(15, 8)
+
int pcibios_enabled;
+static u8 pcibios_get_return_code(u32 eax)
+{
+ return FIELD_GET(PCIBIOS_RETURN_CODE, eax);
+}
+
/* According to the BIOS specification at:
* http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could
* restrict the x zone to some pages and make it ro. But this may be
@@ -154,7 +167,7 @@ static int __init check_pcibios(void)
: "memory");
local_irq_restore(flags);
- status = (eax >> 8) & 0xff;
+ status = pcibios_get_return_code(eax);
hw_mech = eax & 0xff;
major_ver = (ebx >> 8) & 0xff;
minor_ver = ebx & 0xff;
@@ -227,7 +240,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
raw_spin_unlock_irqrestore(&pci_config_lock, flags);
- return (int)((result & 0xff00) >> 8);
+ return pcibios_get_return_code(result);
}
static int pci_bios_write(unsigned int seg, unsigned int bus,
@@ -269,7 +282,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
raw_spin_unlock_irqrestore(&pci_config_lock, flags);
- return (int)((result & 0xff00) >> 8);
+ return pcibios_get_return_code(result);
}
@@ -385,9 +398,10 @@ struct irq_routing_table * pcibios_get_irq_routing_table(void)
"m" (opt)
: "memory");
DBG("OK ret=%d, size=%d, map=%x\n", ret, opt.size, map);
- if (ret & 0xff00)
- printk(KERN_ERR "PCI: Error %02x when fetching IRQ routing table.\n", (ret >> 8) & 0xff);
- else if (opt.size) {
+ ret = pcibios_get_return_code(ret);
+ if (ret) {
+ printk(KERN_ERR "PCI: Error %02x when fetching IRQ routing table.\n", ret);
+ } else if (opt.size) {
rt = kmalloc(sizeof(struct irq_routing_table) + opt.size, GFP_KERNEL);
if (rt) {
memset(rt, 0, sizeof(struct irq_routing_table));
@@ -415,7 +429,7 @@ int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq)
"b" ((dev->bus->number << 8) | dev->devfn),
"c" ((irq << 8) | (pin + 10)),
"S" (&pci_indirect));
- return !(ret & 0xff00);
+ return pcibios_get_return_code(ret) == PCIBIOS_SUCCESSFUL;
}
EXPORT_SYMBOL(pcibios_set_irq_routing);
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 7368afc03..8c8ddc4dc 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -14,6 +14,7 @@
#include <linux/dma-map-ops.h>
#include <linux/swiotlb.h>
#include <asm/iommu.h>
+#include <asm/sta2x11.h>
#define STA2X11_SWIOTLB_SIZE (4*1024*1024)
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
index 761f3689f..84ba715f4 100644
--- a/arch/x86/platform/intel-quark/imr_selftest.c
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -6,7 +6,7 @@
* Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
*
* IMR self test. The purpose of this module is to run a set of tests on the
- * IMR API to validate it's sanity. We check for overlapping, reserved
+ * IMR API to validate its sanity. We check for overlapping, reserved
* addresses and setup/teardown sanity.
*
*/
diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c
index 00a92cb2c..a12117f3d 100644
--- a/arch/x86/platform/pvh/enlighten.c
+++ b/arch/x86/platform/pvh/enlighten.c
@@ -74,6 +74,9 @@ static void __init init_pvh_bootparams(bool xen_guest)
} else
xen_raw_printk("Warning: Can fit ISA range into e820\n");
+ if (xen_guest)
+ xen_reserve_extra_memory(&pvh_bootparams);
+
pvh_bootparams.hdr.cmd_line_ptr =
pvh_start_info.cmdline_paddr;
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index c4365a05a..f7235ef87 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -11,6 +11,7 @@
#include <linux/elfnote.h>
#include <linux/init.h>
#include <linux/linkage.h>
+#include <asm/desc_defs.h>
#include <asm/segment.h>
#include <asm/asm.h>
#include <asm/boot.h>
@@ -41,7 +42,7 @@
* Bit 8 (TF) must be cleared. Other bits are all unspecified.
*
* All other processor registers and flag bits are unspecified. The OS is in
- * charge of setting up it's own stack, GDT and IDT.
+ * charge of setting up its own stack, GDT and IDT.
*/
#define PVH_GDT_ENTRY_CS 1
@@ -148,11 +149,11 @@ SYM_DATA_END(gdt)
SYM_DATA_START_LOCAL(gdt_start)
.quad 0x0000000000000000 /* NULL descriptor */
#ifdef CONFIG_X86_64
- .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* PVH_CS_SEL */
+ .quad GDT_ENTRY(DESC_CODE64, 0, 0xfffff) /* PVH_CS_SEL */
#else
- .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* PVH_CS_SEL */
+ .quad GDT_ENTRY(DESC_CODE32, 0, 0xfffff) /* PVH_CS_SEL */
#endif
- .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* PVH_DS_SEL */
+ .quad GDT_ENTRY(DESC_DATA32, 0, 0xfffff) /* PVH_DS_SEL */
SYM_DATA_END_LABEL(gdt_start, SYM_L_LOCAL, gdt_end)
.balign 16
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 4221259a5..a379501b7 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -35,7 +35,7 @@ static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info)
mmr_value = 0;
entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
entry->vector = cfg->vector;
- entry->delivery_mode = apic->delivery_mode;
+ entry->delivery_mode = APIC_DELIVERY_MODE_FIXED;
entry->dest_mode = apic->dest_mode_logical;
entry->polarity = 0;
entry->trigger = 0;
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 45d0c17ce..5c50e550a 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -17,6 +17,7 @@
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/slab.h>
+#include <linux/string.h>
#include <linux/clocksource.h>
#include <asm/apic.h>
@@ -178,49 +179,56 @@ module_param_named(debug, uv_nmi_debug, int, 0644);
} while (0)
/* Valid NMI Actions */
-#define ACTION_LEN 16
-static struct nmi_action {
- char *action;
- char *desc;
-} valid_acts[] = {
- { "kdump", "do kernel crash dump" },
- { "dump", "dump process stack for each cpu" },
- { "ips", "dump Inst Ptr info for each cpu" },
- { "kdb", "enter KDB (needs kgdboc= assignment)" },
- { "kgdb", "enter KGDB (needs gdb target remote)" },
- { "health", "check if CPUs respond to NMI" },
+enum action_t {
+ nmi_act_kdump,
+ nmi_act_dump,
+ nmi_act_ips,
+ nmi_act_kdb,
+ nmi_act_kgdb,
+ nmi_act_health,
+ nmi_act_max
};
-typedef char action_t[ACTION_LEN];
-static action_t uv_nmi_action = { "dump" };
+
+static const char * const actions[nmi_act_max] = {
+ [nmi_act_kdump] = "kdump",
+ [nmi_act_dump] = "dump",
+ [nmi_act_ips] = "ips",
+ [nmi_act_kdb] = "kdb",
+ [nmi_act_kgdb] = "kgdb",
+ [nmi_act_health] = "health",
+};
+
+static const char * const actions_desc[nmi_act_max] = {
+ [nmi_act_kdump] = "do kernel crash dump",
+ [nmi_act_dump] = "dump process stack for each cpu",
+ [nmi_act_ips] = "dump Inst Ptr info for each cpu",
+ [nmi_act_kdb] = "enter KDB (needs kgdboc= assignment)",
+ [nmi_act_kgdb] = "enter KGDB (needs gdb target remote)",
+ [nmi_act_health] = "check if CPUs respond to NMI",
+};
+
+static enum action_t uv_nmi_action = nmi_act_dump;
static int param_get_action(char *buffer, const struct kernel_param *kp)
{
- return sprintf(buffer, "%s\n", uv_nmi_action);
+ return sprintf(buffer, "%s\n", actions[uv_nmi_action]);
}
static int param_set_action(const char *val, const struct kernel_param *kp)
{
- int i;
- int n = ARRAY_SIZE(valid_acts);
- char arg[ACTION_LEN];
-
- /* (remove possible '\n') */
- strscpy(arg, val, strnchrnul(val, sizeof(arg)-1, '\n') - val + 1);
-
- for (i = 0; i < n; i++)
- if (!strcmp(arg, valid_acts[i].action))
- break;
+ int i, n = ARRAY_SIZE(actions);
- if (i < n) {
- strscpy(uv_nmi_action, arg, sizeof(uv_nmi_action));
- pr_info("UV: New NMI action:%s\n", uv_nmi_action);
+ i = sysfs_match_string(actions, val);
+ if (i >= 0) {
+ uv_nmi_action = i;
+ pr_info("UV: New NMI action:%s\n", actions[i]);
return 0;
}
- pr_err("UV: Invalid NMI action:%s, valid actions are:\n", arg);
+ pr_err("UV: Invalid NMI action. Valid actions are:\n");
for (i = 0; i < n; i++)
- pr_err("UV: %-8s - %s\n",
- valid_acts[i].action, valid_acts[i].desc);
+ pr_err("UV: %-8s - %s\n", actions[i], actions_desc[i]);
+
return -EINVAL;
}
@@ -228,15 +236,10 @@ static const struct kernel_param_ops param_ops_action = {
.get = param_get_action,
.set = param_set_action,
};
-#define param_check_action(name, p) __param_check(name, p, action_t)
+#define param_check_action(name, p) __param_check(name, p, enum action_t)
module_param_named(action, uv_nmi_action, action, 0644);
-static inline bool uv_nmi_action_is(const char *action)
-{
- return (strncmp(uv_nmi_action, action, strlen(action)) == 0);
-}
-
/* Setup which NMI support is present in system */
static void uv_nmi_setup_mmrs(void)
{
@@ -727,10 +730,10 @@ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs)
if (cpu == 0)
uv_nmi_dump_cpu_ip_hdr();
- if (current->pid != 0 || !uv_nmi_action_is("ips"))
+ if (current->pid != 0 || uv_nmi_action != nmi_act_ips)
uv_nmi_dump_cpu_ip(cpu, regs);
- if (uv_nmi_action_is("dump")) {
+ if (uv_nmi_action == nmi_act_dump) {
pr_info("UV:%sNMI process trace for CPU %d\n", dots, cpu);
show_regs(regs);
}
@@ -738,7 +741,7 @@ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs)
this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE);
}
-/* Trigger a slave CPU to dump it's state */
+/* Trigger a slave CPU to dump its state */
static void uv_nmi_trigger_dump(int cpu)
{
int retry = uv_nmi_trigger_delay;
@@ -798,7 +801,7 @@ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master)
int saved_console_loglevel = console_loglevel;
pr_alert("UV: tracing %s for %d CPUs from CPU %d\n",
- uv_nmi_action_is("ips") ? "IPs" : "processes",
+ uv_nmi_action == nmi_act_ips ? "IPs" : "processes",
atomic_read(&uv_nmi_cpus_in_nmi), cpu);
console_loglevel = uv_nmi_loglevel;
@@ -874,7 +877,7 @@ static inline int uv_nmi_kdb_reason(void)
static inline int uv_nmi_kdb_reason(void)
{
/* Ensure user is expecting to attach gdb remote */
- if (uv_nmi_action_is("kgdb"))
+ if (uv_nmi_action == nmi_act_kgdb)
return 0;
pr_err("UV: NMI error: KDB is not enabled in this kernel\n");
@@ -950,28 +953,35 @@ static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
master = (atomic_read(&uv_nmi_cpu) == cpu);
/* If NMI action is "kdump", then attempt to do it */
- if (uv_nmi_action_is("kdump")) {
+ if (uv_nmi_action == nmi_act_kdump) {
uv_nmi_kdump(cpu, master, regs);
/* Unexpected return, revert action to "dump" */
if (master)
- strscpy(uv_nmi_action, "dump", sizeof(uv_nmi_action));
+ uv_nmi_action = nmi_act_dump;
}
/* Pause as all CPU's enter the NMI handler */
uv_nmi_wait(master);
/* Process actions other than "kdump": */
- if (uv_nmi_action_is("health")) {
+ switch (uv_nmi_action) {
+ case nmi_act_health:
uv_nmi_action_health(cpu, regs, master);
- } else if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump")) {
+ break;
+ case nmi_act_ips:
+ case nmi_act_dump:
uv_nmi_dump_state(cpu, regs, master);
- } else if (uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb")) {
+ break;
+ case nmi_act_kdb:
+ case nmi_act_kgdb:
uv_call_kgdb_kdb(cpu, regs, master);
- } else {
+ break;
+ default:
if (master)
- pr_alert("UV: unknown NMI action: %s\n", uv_nmi_action);
+ pr_alert("UV: unknown NMI action: %d\n", uv_nmi_action);
uv_nmi_sync_exit(master);
+ break;
}
/* Clear per_cpu "in_nmi" flag */
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 54663f3e0..3712afc35 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -53,7 +53,7 @@ struct uv_rtc_timer_head {
struct {
int lcpu; /* systemwide logical cpu number */
u64 expires; /* next timer expiration for this cpu */
- } cpu[];
+ } cpu[] __counted_by(ncpus);
};
/*
@@ -270,7 +270,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
* Read the RTC.
*
* Starting with HUB rev 2.0, the UV RTC register is replicated across all
- * cachelines of it's own page. This allows faster simultaneous reads
+ * cachelines of its own page. This allows faster simultaneous reads
* from a given socket.
*/
static u64 uv_read_rtc(struct clocksource *cs)
diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c
index 6f955eb1e..d8af46e67 100644
--- a/arch/x86/power/hibernate.c
+++ b/arch/x86/power/hibernate.c
@@ -170,7 +170,7 @@ int relocate_restore_code(void)
goto out;
}
pud = pud_offset(p4d, relocated_restore_code);
- if (pud_large(*pud)) {
+ if (pud_leaf(*pud)) {
set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
goto out;
}
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 08aa0f25f..8d1c82795 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -42,7 +42,8 @@ KCOV_INSTRUMENT := n
# make up the standalone purgatory.ro
PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
-PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss -g0
+PURGATORY_CFLAGS := -mcmodel=small -ffreestanding -fno-zero-initialized-in-bss -g0
+PURGATORY_CFLAGS += -fpic -fvisibility=hidden
PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING
PURGATORY_CFLAGS += -fno-stack-protector
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 788e55595..f9bc444a3 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -61,7 +61,7 @@ void __init reserve_real_mode(void)
set_real_mode_mem(mem);
/*
- * Unconditionally reserve the entire fisrt 1M, see comment in
+ * Unconditionally reserve the entire first 1M, see comment in
* setup_arch().
*/
memblock_reserve(0, SZ_1M);
diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S
index f10515b10..e714b4624 100644
--- a/arch/x86/realmode/rm/reboot.S
+++ b/arch/x86/realmode/rm/reboot.S
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
+#include <asm/desc_defs.h>
#include <asm/segment.h>
#include <asm/page_types.h>
#include <asm/processor-flags.h>
@@ -153,5 +154,5 @@ SYM_DATA_START(machine_real_restart_gdt)
* base value 0x100; since this is consistent with real mode
* semantics we don't have to reload the segments once CR0.PE = 0.
*/
- .quad GDT_ENTRY(0x0093, 0x100, 0xffff)
+ .quad GDT_ENTRY(DESC_DATA16, 0x100, 0xffff)
SYM_DATA_END(machine_real_restart_gdt)
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
index 90e820ac9..7278e2545 100644
--- a/arch/x86/tools/Makefile
+++ b/arch/x86/tools/Makefile
@@ -17,7 +17,7 @@ reformatter = $(srctree)/arch/x86/tools/objdump_reformat.awk
chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
quiet_cmd_posttest = TEST $@
- cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(reformatter) | $(obj)/insn_decoder_test $(posttest_64bit) $(posttest_verbose)
+ cmd_posttest = $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(reformatter) | $(obj)/insn_decoder_test $(posttest_64bit) $(posttest_verbose)
quiet_cmd_sanitytest = TEST $@
cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
deleted file mode 100644
index a4cf678cf..000000000
--- a/arch/x86/tools/chkobjdump.awk
+++ /dev/null
@@ -1,34 +0,0 @@
-# GNU objdump version checker
-#
-# Usage:
-# objdump -v | awk -f chkobjdump.awk
-BEGIN {
- # objdump version 2.19 or later is OK for the test.
- od_ver = 2;
- od_sver = 19;
-}
-
-/^GNU objdump/ {
- verstr = ""
- gsub(/\(.*\)/, "");
- for (i = 3; i <= NF; i++)
- if (match($(i), "^[0-9]")) {
- verstr = $(i);
- break;
- }
- if (verstr == "") {
- printf("Warning: Failed to find objdump version number.\n");
- exit 0;
- }
- split(verstr, ver, ".");
- if (ver[1] > od_ver ||
- (ver[1] == od_ver && ver[2] >= od_sver)) {
- exit 1;
- } else {
- printf("Warning: objdump version %s is older than %d.%d\n",
- verstr, od_ver, od_sver);
- print("Warning: Skipping posttest.");
- # Logic is inverted, because we just skip test without error.
- exit 0;
- }
-}
diff --git a/arch/x86/tools/objdump_reformat.awk b/arch/x86/tools/objdump_reformat.awk
index f418c91b7..20b08a6c4 100644
--- a/arch/x86/tools/objdump_reformat.awk
+++ b/arch/x86/tools/objdump_reformat.awk
@@ -11,8 +11,8 @@ BEGIN {
prev_addr = ""
prev_hex = ""
prev_mnemonic = ""
- bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
- fwait_expr = "^9b "
+ bad_expr = "(\\(bad\\)|<unknown>|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
+ fwait_expr = "^9b[ \t]*fwait"
fwait_str="9b\tfwait"
}
@@ -22,7 +22,7 @@ BEGIN {
}
/^ *[0-9a-f]+:/ {
- if (split($0, field, "\t") < 3) {
+ if (split($0, field, /: |\t/) < 3) {
# This is a continuation of the same insn.
prev_hex = prev_hex field[2]
} else {
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index d30949e25..e7a44a7f6 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -66,7 +66,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
[S_REL] =
"^(__init_(begin|end)|"
"__x86_cpu_dev_(start|end)|"
- "(__parainstructions|__alt_instructions)(_end)?|"
+ "__alt_instructions(_end)?|"
"(__iommu_table|__apicdrivers|__smp_locks)(_end)?|"
"__(start|end)_pci_.*|"
#if CONFIG_FW_LOADER
@@ -653,6 +653,14 @@ static void print_absolute_relocs(void)
if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
continue;
}
+ /*
+ * Do not perform relocations in .notes section; any
+ * values there are meant for pre-boot consumption (e.g.
+ * startup_xen).
+ */
+ if (sec_applies->shdr.sh_type == SHT_NOTE) {
+ continue;
+ }
sh_symtab = sec_symtab->symtab;
sym_strtab = sec_symtab->link->strtab;
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
@@ -738,6 +746,15 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
continue;
}
+
+ /*
+ * Do not perform relocations in .notes sections; any
+ * values there are meant for pre-boot consumption (e.g.
+ * startup_xen).
+ */
+ if (sec_applies->shdr.sh_type == SHT_NOTE)
+ continue;
+
sh_symtab = sec_symtab->symtab;
sym_strtab = sec_symtab->link->strtab;
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 6523eb7c3..6052200fe 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -168,8 +168,8 @@ do { \
(pr_reg)[18] = (_regs)->regs.gp[18]; \
(pr_reg)[19] = (_regs)->regs.gp[19]; \
(pr_reg)[20] = (_regs)->regs.gp[20]; \
- (pr_reg)[21] = current->thread.arch.fs; \
- (pr_reg)[22] = 0; \
+ (pr_reg)[21] = (_regs)->regs.gp[21]; \
+ (pr_reg)[22] = (_regs)->regs.gp[22]; \
(pr_reg)[23] = 0; \
(pr_reg)[24] = 0; \
(pr_reg)[25] = 0; \
diff --git a/arch/x86/um/asm/processor_64.h b/arch/x86/um/asm/processor_64.h
index 1ef9c2187..f90159508 100644
--- a/arch/x86/um/asm/processor_64.h
+++ b/arch/x86/um/asm/processor_64.h
@@ -10,13 +10,11 @@
struct arch_thread {
unsigned long debugregs[8];
int debugregs_seq;
- unsigned long fs;
struct faultinfo faultinfo;
};
#define INIT_ARCH_THREAD { .debugregs = { [ 0 ... 7 ] = 0 }, \
.debugregs_seq = 0, \
- .fs = 0, \
.faultinfo = { 0, 0, 0 } }
#define STACKSLOTS_PER_LINE 4
@@ -28,7 +26,6 @@ static inline void arch_flush_thread(struct arch_thread *thread)
static inline void arch_copy_thread(struct arch_thread *from,
struct arch_thread *to)
{
- to->fs = from->fs;
}
#define current_sp() ({ void *sp; __asm__("movq %%rsp, %0" : "=r" (sp) : ); sp; })
diff --git a/arch/x86/um/os-Linux/Makefile b/arch/x86/um/os-Linux/Makefile
index ae169125d..5249bbc30 100644
--- a/arch/x86/um/os-Linux/Makefile
+++ b/arch/x86/um/os-Linux/Makefile
@@ -6,7 +6,6 @@
obj-y = registers.o task_size.o mcontext.o
obj-$(CONFIG_X86_32) += tls.o
-obj-$(CONFIG_64BIT) += prctl.o
USER_OBJS := $(obj-y)
diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c
deleted file mode 100644
index 8431e87ac..000000000
--- a/arch/x86/um/os-Linux/prctl.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Copyright (C) 2007 Jeff Dike (jdike@{addtoit.com,linux.intel.com})
- * Licensed under the GPL
- */
-
-#include <sys/ptrace.h>
-#include <asm/ptrace.h>
-
-int os_arch_prctl(int pid, int option, unsigned long *arg2)
-{
- return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) arg2, option);
-}
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index 0bc4b73a9..7f1abde2c 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -25,30 +25,6 @@ void arch_switch_to(struct task_struct *to)
printk(KERN_WARNING "arch_switch_tls failed, errno = EINVAL\n");
}
-int is_syscall(unsigned long addr)
-{
- unsigned short instr;
- int n;
-
- n = copy_from_user(&instr, (void __user *) addr, sizeof(instr));
- if (n) {
- /* access_process_vm() grants access to vsyscall and stub,
- * while copy_from_user doesn't. Maybe access_process_vm is
- * slow, but that doesn't matter, since it will be called only
- * in case of singlestepping, if copy_from_user failed.
- */
- n = access_process_vm(current, addr, &instr, sizeof(instr),
- FOLL_FORCE);
- if (n != sizeof(instr)) {
- printk(KERN_ERR "is_syscall : failed to read "
- "instruction from 0x%lx\n", addr);
- return 1;
- }
- }
- /* int 0x80 or sysenter */
- return (instr == 0x80cd) || (instr == 0x340f);
-}
-
/* determines which flags the user has access to. */
/* 1 = access 0 = no access */
#define FLAG_MASK 0x00044dd5
diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c
index 289d0159b..aa68d83d3 100644
--- a/arch/x86/um/ptrace_64.c
+++ b/arch/x86/um/ptrace_64.c
@@ -188,32 +188,6 @@ int peek_user(struct task_struct *child, long addr, long data)
return put_user(tmp, (unsigned long *) data);
}
-/* XXX Mostly copied from sys-i386 */
-int is_syscall(unsigned long addr)
-{
- unsigned short instr;
- int n;
-
- n = copy_from_user(&instr, (void __user *) addr, sizeof(instr));
- if (n) {
- /*
- * access_process_vm() grants access to vsyscall and stub,
- * while copy_from_user doesn't. Maybe access_process_vm is
- * slow, but that doesn't matter, since it will be called only
- * in case of singlestepping, if copy_from_user failed.
- */
- n = access_process_vm(current, addr, &instr, sizeof(instr),
- FOLL_FORCE);
- if (n != sizeof(instr)) {
- printk("is_syscall : failed to read instruction from "
- "0x%lx\n", addr);
- return 1;
- }
- }
- /* sysenter */
- return instr == 0x050f;
-}
-
static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
{
int err, n, cpu = ((struct thread_info *) child->stack)->cpu;
diff --git a/arch/x86/um/shared/sysdep/ptrace_32.h b/arch/x86/um/shared/sysdep/ptrace_32.h
index db8478a83..0c4989842 100644
--- a/arch/x86/um/shared/sysdep/ptrace_32.h
+++ b/arch/x86/um/shared/sysdep/ptrace_32.h
@@ -8,10 +8,6 @@
#define MAX_FP_NR HOST_FPX_SIZE
-void set_using_sysemu(int value);
-int get_using_sysemu(void);
-extern int sysemu_supported;
-
#define UPT_SYSCALL_ARG1(r) UPT_BX(r)
#define UPT_SYSCALL_ARG2(r) UPT_CX(r)
#define UPT_SYSCALL_ARG3(r) UPT_DX(r)
diff --git a/arch/x86/um/shared/sysdep/ptrace_user.h b/arch/x86/um/shared/sysdep/ptrace_user.h
index 44782bbad..1d1a824fa 100644
--- a/arch/x86/um/shared/sysdep/ptrace_user.h
+++ b/arch/x86/um/shared/sysdep/ptrace_user.h
@@ -15,14 +15,12 @@
#define FP_SIZE ((HOST_FPX_SIZE > HOST_FP_SIZE) ? HOST_FPX_SIZE : HOST_FP_SIZE)
#else
#define FP_SIZE HOST_FP_SIZE
+#endif
/*
- * x86_64 FC3 doesn't define this in /usr/include/linux/ptrace.h even though
- * it's defined in the kernel's include/linux/ptrace.h. Additionally, use the
- * 2.4 name and value for 2.4 host compatibility.
+ * glibc before 2.27 does not include PTRACE_SYSEMU_SINGLESTEP in its enum,
+ * ensure we have a definition by (re-)defining it here.
*/
-#ifndef PTRACE_OLDSETOPTIONS
-#define PTRACE_OLDSETOPTIONS 21
-#endif
-
+#ifndef PTRACE_SYSEMU_SINGLESTEP
+#define PTRACE_SYSEMU_SINGLESTEP 32
#endif
diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h
index 38fa894b6..ea8b5a2d6 100644
--- a/arch/x86/um/shared/sysdep/stub_32.h
+++ b/arch/x86/um/shared/sysdep/stub_32.h
@@ -12,72 +12,79 @@
#define STUB_MMAP_NR __NR_mmap2
#define MMAP_OFFSET(o) ((o) >> UM_KERN_PAGE_SHIFT)
-static inline long stub_syscall0(long syscall)
+static __always_inline long stub_syscall0(long syscall)
{
long ret;
- __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall));
+ __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall)
+ : "memory");
return ret;
}
-static inline long stub_syscall1(long syscall, long arg1)
+static __always_inline long stub_syscall1(long syscall, long arg1)
{
long ret;
- __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1));
+ __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1)
+ : "memory");
return ret;
}
-static inline long stub_syscall2(long syscall, long arg1, long arg2)
+static __always_inline long stub_syscall2(long syscall, long arg1, long arg2)
{
long ret;
__asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1),
- "c" (arg2));
+ "c" (arg2)
+ : "memory");
return ret;
}
-static inline long stub_syscall3(long syscall, long arg1, long arg2, long arg3)
+static __always_inline long stub_syscall3(long syscall, long arg1, long arg2,
+ long arg3)
{
long ret;
__asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1),
- "c" (arg2), "d" (arg3));
+ "c" (arg2), "d" (arg3)
+ : "memory");
return ret;
}
-static inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3,
- long arg4)
+static __always_inline long stub_syscall4(long syscall, long arg1, long arg2,
+ long arg3, long arg4)
{
long ret;
__asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1),
- "c" (arg2), "d" (arg3), "S" (arg4));
+ "c" (arg2), "d" (arg3), "S" (arg4)
+ : "memory");
return ret;
}
-static inline long stub_syscall5(long syscall, long arg1, long arg2, long arg3,
- long arg4, long arg5)
+static __always_inline long stub_syscall5(long syscall, long arg1, long arg2,
+ long arg3, long arg4, long arg5)
{
long ret;
__asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1),
- "c" (arg2), "d" (arg3), "S" (arg4), "D" (arg5));
+ "c" (arg2), "d" (arg3), "S" (arg4), "D" (arg5)
+ : "memory");
return ret;
}
-static inline void trap_myself(void)
+static __always_inline void trap_myself(void)
{
__asm("int3");
}
-static inline void remap_stack_and_trap(void)
+static __always_inline void remap_stack_and_trap(void)
{
__asm__ volatile (
"movl %%esp,%%ebx ;"
diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h
index 2de1c8f88..b24168ef0 100644
--- a/arch/x86/um/shared/sysdep/stub_64.h
+++ b/arch/x86/um/shared/sysdep/stub_64.h
@@ -16,7 +16,7 @@
#define __syscall_clobber "r11","rcx","memory"
#define __syscall "syscall"
-static inline long stub_syscall0(long syscall)
+static __always_inline long stub_syscall0(long syscall)
{
long ret;
@@ -27,7 +27,7 @@ static inline long stub_syscall0(long syscall)
return ret;
}
-static inline long stub_syscall2(long syscall, long arg1, long arg2)
+static __always_inline long stub_syscall2(long syscall, long arg1, long arg2)
{
long ret;
@@ -38,7 +38,8 @@ static inline long stub_syscall2(long syscall, long arg1, long arg2)
return ret;
}
-static inline long stub_syscall3(long syscall, long arg1, long arg2, long arg3)
+static __always_inline long stub_syscall3(long syscall, long arg1, long arg2,
+ long arg3)
{
long ret;
@@ -50,7 +51,7 @@ static inline long stub_syscall3(long syscall, long arg1, long arg2, long arg3)
return ret;
}
-static inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3,
+static __always_inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3,
long arg4)
{
long ret;
@@ -64,8 +65,8 @@ static inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3,
return ret;
}
-static inline long stub_syscall5(long syscall, long arg1, long arg2, long arg3,
- long arg4, long arg5)
+static __always_inline long stub_syscall5(long syscall, long arg1, long arg2,
+ long arg3, long arg4, long arg5)
{
long ret;
@@ -78,12 +79,12 @@ static inline long stub_syscall5(long syscall, long arg1, long arg2, long arg3,
return ret;
}
-static inline void trap_myself(void)
+static __always_inline void trap_myself(void)
{
__asm("int3");
}
-static inline void remap_stack_and_trap(void)
+static __always_inline void remap_stack_and_trap(void)
{
__asm__ volatile (
"movq %0,%%rax ;"
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index 27b29ae6c..6a00a28c9 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -16,60 +16,24 @@
long arch_prctl(struct task_struct *task, int option,
unsigned long __user *arg2)
{
- unsigned long *ptr = arg2, tmp;
- long ret;
- int pid = task->mm->context.id.u.pid;
-
- /*
- * With ARCH_SET_FS (and ARCH_SET_GS is treated similarly to
- * be safe), we need to call arch_prctl on the host because
- * setting %fs may result in something else happening (like a
- * GDT or thread.fs being set instead). So, we let the host
- * fiddle the registers and thread struct and restore the
- * registers afterwards.
- *
- * So, the saved registers are stored to the process (this
- * needed because a stub may have been the last thing to run),
- * arch_prctl is run on the host, then the registers are read
- * back.
- */
- switch (option) {
- case ARCH_SET_FS:
- case ARCH_SET_GS:
- ret = restore_pid_registers(pid, &current->thread.regs.regs);
- if (ret)
- return ret;
- break;
- case ARCH_GET_FS:
- case ARCH_GET_GS:
- /*
- * With these two, we read to a local pointer and
- * put_user it to the userspace pointer that we were
- * given. If addr isn't valid (because it hasn't been
- * faulted in or is just bogus), we want put_user to
- * fault it in (or return -EFAULT) instead of having
- * the host return -EFAULT.
- */
- ptr = &tmp;
- }
-
- ret = os_arch_prctl(pid, option, ptr);
- if (ret)
- return ret;
+ long ret = -EINVAL;
switch (option) {
case ARCH_SET_FS:
- current->thread.arch.fs = (unsigned long) ptr;
- ret = save_registers(pid, &current->thread.regs.regs);
+ current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] =
+ (unsigned long) arg2;
+ ret = 0;
break;
case ARCH_SET_GS:
- ret = save_registers(pid, &current->thread.regs.regs);
+ current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)] =
+ (unsigned long) arg2;
+ ret = 0;
break;
case ARCH_GET_FS:
- ret = put_user(tmp, arg2);
+ ret = put_user(current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)], arg2);
break;
case ARCH_GET_GS:
- ret = put_user(tmp, arg2);
+ ret = put_user(current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)], arg2);
break;
}
@@ -83,10 +47,10 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
void arch_switch_to(struct task_struct *to)
{
- if ((to->thread.arch.fs == 0) || (to->mm == NULL))
- return;
-
- arch_prctl(to, ARCH_SET_FS, (void __user *) to->thread.arch.fs);
+ /*
+ * Nothing needs to be done on x86_64.
+ * The FS_BASE/GS_BASE registers are saved in the ptrace register set.
+ */
}
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c
index ef1eb4f4f..0bf6de40a 100644
--- a/arch/x86/um/sysrq_64.c
+++ b/arch/x86/um/sysrq_64.c
@@ -6,6 +6,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/utsname.h>
diff --git a/arch/x86/um/tls_64.c b/arch/x86/um/tls_64.c
index ebd3855d9..c51a613f6 100644
--- a/arch/x86/um/tls_64.c
+++ b/arch/x86/um/tls_64.c
@@ -12,7 +12,7 @@ int arch_set_tls(struct task_struct *t, unsigned long tls)
* If CLONE_SETTLS is set, we need to save the thread id
* so it can be set during context switches.
*/
- t->thread.arch.fs = tls;
+ t->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] = tls;
return 0;
}
diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile
index 6825e146a..b86d63473 100644
--- a/arch/x86/um/vdso/Makefile
+++ b/arch/x86/um/vdso/Makefile
@@ -67,15 +67,3 @@ quiet_cmd_vdso = VDSO $@
VDSO_LDFLAGS = -fPIC -shared -Wl,--hash-style=sysv -z noexecstack
GCOV_PROFILE := n
-
-#
-# Install the unstripped copy of vdso*.so listed in $(vdso-install-y).
-#
-quiet_cmd_vdso_install = INSTALL $@
- cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
-$(vdso-install-y): %.so: $(obj)/%.so.dbg FORCE
- @mkdir -p $(MODLIB)/vdso
- $(call cmd,vdso_install)
-
-PHONY += vdso_install $(vdso-install-y)
-vdso_install: $(vdso-install-y)
diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c
index 49a045240..1dd6528cc 100644
--- a/arch/x86/video/fbdev.c
+++ b/arch/x86/video/fbdev.c
@@ -13,16 +13,17 @@
#include <linux/vgaarb.h>
#include <asm/fb.h>
-void fb_pgprotect(struct file *file, struct vm_area_struct *vma, unsigned long off)
+pgprot_t pgprot_framebuffer(pgprot_t prot,
+ unsigned long vm_start, unsigned long vm_end,
+ unsigned long offset)
{
- unsigned long prot;
-
- prot = pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK;
+ pgprot_val(prot) &= ~_PAGE_CACHE_MASK;
if (boot_cpu_data.x86 > 3)
- pgprot_val(vma->vm_page_prot) =
- prot | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS);
+ pgprot_val(prot) |= cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS);
+
+ return prot;
}
-EXPORT_SYMBOL(fb_pgprotect);
+EXPORT_SYMBOL(pgprot_framebuffer);
int fb_is_primary_device(struct fb_info *info)
{
diff --git a/arch/x86/virt/Makefile b/arch/x86/virt/Makefile
new file mode 100644
index 000000000..1e36502cd
--- /dev/null
+++ b/arch/x86/virt/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-y += vmx/
diff --git a/arch/x86/virt/vmx/Makefile b/arch/x86/virt/vmx/Makefile
new file mode 100644
index 000000000..feebda21d
--- /dev/null
+++ b/arch/x86/virt/vmx/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_INTEL_TDX_HOST) += tdx/
diff --git a/arch/x86/virt/vmx/tdx/Makefile b/arch/x86/virt/vmx/tdx/Makefile
new file mode 100644
index 000000000..90da47eb8
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-y += seamcall.o tdx.o
diff --git a/arch/x86/virt/vmx/tdx/seamcall.S b/arch/x86/virt/vmx/tdx/seamcall.S
new file mode 100644
index 000000000..5b1f2286a
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/seamcall.S
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#include "tdxcall.S"
+
+/*
+ * __seamcall() - Host-side interface functions to SEAM software
+ * (the P-SEAMLDR or the TDX module).
+ *
+ * __seamcall() function ABI:
+ *
+ * @fn (RDI) - SEAMCALL Leaf number, moved to RAX
+ * @args (RSI) - struct tdx_module_args for input
+ *
+ * Only RCX/RDX/R8-R11 are used as input registers.
+ *
+ * Return (via RAX) TDX_SEAMCALL_VMFAILINVALID if the SEAMCALL itself
+ * fails, or the completion status of the SEAMCALL leaf function.
+ */
+SYM_FUNC_START(__seamcall)
+ TDX_MODULE_CALL host=1
+SYM_FUNC_END(__seamcall)
+
+/*
+ * __seamcall_ret() - Host-side interface functions to SEAM software
+ * (the P-SEAMLDR or the TDX module), with saving output registers to
+ * the 'struct tdx_module_args' used as input.
+ *
+ * __seamcall_ret() function ABI:
+ *
+ * @fn (RDI) - SEAMCALL Leaf number, moved to RAX
+ * @args (RSI) - struct tdx_module_args for input and output
+ *
+ * Only RCX/RDX/R8-R11 are used as input/output registers.
+ *
+ * Return (via RAX) TDX_SEAMCALL_VMFAILINVALID if the SEAMCALL itself
+ * fails, or the completion status of the SEAMCALL leaf function.
+ */
+SYM_FUNC_START(__seamcall_ret)
+ TDX_MODULE_CALL host=1 ret=1
+SYM_FUNC_END(__seamcall_ret)
+
+/*
+ * __seamcall_saved_ret() - Host-side interface functions to SEAM software
+ * (the P-SEAMLDR or the TDX module), with saving output registers to the
+ * 'struct tdx_module_args' used as input.
+ *
+ * __seamcall_saved_ret() function ABI:
+ *
+ * @fn (RDI) - SEAMCALL Leaf number, moved to RAX
+ * @args (RSI) - struct tdx_module_args for input and output
+ *
+ * All registers in @args are used as input/output registers.
+ *
+ * Return (via RAX) TDX_SEAMCALL_VMFAILINVALID if the SEAMCALL itself
+ * fails, or the completion status of the SEAMCALL leaf function.
+ */
+SYM_FUNC_START(__seamcall_saved_ret)
+ TDX_MODULE_CALL host=1 ret=1 saved=1
+SYM_FUNC_END(__seamcall_saved_ret)
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
new file mode 100644
index 000000000..4d6826a76
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -0,0 +1,1492 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright(c) 2023 Intel Corporation.
+ *
+ * Intel Trusted Domain Extensions (TDX) support
+ */
+
+#define pr_fmt(fmt) "virt/tdx: " fmt
+
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/printk.h>
+#include <linux/cpu.h>
+#include <linux/spinlock.h>
+#include <linux/percpu-defs.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/minmax.h>
+#include <linux/sizes.h>
+#include <linux/pfn.h>
+#include <linux/align.h>
+#include <linux/sort.h>
+#include <linux/log2.h>
+#include <linux/acpi.h>
+#include <linux/suspend.h>
+#include <linux/acpi.h>
+#include <asm/page.h>
+#include <asm/special_insns.h>
+#include <asm/msr-index.h>
+#include <asm/msr.h>
+#include <asm/cpufeature.h>
+#include <asm/tdx.h>
+#include <asm/intel-family.h>
+#include <asm/processor.h>
+#include <asm/mce.h>
+#include "tdx.h"
+
+static u32 tdx_global_keyid __ro_after_init;
+static u32 tdx_guest_keyid_start __ro_after_init;
+static u32 tdx_nr_guest_keyids __ro_after_init;
+
+static DEFINE_PER_CPU(bool, tdx_lp_initialized);
+
+static struct tdmr_info_list tdx_tdmr_list;
+
+static enum tdx_module_status_t tdx_module_status;
+static DEFINE_MUTEX(tdx_module_lock);
+
+/* All TDX-usable memory regions. Protected by mem_hotplug_lock. */
+static LIST_HEAD(tdx_memlist);
+
+typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
+
+static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
+{
+ pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
+}
+
+static inline void seamcall_err_ret(u64 fn, u64 err,
+ struct tdx_module_args *args)
+{
+ seamcall_err(fn, err, args);
+ pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
+ args->rcx, args->rdx, args->r8);
+ pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
+ args->r9, args->r10, args->r11);
+}
+
+static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
+ u64 fn, struct tdx_module_args *args)
+{
+ u64 sret = sc_retry(func, fn, args);
+
+ if (sret == TDX_SUCCESS)
+ return 0;
+
+ if (sret == TDX_SEAMCALL_VMFAILINVALID)
+ return -ENODEV;
+
+ if (sret == TDX_SEAMCALL_GP)
+ return -EOPNOTSUPP;
+
+ if (sret == TDX_SEAMCALL_UD)
+ return -EACCES;
+
+ err_func(fn, sret, args);
+ return -EIO;
+}
+
+#define seamcall_prerr(__fn, __args) \
+ sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))
+
+#define seamcall_prerr_ret(__fn, __args) \
+ sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
+
+/*
+ * Do the module global initialization once and return its result.
+ * It can be done on any cpu. It's always called with interrupts
+ * disabled.
+ */
+static int try_init_module_global(void)
+{
+ struct tdx_module_args args = {};
+ static DEFINE_RAW_SPINLOCK(sysinit_lock);
+ static bool sysinit_done;
+ static int sysinit_ret;
+
+ lockdep_assert_irqs_disabled();
+
+ raw_spin_lock(&sysinit_lock);
+
+ if (sysinit_done)
+ goto out;
+
+ /* RCX is module attributes and all bits are reserved */
+ args.rcx = 0;
+ sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
+
+ /*
+ * The first SEAMCALL also detects the TDX module, thus
+ * it can fail due to the TDX module is not loaded.
+ * Dump message to let the user know.
+ */
+ if (sysinit_ret == -ENODEV)
+ pr_err("module not loaded\n");
+
+ sysinit_done = true;
+out:
+ raw_spin_unlock(&sysinit_lock);
+ return sysinit_ret;
+}
+
+/**
+ * tdx_cpu_enable - Enable TDX on local cpu
+ *
+ * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
+ * global initialization SEAMCALL if not done) on local cpu to make this
+ * cpu be ready to run any other SEAMCALLs.
+ *
+ * Always call this function via IPI function calls.
+ *
+ * Return 0 on success, otherwise errors.
+ */
+int tdx_cpu_enable(void)
+{
+ struct tdx_module_args args = {};
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
+ return -ENODEV;
+
+ lockdep_assert_irqs_disabled();
+
+ if (__this_cpu_read(tdx_lp_initialized))
+ return 0;
+
+ /*
+ * The TDX module global initialization is the very first step
+ * to enable TDX. Need to do it first (if hasn't been done)
+ * before the per-cpu initialization.
+ */
+ ret = try_init_module_global();
+ if (ret)
+ return ret;
+
+ ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
+ if (ret)
+ return ret;
+
+ __this_cpu_write(tdx_lp_initialized, true);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tdx_cpu_enable);
+
+/*
+ * Add a memory region as a TDX memory block. The caller must make sure
+ * all memory regions are added in address ascending order and don't
+ * overlap.
+ */
+static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
+ unsigned long end_pfn, int nid)
+{
+ struct tdx_memblock *tmb;
+
+ tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
+ if (!tmb)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&tmb->list);
+ tmb->start_pfn = start_pfn;
+ tmb->end_pfn = end_pfn;
+ tmb->nid = nid;
+
+ /* @tmb_list is protected by mem_hotplug_lock */
+ list_add_tail(&tmb->list, tmb_list);
+ return 0;
+}
+
+static void free_tdx_memlist(struct list_head *tmb_list)
+{
+ /* @tmb_list is protected by mem_hotplug_lock */
+ while (!list_empty(tmb_list)) {
+ struct tdx_memblock *tmb = list_first_entry(tmb_list,
+ struct tdx_memblock, list);
+
+ list_del(&tmb->list);
+ kfree(tmb);
+ }
+}
+
+/*
+ * Ensure that all memblock memory regions are convertible to TDX
+ * memory. Once this has been established, stash the memblock
+ * ranges off in a secondary structure because memblock is modified
+ * in memory hotplug while TDX memory regions are fixed.
+ */
+static int build_tdx_memlist(struct list_head *tmb_list)
+{
+ unsigned long start_pfn, end_pfn;
+ int i, nid, ret;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ /*
+ * The first 1MB is not reported as TDX convertible memory.
+ * Although the first 1MB is always reserved and won't end up
+ * to the page allocator, it is still in memblock's memory
+ * regions. Skip them manually to exclude them as TDX memory.
+ */
+ start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
+ if (start_pfn >= end_pfn)
+ continue;
+
+ /*
+ * Add the memory regions as TDX memory. The regions in
+ * memblock has already guaranteed they are in address
+ * ascending order and don't overlap.
+ */
+ ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
+ if (ret)
+ goto err;
+ }
+
+ return 0;
+err:
+ free_tdx_memlist(tmb_list);
+ return ret;
+}
+
+static int read_sys_metadata_field(u64 field_id, u64 *data)
+{
+ struct tdx_module_args args = {};
+ int ret;
+
+ /*
+ * TDH.SYS.RD -- reads one global metadata field
+ * - RDX (in): the field to read
+ * - R8 (out): the field data
+ */
+ args.rdx = field_id;
+ ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
+ if (ret)
+ return ret;
+
+ *data = args.r8;
+
+ return 0;
+}
+
+static int read_sys_metadata_field16(u64 field_id,
+ int offset,
+ struct tdx_tdmr_sysinfo *ts)
+{
+ u16 *ts_member = ((void *)ts) + offset;
+ u64 tmp;
+ int ret;
+
+ if (WARN_ON_ONCE(MD_FIELD_ID_ELE_SIZE_CODE(field_id) !=
+ MD_FIELD_ID_ELE_SIZE_16BIT))
+ return -EINVAL;
+
+ ret = read_sys_metadata_field(field_id, &tmp);
+ if (ret)
+ return ret;
+
+ *ts_member = tmp;
+
+ return 0;
+}
+
+struct field_mapping {
+ u64 field_id;
+ int offset;
+};
+
+#define TD_SYSINFO_MAP(_field_id, _offset) \
+ { .field_id = MD_FIELD_ID_##_field_id, \
+ .offset = offsetof(struct tdx_tdmr_sysinfo, _offset) }
+
+/* Map TD_SYSINFO fields into 'struct tdx_tdmr_sysinfo': */
+static const struct field_mapping fields[] = {
+ TD_SYSINFO_MAP(MAX_TDMRS, max_tdmrs),
+ TD_SYSINFO_MAP(MAX_RESERVED_PER_TDMR, max_reserved_per_tdmr),
+ TD_SYSINFO_MAP(PAMT_4K_ENTRY_SIZE, pamt_entry_size[TDX_PS_4K]),
+ TD_SYSINFO_MAP(PAMT_2M_ENTRY_SIZE, pamt_entry_size[TDX_PS_2M]),
+ TD_SYSINFO_MAP(PAMT_1G_ENTRY_SIZE, pamt_entry_size[TDX_PS_1G]),
+};
+
+static int get_tdx_tdmr_sysinfo(struct tdx_tdmr_sysinfo *tdmr_sysinfo)
+{
+ int ret;
+ int i;
+
+ /* Populate 'tdmr_sysinfo' fields using the mapping structure above: */
+ for (i = 0; i < ARRAY_SIZE(fields); i++) {
+ ret = read_sys_metadata_field16(fields[i].field_id,
+ fields[i].offset,
+ tdmr_sysinfo);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Calculate the actual TDMR size */
+static int tdmr_size_single(u16 max_reserved_per_tdmr)
+{
+ int tdmr_sz;
+
+ /*
+ * The actual size of TDMR depends on the maximum
+ * number of reserved areas.
+ */
+ tdmr_sz = sizeof(struct tdmr_info);
+ tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;
+
+ return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
+}
+
+static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
+ struct tdx_tdmr_sysinfo *tdmr_sysinfo)
+{
+ size_t tdmr_sz, tdmr_array_sz;
+ void *tdmr_array;
+
+ tdmr_sz = tdmr_size_single(tdmr_sysinfo->max_reserved_per_tdmr);
+ tdmr_array_sz = tdmr_sz * tdmr_sysinfo->max_tdmrs;
+
+ /*
+ * To keep things simple, allocate all TDMRs together.
+ * The buffer needs to be physically contiguous to make
+ * sure each TDMR is physically contiguous.
+ */
+ tdmr_array = alloc_pages_exact(tdmr_array_sz,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!tdmr_array)
+ return -ENOMEM;
+
+ tdmr_list->tdmrs = tdmr_array;
+
+ /*
+ * Keep the size of TDMR to find the target TDMR
+ * at a given index in the TDMR list.
+ */
+ tdmr_list->tdmr_sz = tdmr_sz;
+ tdmr_list->max_tdmrs = tdmr_sysinfo->max_tdmrs;
+ tdmr_list->nr_consumed_tdmrs = 0;
+
+ return 0;
+}
+
+static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
+{
+ free_pages_exact(tdmr_list->tdmrs,
+ tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
+}
+
+/* Get the TDMR from the list at the given index. */
+static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
+ int idx)
+{
+ int tdmr_info_offset = tdmr_list->tdmr_sz * idx;
+
+ return (void *)tdmr_list->tdmrs + tdmr_info_offset;
+}
+
+#define TDMR_ALIGNMENT SZ_1G
+#define TDMR_ALIGN_DOWN(_addr) ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
+#define TDMR_ALIGN_UP(_addr) ALIGN((_addr), TDMR_ALIGNMENT)
+
+static inline u64 tdmr_end(struct tdmr_info *tdmr)
+{
+ return tdmr->base + tdmr->size;
+}
+
+/*
+ * Take the memory referenced in @tmb_list and populate the
+ * preallocated @tdmr_list, following all the special alignment
+ * and size rules for TDMR.
+ */
+static int fill_out_tdmrs(struct list_head *tmb_list,
+ struct tdmr_info_list *tdmr_list)
+{
+ struct tdx_memblock *tmb;
+ int tdmr_idx = 0;
+
+ /*
+ * Loop over TDX memory regions and fill out TDMRs to cover them.
+ * To keep it simple, always try to use one TDMR to cover one
+ * memory region.
+ *
+ * In practice TDX supports at least 64 TDMRs. A 2-socket system
+ * typically only consumes less than 10 of those. This code is
+ * dumb and simple and may use more TMDRs than is strictly
+ * required.
+ */
+ list_for_each_entry(tmb, tmb_list, list) {
+ struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
+ u64 start, end;
+
+ start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
+ end = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));
+
+ /*
+ * A valid size indicates the current TDMR has already
+ * been filled out to cover the previous memory region(s).
+ */
+ if (tdmr->size) {
+ /*
+ * Loop to the next if the current memory region
+ * has already been fully covered.
+ */
+ if (end <= tdmr_end(tdmr))
+ continue;
+
+ /* Otherwise, skip the already covered part. */
+ if (start < tdmr_end(tdmr))
+ start = tdmr_end(tdmr);
+
+ /*
+ * Create a new TDMR to cover the current memory
+ * region, or the remaining part of it.
+ */
+ tdmr_idx++;
+ if (tdmr_idx >= tdmr_list->max_tdmrs) {
+ pr_warn("initialization failed: TDMRs exhausted.\n");
+ return -ENOSPC;
+ }
+
+ tdmr = tdmr_entry(tdmr_list, tdmr_idx);
+ }
+
+ tdmr->base = start;
+ tdmr->size = end - start;
+ }
+
+ /* @tdmr_idx is always the index of the last valid TDMR. */
+ tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;
+
+ /*
+ * Warn early that kernel is about to run out of TDMRs.
+ *
+ * This is an indication that TDMR allocation has to be
+ * reworked to be smarter to not run into an issue.
+ */
+ if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
+ pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
+ tdmr_list->nr_consumed_tdmrs,
+ tdmr_list->max_tdmrs);
+
+ return 0;
+}
+
+/*
+ * Calculate PAMT size given a TDMR and a page size. The returned
+ * PAMT size is always aligned up to 4K page boundary.
+ */
+static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
+ u16 pamt_entry_size)
+{
+ unsigned long pamt_sz, nr_pamt_entries;
+
+ switch (pgsz) {
+ case TDX_PS_4K:
+ nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
+ break;
+ case TDX_PS_2M:
+ nr_pamt_entries = tdmr->size >> PMD_SHIFT;
+ break;
+ case TDX_PS_1G:
+ nr_pamt_entries = tdmr->size >> PUD_SHIFT;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ pamt_sz = nr_pamt_entries * pamt_entry_size;
+ /* TDX requires PAMT size must be 4K aligned */
+ pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
+
+ return pamt_sz;
+}
+
+/*
+ * Locate a NUMA node which should hold the allocation of the @tdmr
+ * PAMT. This node will have some memory covered by the TDMR. The
+ * relative amount of memory covered is not considered.
+ */
+static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
+{
+ struct tdx_memblock *tmb;
+
+ /*
+ * A TDMR must cover at least part of one TMB. That TMB will end
+ * after the TDMR begins. But, that TMB may have started before
+ * the TDMR. Find the next 'tmb' that _ends_ after this TDMR
+ * begins. Ignore 'tmb' start addresses. They are irrelevant.
+ */
+ list_for_each_entry(tmb, tmb_list, list) {
+ if (tmb->end_pfn > PHYS_PFN(tdmr->base))
+ return tmb->nid;
+ }
+
+ /*
+ * Fall back to allocating the TDMR's metadata from node 0 when
+ * no TDX memory block can be found. This should never happen
+ * since TDMRs originate from TDX memory blocks.
+ */
+ pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
+ tdmr->base, tdmr_end(tdmr));
+ return 0;
+}
+
+/*
+ * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
+ * within @tdmr, and set up PAMTs for @tdmr.
+ */
+static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
+ struct list_head *tmb_list,
+ u16 pamt_entry_size[])
+{
+ unsigned long pamt_base[TDX_PS_NR];
+ unsigned long pamt_size[TDX_PS_NR];
+ unsigned long tdmr_pamt_base;
+ unsigned long tdmr_pamt_size;
+ struct page *pamt;
+ int pgsz, nid;
+
+ nid = tdmr_get_nid(tdmr, tmb_list);
+
+ /*
+ * Calculate the PAMT size for each TDX supported page size
+ * and the total PAMT size.
+ */
+ tdmr_pamt_size = 0;
+ for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
+ pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
+ pamt_entry_size[pgsz]);
+ tdmr_pamt_size += pamt_size[pgsz];
+ }
+
+ /*
+ * Allocate one chunk of physically contiguous memory for all
+ * PAMTs. This helps minimize the PAMT's use of reserved areas
+ * in overlapped TDMRs.
+ */
+ pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
+ nid, &node_online_map);
+ if (!pamt)
+ return -ENOMEM;
+
+ /*
+ * Break the contiguous allocation back up into the
+ * individual PAMTs for each page size.
+ */
+ tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
+ for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
+ pamt_base[pgsz] = tdmr_pamt_base;
+ tdmr_pamt_base += pamt_size[pgsz];
+ }
+
+ tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
+ tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
+ tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
+ tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
+ tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
+ tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
+
+ return 0;
+}
+
+static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
+ unsigned long *pamt_size)
+{
+ unsigned long pamt_bs, pamt_sz;
+
+ /*
+ * The PAMT was allocated in one contiguous unit. The 4K PAMT
+ * should always point to the beginning of that allocation.
+ */
+ pamt_bs = tdmr->pamt_4k_base;
+ pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
+
+ WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));
+
+ *pamt_base = pamt_bs;
+ *pamt_size = pamt_sz;
+}
+
+static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
+ void (*pamt_func)(unsigned long base, unsigned long size))
+{
+ unsigned long pamt_base, pamt_size;
+
+ tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);
+
+ /* Do nothing if PAMT hasn't been allocated for this TDMR */
+ if (!pamt_size)
+ return;
+
+ if (WARN_ON_ONCE(!pamt_base))
+ return;
+
+ pamt_func(pamt_base, pamt_size);
+}
+
+static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
+{
+ free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
+}
+
+static void tdmr_free_pamt(struct tdmr_info *tdmr)
+{
+ tdmr_do_pamt_func(tdmr, free_pamt);
+}
+
+static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
+{
+ int i;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
+ tdmr_free_pamt(tdmr_entry(tdmr_list, i));
+}
+
+/* Allocate and set up PAMTs for all TDMRs */
+static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
+ struct list_head *tmb_list,
+ u16 pamt_entry_size[])
+{
+ int i, ret = 0;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+ ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
+ pamt_entry_size);
+ if (ret)
+ goto err;
+ }
+
+ return 0;
+err:
+ tdmrs_free_pamt_all(tdmr_list);
+ return ret;
+}
+
+/*
+ * Convert TDX private pages back to normal by using MOVDIR64B to
+ * clear these pages. Note this function doesn't flush cache of
+ * these TDX private pages. The caller should make sure of that.
+ */
+static void reset_tdx_pages(unsigned long base, unsigned long size)
+{
+ const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
+ unsigned long phys, end;
+
+ end = base + size;
+ for (phys = base; phys < end; phys += 64)
+ movdir64b(__va(phys), zero_page);
+
+ /*
+ * MOVDIR64B uses WC protocol. Use memory barrier to
+ * make sure any later user of these pages sees the
+ * updated data.
+ */
+ mb();
+}
+
+static void tdmr_reset_pamt(struct tdmr_info *tdmr)
+{
+ tdmr_do_pamt_func(tdmr, reset_tdx_pages);
+}
+
+static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
+{
+ int i;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
+ tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
+}
+
+static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
+{
+ unsigned long pamt_size = 0;
+ int i;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+ unsigned long base, size;
+
+ tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
+ pamt_size += size;
+ }
+
+ return pamt_size / 1024;
+}
+
+static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
+ u64 size, u16 max_reserved_per_tdmr)
+{
+ struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
+ int idx = *p_idx;
+
+ /* Reserved area must be 4K aligned in offset and size */
+ if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
+ return -EINVAL;
+
+ if (idx >= max_reserved_per_tdmr) {
+ pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
+ tdmr->base, tdmr_end(tdmr));
+ return -ENOSPC;
+ }
+
+ /*
+ * Consume one reserved area per call. Make no effort to
+ * optimize or reduce the number of reserved areas which are
+ * consumed by contiguous reserved areas, for instance.
+ */
+ rsvd_areas[idx].offset = addr - tdmr->base;
+ rsvd_areas[idx].size = size;
+
+ *p_idx = idx + 1;
+
+ return 0;
+}
+
+/*
+ * Go through @tmb_list to find holes between memory areas. If any of
+ * those holes fall within @tdmr, set up a TDMR reserved area to cover
+ * the hole.
+ */
+static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
+ struct tdmr_info *tdmr,
+ int *rsvd_idx,
+ u16 max_reserved_per_tdmr)
+{
+ struct tdx_memblock *tmb;
+ u64 prev_end;
+ int ret;
+
+ /*
+ * Start looking for reserved blocks at the
+ * beginning of the TDMR.
+ */
+ prev_end = tdmr->base;
+ list_for_each_entry(tmb, tmb_list, list) {
+ u64 start, end;
+
+ start = PFN_PHYS(tmb->start_pfn);
+ end = PFN_PHYS(tmb->end_pfn);
+
+ /* Break if this region is after the TDMR */
+ if (start >= tdmr_end(tdmr))
+ break;
+
+ /* Exclude regions before this TDMR */
+ if (end < tdmr->base)
+ continue;
+
+ /*
+ * Skip over memory areas that
+ * have already been dealt with.
+ */
+ if (start <= prev_end) {
+ prev_end = end;
+ continue;
+ }
+
+ /* Add the hole before this region */
+ ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
+ start - prev_end,
+ max_reserved_per_tdmr);
+ if (ret)
+ return ret;
+
+ prev_end = end;
+ }
+
+ /* Add the hole after the last region if it exists. */
+ if (prev_end < tdmr_end(tdmr)) {
+ ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
+ tdmr_end(tdmr) - prev_end,
+ max_reserved_per_tdmr);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Go through @tdmr_list to find all PAMTs. If any of those PAMTs
+ * overlaps with @tdmr, set up a TDMR reserved area to cover the
+ * overlapping part.
+ */
+static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
+ struct tdmr_info *tdmr,
+ int *rsvd_idx,
+ u16 max_reserved_per_tdmr)
+{
+ int i, ret;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+ struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
+ unsigned long pamt_base, pamt_size, pamt_end;
+
+ tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
+ /* Each TDMR must already have PAMT allocated */
+ WARN_ON_ONCE(!pamt_size || !pamt_base);
+
+ pamt_end = pamt_base + pamt_size;
+ /* Skip PAMTs outside of the given TDMR */
+ if ((pamt_end <= tdmr->base) ||
+ (pamt_base >= tdmr_end(tdmr)))
+ continue;
+
+ /* Only mark the part within the TDMR as reserved */
+ if (pamt_base < tdmr->base)
+ pamt_base = tdmr->base;
+ if (pamt_end > tdmr_end(tdmr))
+ pamt_end = tdmr_end(tdmr);
+
+ ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
+ pamt_end - pamt_base,
+ max_reserved_per_tdmr);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Compare function called by sort() for TDMR reserved areas */
+static int rsvd_area_cmp_func(const void *a, const void *b)
+{
+ struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
+ struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;
+
+ if (r1->offset + r1->size <= r2->offset)
+ return -1;
+ if (r1->offset >= r2->offset + r2->size)
+ return 1;
+
+ /* Reserved areas cannot overlap. The caller must guarantee. */
+ WARN_ON_ONCE(1);
+ return -1;
+}
+
+/*
+ * Populate reserved areas for the given @tdmr, including memory holes
+ * (via @tmb_list) and PAMTs (via @tdmr_list).
+ */
+static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
+ struct list_head *tmb_list,
+ struct tdmr_info_list *tdmr_list,
+ u16 max_reserved_per_tdmr)
+{
+ int ret, rsvd_idx = 0;
+
+ ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
+ max_reserved_per_tdmr);
+ if (ret)
+ return ret;
+
+ ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
+ max_reserved_per_tdmr);
+ if (ret)
+ return ret;
+
+ /* TDX requires reserved areas listed in address ascending order */
+ sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
+ rsvd_area_cmp_func, NULL);
+
+ return 0;
+}
+
+/*
+ * Populate reserved areas for all TDMRs in @tdmr_list, including memory
+ * holes (via @tmb_list) and PAMTs.
+ */
+static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
+ struct list_head *tmb_list,
+ u16 max_reserved_per_tdmr)
+{
+ int i;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+ int ret;
+
+ ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
+ tmb_list, tdmr_list, max_reserved_per_tdmr);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Construct a list of TDMRs on the preallocated space in @tdmr_list
+ * to cover all TDX memory regions in @tmb_list based on the TDX module
+ * TDMR global information in @tdmr_sysinfo.
+ */
+static int construct_tdmrs(struct list_head *tmb_list,
+ struct tdmr_info_list *tdmr_list,
+ struct tdx_tdmr_sysinfo *tdmr_sysinfo)
+{
+ int ret;
+
+ ret = fill_out_tdmrs(tmb_list, tdmr_list);
+ if (ret)
+ return ret;
+
+ ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
+ tdmr_sysinfo->pamt_entry_size);
+ if (ret)
+ return ret;
+
+ ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
+ tdmr_sysinfo->max_reserved_per_tdmr);
+ if (ret)
+ tdmrs_free_pamt_all(tdmr_list);
+
+ /*
+ * The tdmr_info_list is read-only from here on out.
+ * Ensure that these writes are seen by other CPUs.
+ * Pairs with a smp_rmb() in is_pamt_page().
+ */
+ smp_wmb();
+
+ return ret;
+}
+
+static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
+{
+ struct tdx_module_args args = {};
+ u64 *tdmr_pa_array;
+ size_t array_sz;
+ int i, ret;
+
+ /*
+ * TDMRs are passed to the TDX module via an array of physical
+ * addresses of each TDMR. The array itself also has certain
+ * alignment requirement.
+ */
+ array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
+ array_sz = roundup_pow_of_two(array_sz);
+ if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
+ array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;
+
+ tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
+ if (!tdmr_pa_array)
+ return -ENOMEM;
+
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
+ tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));
+
+ args.rcx = __pa(tdmr_pa_array);
+ args.rdx = tdmr_list->nr_consumed_tdmrs;
+ args.r8 = global_keyid;
+ ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
+
+ /* Free the array as it is not required anymore. */
+ kfree(tdmr_pa_array);
+
+ return ret;
+}
+
+static int do_global_key_config(void *unused)
+{
+ struct tdx_module_args args = {};
+
+ return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
+}
+
+/*
+ * Attempt to configure the global KeyID on all physical packages.
+ *
+ * This requires running code on at least one CPU in each package.
+ * TDMR initialization) will fail will fail if any package in the
+ * system has no online CPUs.
+ *
+ * This code takes no affirmative steps to online CPUs. Callers (aka.
+ * KVM) can ensure success by ensuring sufficient CPUs are online and
+ * can run SEAMCALLs.
+ */
+static int config_global_keyid(void)
+{
+ cpumask_var_t packages;
+ int cpu, ret = -EINVAL;
+
+ if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
+ return -ENOMEM;
+
+ /*
+ * Hardware doesn't guarantee cache coherency across different
+ * KeyIDs. The kernel needs to flush PAMT's dirty cachelines
+ * (associated with KeyID 0) before the TDX module can use the
+ * global KeyID to access the PAMT. Given PAMTs are potentially
+ * large (~1/256th of system RAM), just use WBINVD.
+ */
+ wbinvd_on_all_cpus();
+
+ for_each_online_cpu(cpu) {
+ /*
+ * The key configuration only needs to be done once per
+ * package and will return an error if configured more
+ * than once. Avoid doing it multiple times per package.
+ */
+ if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
+ packages))
+ continue;
+
+ /*
+ * TDH.SYS.KEY.CONFIG cannot run concurrently on
+ * different cpus. Do it one by one.
+ */
+ ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
+ if (ret)
+ break;
+ }
+
+ free_cpumask_var(packages);
+ return ret;
+}
+
+static int init_tdmr(struct tdmr_info *tdmr)
+{
+ u64 next;
+
+ /*
+ * Initializing a TDMR can be time consuming. To avoid long
+ * SEAMCALLs, the TDX module may only initialize a part of the
+ * TDMR in each call.
+ */
+ do {
+ struct tdx_module_args args = {
+ .rcx = tdmr->base,
+ };
+ int ret;
+
+ ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
+ if (ret)
+ return ret;
+ /*
+ * RDX contains 'next-to-initialize' address if
+ * TDH.SYS.TDMR.INIT did not fully complete and
+ * should be retried.
+ */
+ next = args.rdx;
+ cond_resched();
+ /* Keep making SEAMCALLs until the TDMR is done */
+ } while (next < tdmr->base + tdmr->size);
+
+ return 0;
+}
+
+static int init_tdmrs(struct tdmr_info_list *tdmr_list)
+{
+ int i;
+
+ /*
+ * This operation is costly. It can be parallelized,
+ * but keep it simple for now.
+ */
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+ int ret;
+
+ ret = init_tdmr(tdmr_entry(tdmr_list, i));
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int init_tdx_module(void)
+{
+ struct tdx_tdmr_sysinfo tdmr_sysinfo;
+ int ret;
+
+ /*
+ * To keep things simple, assume that all TDX-protected memory
+ * will come from the page allocator. Make sure all pages in the
+ * page allocator are TDX-usable memory.
+ *
+ * Build the list of "TDX-usable" memory regions which cover all
+ * pages in the page allocator to guarantee that. Do it while
+ * holding mem_hotplug_lock read-lock as the memory hotplug code
+ * path reads the @tdx_memlist to reject any new memory.
+ */
+ get_online_mems();
+
+ ret = build_tdx_memlist(&tdx_memlist);
+ if (ret)
+ goto out_put_tdxmem;
+
+ ret = get_tdx_tdmr_sysinfo(&tdmr_sysinfo);
+ if (ret)
+ goto err_free_tdxmem;
+
+ /* Allocate enough space for constructing TDMRs */
+ ret = alloc_tdmr_list(&tdx_tdmr_list, &tdmr_sysinfo);
+ if (ret)
+ goto err_free_tdxmem;
+
+ /* Cover all TDX-usable memory regions in TDMRs */
+ ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdmr_sysinfo);
+ if (ret)
+ goto err_free_tdmrs;
+
+ /* Pass the TDMRs and the global KeyID to the TDX module */
+ ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
+ if (ret)
+ goto err_free_pamts;
+
+ /* Config the key of global KeyID on all packages */
+ ret = config_global_keyid();
+ if (ret)
+ goto err_reset_pamts;
+
+ /* Initialize TDMRs to complete the TDX module initialization */
+ ret = init_tdmrs(&tdx_tdmr_list);
+ if (ret)
+ goto err_reset_pamts;
+
+ pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
+
+out_put_tdxmem:
+ /*
+ * @tdx_memlist is written here and read at memory hotplug time.
+ * Lock out memory hotplug code while building it.
+ */
+ put_online_mems();
+ return ret;
+
+err_reset_pamts:
+ /*
+ * Part of PAMTs may already have been initialized by the
+ * TDX module. Flush cache before returning PAMTs back
+ * to the kernel.
+ */
+ wbinvd_on_all_cpus();
+ /*
+ * According to the TDX hardware spec, if the platform
+ * doesn't have the "partial write machine check"
+ * erratum, any kernel read/write will never cause #MC
+ * in kernel space, thus it's OK to not convert PAMTs
+ * back to normal. But do the conversion anyway here
+ * as suggested by the TDX spec.
+ */
+ tdmrs_reset_pamt_all(&tdx_tdmr_list);
+err_free_pamts:
+ tdmrs_free_pamt_all(&tdx_tdmr_list);
+err_free_tdmrs:
+ free_tdmr_list(&tdx_tdmr_list);
+err_free_tdxmem:
+ free_tdx_memlist(&tdx_memlist);
+ goto out_put_tdxmem;
+}
+
+static int __tdx_enable(void)
+{
+ int ret;
+
+ ret = init_tdx_module();
+ if (ret) {
+ pr_err("module initialization failed (%d)\n", ret);
+ tdx_module_status = TDX_MODULE_ERROR;
+ return ret;
+ }
+
+ pr_info("module initialized\n");
+ tdx_module_status = TDX_MODULE_INITIALIZED;
+
+ return 0;
+}
+
+/**
+ * tdx_enable - Enable TDX module to make it ready to run TDX guests
+ *
+ * This function assumes the caller has: 1) held read lock of CPU hotplug
+ * lock to prevent any new cpu from becoming online; 2) done both VMXON
+ * and tdx_cpu_enable() on all online cpus.
+ *
+ * This function requires there's at least one online cpu for each CPU
+ * package to succeed.
+ *
+ * This function can be called in parallel by multiple callers.
+ *
+ * Return 0 if TDX is enabled successfully, otherwise error.
+ */
+int tdx_enable(void)
+{
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
+ return -ENODEV;
+
+ lockdep_assert_cpus_held();
+
+ mutex_lock(&tdx_module_lock);
+
+ switch (tdx_module_status) {
+ case TDX_MODULE_UNINITIALIZED:
+ ret = __tdx_enable();
+ break;
+ case TDX_MODULE_INITIALIZED:
+ /* Already initialized, great, tell the caller. */
+ ret = 0;
+ break;
+ default:
+ /* Failed to initialize in the previous attempts */
+ ret = -EINVAL;
+ break;
+ }
+
+ mutex_unlock(&tdx_module_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tdx_enable);
+
+static bool is_pamt_page(unsigned long phys)
+{
+ struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
+ int i;
+
+ /* Ensure that all remote 'tdmr_list' writes are visible: */
+ smp_rmb();
+
+ /*
+ * The TDX module is no longer returning TDX_SYS_NOT_READY and
+ * is initialized. The 'tdmr_list' was initialized long ago
+ * and is now read-only.
+ */
+ for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+ unsigned long base, size;
+
+ tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
+
+ if (phys >= base && phys < (base + size))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Return whether the memory page at the given physical address is TDX
+ * private memory or not.
+ *
+ * This can be imprecise for two known reasons:
+ * 1. PAMTs are private memory and exist before the TDX module is
+ * ready and TDH_PHYMEM_PAGE_RDMD works. This is a relatively
+ * short window that occurs once per boot.
+ * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
+ * page. However, the page can still cause #MC until it has been
+ * fully converted to shared using 64-byte writes like MOVDIR64B.
+ * Buggy hosts might still leave #MC-causing memory in place which
+ * this function can not detect.
+ */
+static bool paddr_is_tdx_private(unsigned long phys)
+{
+ struct tdx_module_args args = {
+ .rcx = phys & PAGE_MASK,
+ };
+ u64 sret;
+
+ if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
+ return false;
+
+ /* Get page type from the TDX module */
+ sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
+
+ /*
+ * The SEAMCALL will not return success unless there is a
+ * working, "ready" TDX module. Assume an absence of TDX
+ * private pages until SEAMCALL is working.
+ */
+ if (sret)
+ return false;
+
+ /*
+ * SEAMCALL was successful -- read page type (via RCX):
+ *
+ * - PT_NDA: Page is not used by the TDX module
+ * - PT_RSVD: Reserved for Non-TDX use
+ * - Others: Page is used by the TDX module
+ *
+ * Note PAMT pages are marked as PT_RSVD but they are also TDX
+ * private memory.
+ */
+ switch (args.rcx) {
+ case PT_NDA:
+ return false;
+ case PT_RSVD:
+ return is_pamt_page(phys);
+ default:
+ return true;
+ }
+}
+
+/*
+ * Some TDX-capable CPUs have an erratum. A write to TDX private
+ * memory poisons that memory, and a subsequent read of that memory
+ * triggers #MC.
+ *
+ * Help distinguish erratum-triggered #MCs from a normal hardware one.
+ * Just print additional message to show such #MC may be result of the
+ * erratum.
+ */
+const char *tdx_dump_mce_info(struct mce *m)
+{
+ if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
+ return NULL;
+
+ if (!paddr_is_tdx_private(m->addr))
+ return NULL;
+
+ return "TDX private memory error. Possible kernel bug.";
+}
+
+static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
+ u32 *nr_tdx_keyids)
+{
+ u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
+ int ret;
+
+ /*
+ * IA32_MKTME_KEYID_PARTIONING:
+ * Bit [31:0]: Number of MKTME KeyIDs.
+ * Bit [63:32]: Number of TDX private KeyIDs.
+ */
+ ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
+ &_nr_tdx_keyids);
+ if (ret || !_nr_tdx_keyids)
+ return -EINVAL;
+
+ /* TDX KeyIDs start after the last MKTME KeyID. */
+ _tdx_keyid_start = _nr_mktme_keyids + 1;
+
+ *tdx_keyid_start = _tdx_keyid_start;
+ *nr_tdx_keyids = _nr_tdx_keyids;
+
+ return 0;
+}
+
+static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
+{
+ struct tdx_memblock *tmb;
+
+ /*
+ * This check assumes that the start_pfn<->end_pfn range does not
+ * cross multiple @tdx_memlist entries. A single memory online
+ * event across multiple memblocks (from which @tdx_memlist
+ * entries are derived at the time of module initialization) is
+ * not possible. This is because memory offline/online is done
+ * on granularity of 'struct memory_block', and the hotpluggable
+ * memory region (one memblock) must be multiple of memory_block.
+ */
+ list_for_each_entry(tmb, &tdx_memlist, list) {
+ if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
+ return true;
+ }
+ return false;
+}
+
+static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
+ void *v)
+{
+ struct memory_notify *mn = v;
+
+ if (action != MEM_GOING_ONLINE)
+ return NOTIFY_OK;
+
+ /*
+ * Empty list means TDX isn't enabled. Allow any memory
+ * to go online.
+ */
+ if (list_empty(&tdx_memlist))
+ return NOTIFY_OK;
+
+ /*
+ * The TDX memory configuration is static and can not be
+ * changed. Reject onlining any memory which is outside of
+ * the static configuration whether it supports TDX or not.
+ */
+ if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
+ return NOTIFY_OK;
+
+ return NOTIFY_BAD;
+}
+
+static struct notifier_block tdx_memory_nb = {
+ .notifier_call = tdx_memory_notifier,
+};
+
+static void __init check_tdx_erratum(void)
+{
+ /*
+ * These CPUs have an erratum. A partial write from non-TD
+ * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
+ * private memory poisons that memory, and a subsequent read of
+ * that memory triggers #MC.
+ */
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_SAPPHIRERAPIDS_X:
+ case INTEL_FAM6_EMERALDRAPIDS_X:
+ setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
+ }
+}
+
+void __init tdx_init(void)
+{
+ u32 tdx_keyid_start, nr_tdx_keyids;
+ int err;
+
+ err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
+ if (err)
+ return;
+
+ pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
+ tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);
+
+ /*
+ * The TDX module itself requires one 'global KeyID' to protect
+ * its metadata. If there's only one TDX KeyID, there won't be
+ * any left for TDX guests thus there's no point to enable TDX
+ * at all.
+ */
+ if (nr_tdx_keyids < 2) {
+ pr_err("initialization failed: too few private KeyIDs available.\n");
+ return;
+ }
+
+ /*
+ * At this point, hibernation_available() indicates whether or
+ * not hibernation support has been permanently disabled.
+ */
+ if (hibernation_available()) {
+ pr_err("initialization failed: Hibernation support is enabled\n");
+ return;
+ }
+
+ err = register_memory_notifier(&tdx_memory_nb);
+ if (err) {
+ pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
+ err);
+ return;
+ }
+
+#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
+ pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
+ acpi_suspend_lowlevel = NULL;
+#endif
+
+ /*
+ * Just use the first TDX KeyID as the 'global KeyID' and
+ * leave the rest for TDX guests.
+ */
+ tdx_global_keyid = tdx_keyid_start;
+ tdx_guest_keyid_start = tdx_keyid_start + 1;
+ tdx_nr_guest_keyids = nr_tdx_keyids - 1;
+
+ setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);
+
+ check_tdx_erratum();
+}
diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
new file mode 100644
index 000000000..b701f6948
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_VIRT_TDX_H
+#define _X86_VIRT_TDX_H
+
+#include <linux/bits.h>
+
+/*
+ * This file contains both macros and data structures defined by the TDX
+ * architecture and Linux defined software data structures and functions.
+ * The two should not be mixed together for better readability. The
+ * architectural definitions come first.
+ */
+
+/*
+ * TDX module SEAMCALL leaf functions
+ */
+#define TDH_PHYMEM_PAGE_RDMD 24
+#define TDH_SYS_KEY_CONFIG 31
+#define TDH_SYS_INIT 33
+#define TDH_SYS_RD 34
+#define TDH_SYS_LP_INIT 35
+#define TDH_SYS_TDMR_INIT 36
+#define TDH_SYS_CONFIG 45
+
+/* TDX page types */
+#define PT_NDA 0x0
+#define PT_RSVD 0x1
+
+/*
+ * Global scope metadata field ID.
+ *
+ * See Table "Global Scope Metadata", TDX module 1.5 ABI spec.
+ */
+#define MD_FIELD_ID_MAX_TDMRS 0x9100000100000008ULL
+#define MD_FIELD_ID_MAX_RESERVED_PER_TDMR 0x9100000100000009ULL
+#define MD_FIELD_ID_PAMT_4K_ENTRY_SIZE 0x9100000100000010ULL
+#define MD_FIELD_ID_PAMT_2M_ENTRY_SIZE 0x9100000100000011ULL
+#define MD_FIELD_ID_PAMT_1G_ENTRY_SIZE 0x9100000100000012ULL
+
+/*
+ * Sub-field definition of metadata field ID.
+ *
+ * See Table "MD_FIELD_ID (Metadata Field Identifier / Sequence Header)
+ * Definition", TDX module 1.5 ABI spec.
+ *
+ * - Bit 33:32: ELEMENT_SIZE_CODE -- size of a single element of metadata
+ *
+ * 0: 8 bits
+ * 1: 16 bits
+ * 2: 32 bits
+ * 3: 64 bits
+ */
+#define MD_FIELD_ID_ELE_SIZE_CODE(_field_id) \
+ (((_field_id) & GENMASK_ULL(33, 32)) >> 32)
+
+#define MD_FIELD_ID_ELE_SIZE_16BIT 1
+
+struct tdmr_reserved_area {
+ u64 offset;
+ u64 size;
+} __packed;
+
+#define TDMR_INFO_ALIGNMENT 512
+#define TDMR_INFO_PA_ARRAY_ALIGNMENT 512
+
+struct tdmr_info {
+ u64 base;
+ u64 size;
+ u64 pamt_1g_base;
+ u64 pamt_1g_size;
+ u64 pamt_2m_base;
+ u64 pamt_2m_size;
+ u64 pamt_4k_base;
+ u64 pamt_4k_size;
+ /*
+ * The actual number of reserved areas depends on the value of
+ * field MD_FIELD_ID_MAX_RESERVED_PER_TDMR in the TDX module
+ * global metadata.
+ */
+ DECLARE_FLEX_ARRAY(struct tdmr_reserved_area, reserved_areas);
+} __packed __aligned(TDMR_INFO_ALIGNMENT);
+
+/*
+ * Do not put any hardware-defined TDX structure representations below
+ * this comment!
+ */
+
+/* Kernel defined TDX module status during module initialization. */
+enum tdx_module_status_t {
+ TDX_MODULE_UNINITIALIZED,
+ TDX_MODULE_INITIALIZED,
+ TDX_MODULE_ERROR
+};
+
+struct tdx_memblock {
+ struct list_head list;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+ int nid;
+};
+
+/* "TDMR info" part of "Global Scope Metadata" for constructing TDMRs */
+struct tdx_tdmr_sysinfo {
+ u16 max_tdmrs;
+ u16 max_reserved_per_tdmr;
+ u16 pamt_entry_size[TDX_PS_NR];
+};
+
+/* Warn if kernel has less than TDMR_NR_WARN TDMRs after allocation */
+#define TDMR_NR_WARN 4
+
+struct tdmr_info_list {
+ void *tdmrs; /* Flexible array to hold 'tdmr_info's */
+ int nr_consumed_tdmrs; /* How many 'tdmr_info's are in use */
+
+ /* Metadata for finding target 'tdmr_info' and freeing @tdmrs */
+ int tdmr_sz; /* Size of one 'tdmr_info' */
+ int max_tdmrs; /* How many 'tdmr_info's are allocated */
+};
+
+#endif
diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
index 49a54356a..016a2a1ec 100644
--- a/arch/x86/virt/vmx/tdx/tdxcall.S
+++ b/arch/x86/virt/vmx/tdx/tdxcall.S
@@ -1,5 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/asm-offsets.h>
+#include <asm/frame.h>
+#include <asm/asm.h>
#include <asm/tdx.h>
/*
@@ -16,35 +18,75 @@
* TDX module and hypercalls to the VMM.
* SEAMCALL - used by TDX hosts to make requests to the
* TDX module.
+ *
+ *-------------------------------------------------------------------------
+ * TDCALL/SEAMCALL ABI:
+ *-------------------------------------------------------------------------
+ * Input Registers:
+ *
+ * RAX - TDCALL/SEAMCALL Leaf number.
+ * RCX,RDX,RDI,RSI,RBX,R8-R15 - TDCALL/SEAMCALL Leaf specific input registers.
+ *
+ * Output Registers:
+ *
+ * RAX - TDCALL/SEAMCALL instruction error code.
+ * RCX,RDX,RDI,RSI,RBX,R8-R15 - TDCALL/SEAMCALL Leaf specific output registers.
+ *
+ *-------------------------------------------------------------------------
+ *
+ * So while the common core (RAX,RCX,RDX,R8-R11) fits nicely in the
+ * callee-clobbered registers and even leaves RDI,RSI free to act as a
+ * base pointer, some leafs (e.g., VP.ENTER) make a giant mess of things.
+ *
+ * For simplicity, assume that anything that needs the callee-saved regs
+ * also tramples on RDI,RSI. This isn't strictly true, see for example
+ * TDH.EXPORT.MEM.
*/
-.macro TDX_MODULE_CALL host:req
- /*
- * R12 will be used as temporary storage for struct tdx_module_output
- * pointer. Since R12-R15 registers are not used by TDCALL/SEAMCALL
- * services supported by this function, it can be reused.
- */
+.macro TDX_MODULE_CALL host:req ret=0 saved=0
+ FRAME_BEGIN
- /* Callee saved, so preserve it */
- push %r12
+ /* Move Leaf ID to RAX */
+ mov %rdi, %rax
+
+ /* Move other input regs from 'struct tdx_module_args' */
+ movq TDX_MODULE_rcx(%rsi), %rcx
+ movq TDX_MODULE_rdx(%rsi), %rdx
+ movq TDX_MODULE_r8(%rsi), %r8
+ movq TDX_MODULE_r9(%rsi), %r9
+ movq TDX_MODULE_r10(%rsi), %r10
+ movq TDX_MODULE_r11(%rsi), %r11
+.if \saved
/*
- * Push output pointer to stack.
- * After the operation, it will be fetched into R12 register.
+ * Move additional input regs from the structure. For simplicity
+ * assume that anything needs the callee-saved regs also tramples
+ * on RDI/RSI (see VP.ENTER).
*/
- push %r9
+ /* Save those callee-saved GPRs as mandated by the x86_64 ABI */
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
- /* Mangle function call ABI into TDCALL/SEAMCALL ABI: */
- /* Move Leaf ID to RAX */
- mov %rdi, %rax
- /* Move input 4 to R9 */
- mov %r8, %r9
- /* Move input 3 to R8 */
- mov %rcx, %r8
- /* Move input 1 to RCX */
- mov %rsi, %rcx
- /* Leave input param 2 in RDX */
-
- .if \host
+ movq TDX_MODULE_r12(%rsi), %r12
+ movq TDX_MODULE_r13(%rsi), %r13
+ movq TDX_MODULE_r14(%rsi), %r14
+ movq TDX_MODULE_r15(%rsi), %r15
+ movq TDX_MODULE_rbx(%rsi), %rbx
+
+.if \ret
+ /* Save the structure pointer as RSI is about to be clobbered */
+ pushq %rsi
+.endif
+
+ movq TDX_MODULE_rdi(%rsi), %rdi
+ /* RSI needs to be done at last */
+ movq TDX_MODULE_rsi(%rsi), %rsi
+.endif /* \saved */
+
+.if \host
+.Lseamcall\@:
seamcall
/*
* SEAMCALL instruction is essentially a VMExit from VMX root
@@ -57,40 +99,122 @@
* This value will never be used as actual SEAMCALL error code as
* it is from the Reserved status code class.
*/
- jnc .Lno_vmfailinvalid
- mov $TDX_SEAMCALL_VMFAILINVALID, %rax
-.Lno_vmfailinvalid:
-
- .else
+ jc .Lseamcall_vmfailinvalid\@
+.else
tdcall
- .endif
+.endif
+.if \ret
+.if \saved
/*
- * Fetch output pointer from stack to R12 (It is used
- * as temporary storage)
+ * Restore the structure from stack to save the output registers
+ *
+ * In case of VP.ENTER returns due to TDVMCALL, all registers are
+ * valid thus no register can be used as spare to restore the
+ * structure from the stack (see "TDH.VP.ENTER Output Operands
+ * Definition on TDCALL(TDG.VP.VMCALL) Following a TD Entry").
+ * For this case, need to make one register as spare by saving it
+ * to the stack and then manually load the structure pointer to
+ * the spare register.
+ *
+ * Note for other TDCALLs/SEAMCALLs there are spare registers
+ * thus no need for such hack but just use this for all.
*/
- pop %r12
+ pushq %rax /* save the TDCALL/SEAMCALL return code */
+ movq 8(%rsp), %rax /* restore the structure pointer */
+ movq %rsi, TDX_MODULE_rsi(%rax) /* save RSI */
+ popq %rax /* restore the return code */
+ popq %rsi /* pop the structure pointer */
+
+ /* Copy additional output regs to the structure */
+ movq %r12, TDX_MODULE_r12(%rsi)
+ movq %r13, TDX_MODULE_r13(%rsi)
+ movq %r14, TDX_MODULE_r14(%rsi)
+ movq %r15, TDX_MODULE_r15(%rsi)
+ movq %rbx, TDX_MODULE_rbx(%rsi)
+ movq %rdi, TDX_MODULE_rdi(%rsi)
+.endif /* \saved */
+ /* Copy output registers to the structure */
+ movq %rcx, TDX_MODULE_rcx(%rsi)
+ movq %rdx, TDX_MODULE_rdx(%rsi)
+ movq %r8, TDX_MODULE_r8(%rsi)
+ movq %r9, TDX_MODULE_r9(%rsi)
+ movq %r10, TDX_MODULE_r10(%rsi)
+ movq %r11, TDX_MODULE_r11(%rsi)
+.endif /* \ret */
+
+.if \saved && \ret
/*
- * Since this macro can be invoked with NULL as an output pointer,
- * check if caller provided an output struct before storing output
- * registers.
+ * Clear registers shared by guest for VP.VMCALL/VP.ENTER to prevent
+ * speculative use of guest's/VMM's values, including those are
+ * restored from the stack.
+ *
+ * See arch/x86/kvm/vmx/vmenter.S:
*
- * Update output registers, even if the call failed (RAX != 0).
- * Other registers may contain details of the failure.
+ * In theory, a L1 cache miss when restoring register from stack
+ * could lead to speculative execution with guest's values.
+ *
+ * Note: RBP/RSP are not used as shared register. RSI has been
+ * restored already.
+ *
+ * XOR is cheap, thus unconditionally do for all leafs.
*/
- test %r12, %r12
- jz .Lno_output_struct
-
- /* Copy result registers to output struct: */
- movq %rcx, TDX_MODULE_rcx(%r12)
- movq %rdx, TDX_MODULE_rdx(%r12)
- movq %r8, TDX_MODULE_r8(%r12)
- movq %r9, TDX_MODULE_r9(%r12)
- movq %r10, TDX_MODULE_r10(%r12)
- movq %r11, TDX_MODULE_r11(%r12)
-
-.Lno_output_struct:
- /* Restore the state of R12 register */
- pop %r12
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %r8d, %r8d
+ xorl %r9d, %r9d
+ xorl %r10d, %r10d
+ xorl %r11d, %r11d
+ xorl %r12d, %r12d
+ xorl %r13d, %r13d
+ xorl %r14d, %r14d
+ xorl %r15d, %r15d
+ xorl %ebx, %ebx
+ xorl %edi, %edi
+.endif /* \ret && \host */
+
+.if \host
+.Lout\@:
+.endif
+
+.if \saved
+ /* Restore callee-saved GPRs as mandated by the x86_64 ABI */
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+.endif /* \saved */
+
+ FRAME_END
+ RET
+
+.if \host
+.Lseamcall_vmfailinvalid\@:
+ mov $TDX_SEAMCALL_VMFAILINVALID, %rax
+ jmp .Lseamcall_fail\@
+
+.Lseamcall_trap\@:
+ /*
+ * SEAMCALL caused #GP or #UD. By reaching here RAX contains
+ * the trap number. Convert the trap number to the TDX error
+ * code by setting TDX_SW_ERROR to the high 32-bits of RAX.
+ *
+ * Note cannot OR TDX_SW_ERROR directly to RAX as OR instruction
+ * only accepts 32-bit immediate at most.
+ */
+ movq $TDX_SW_ERROR, %rdi
+ orq %rdi, %rax
+
+.Lseamcall_fail\@:
+.if \ret && \saved
+ /* pop the unused structure pointer back to RSI */
+ popq %rsi
+.endif
+ jmp .Lout\@
+
+ _ASM_EXTABLE_FAULT(.Lseamcall\@, .Lseamcall_trap\@)
+.endif /* \host */
+
.endm
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 7ad91225f..9dd5490b3 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -33,13 +33,13 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
return 0xfd;
}
-static u32 xen_set_apic_id(unsigned int x)
+static u32 xen_set_apic_id(u32 x)
{
WARN_ON(1);
return x;
}
-static unsigned int xen_get_apic_id(unsigned long x)
+static u32 xen_get_apic_id(u32 x)
{
return ((x)>>24) & 0xFFu;
}
@@ -110,15 +110,15 @@ static int xen_madt_oem_check(char *oem_id, char *oem_table_id)
return xen_pv_domain();
}
-static int xen_phys_pkg_id(int initial_apic_id, int index_msb)
+static u32 xen_phys_pkg_id(u32 initial_apic_id, int index_msb)
{
return initial_apic_id >> index_msb;
}
-static int xen_cpu_present_to_apicid(int cpu)
+static u32 xen_cpu_present_to_apicid(int cpu)
{
if (cpu_present(cpu))
- return cpu_data(cpu).apicid;
+ return cpu_data(cpu).topo.apicid;
else
return BAD_APICID;
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 3c61bb98c..a01ca255b 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -6,6 +6,7 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/kexec.h>
+#include <linux/memblock.h>
#include <linux/slab.h>
#include <linux/panic_notifier.h>
@@ -350,3 +351,34 @@ void xen_arch_unregister_cpu(int num)
}
EXPORT_SYMBOL(xen_arch_unregister_cpu);
#endif
+
+/* Amount of extra memory space we add to the e820 ranges */
+struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
+
+void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns)
+{
+ unsigned int i;
+
+ /*
+ * No need to check for zero size, should happen rarely and will only
+ * write a new entry regarded to be unused due to zero size.
+ */
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ /* Add new region. */
+ if (xen_extra_mem[i].n_pfns == 0) {
+ xen_extra_mem[i].start_pfn = start_pfn;
+ xen_extra_mem[i].n_pfns = n_pfns;
+ break;
+ }
+ /* Append to existing region. */
+ if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
+ start_pfn) {
+ xen_extra_mem[i].n_pfns += n_pfns;
+ break;
+ }
+ }
+ if (i == XEN_EXTRA_MEM_MAX_REGIONS)
+ printk(KERN_WARNING "Warning: not enough extra memory regions\n");
+
+ memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
+}
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index ada3868c0..c28f073c1 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/acpi.h>
#include <linux/export.h>
+#include <linux/mm.h>
#include <xen/hvc-console.h>
@@ -72,3 +73,70 @@ void __init mem_map_via_hcall(struct boot_params *boot_params_p)
}
boot_params_p->e820_entries = memmap.nr_entries;
}
+
+/*
+ * Reserve e820 UNUSABLE regions to inflate the memory balloon.
+ *
+ * On PVH dom0 the host memory map is used, RAM regions available to dom0 are
+ * located as the same place as in the native memory map, but since dom0 gets
+ * less memory than the total amount of host RAM the ranges that can't be
+ * populated are converted from RAM -> UNUSABLE. Use such regions (up to the
+ * ratio signaled in EXTRA_MEM_RATIO) in order to inflate the balloon driver at
+ * boot. Doing so prevents the guest (even if just temporary) from using holes
+ * in the memory map in order to map grants or foreign addresses, and
+ * hopefully limits the risk of a clash with a device MMIO region. Ideally the
+ * hypervisor should notify us which memory ranges are suitable for creating
+ * foreign mappings, but that's not yet implemented.
+ */
+void __init xen_reserve_extra_memory(struct boot_params *bootp)
+{
+ unsigned int i, ram_pages = 0, extra_pages;
+
+ for (i = 0; i < bootp->e820_entries; i++) {
+ struct boot_e820_entry *e = &bootp->e820_table[i];
+
+ if (e->type != E820_TYPE_RAM)
+ continue;
+ ram_pages += PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr);
+ }
+
+ /* Max amount of extra memory. */
+ extra_pages = EXTRA_MEM_RATIO * ram_pages;
+
+ /*
+ * Convert UNUSABLE ranges to RAM and reserve them for foreign mapping
+ * purposes.
+ */
+ for (i = 0; i < bootp->e820_entries && extra_pages; i++) {
+ struct boot_e820_entry *e = &bootp->e820_table[i];
+ unsigned long pages;
+
+ if (e->type != E820_TYPE_UNUSABLE)
+ continue;
+
+ pages = min(extra_pages,
+ PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr));
+
+ if (pages != (PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr))) {
+ struct boot_e820_entry *next;
+
+ if (bootp->e820_entries ==
+ ARRAY_SIZE(bootp->e820_table))
+ /* No space left to split - skip region. */
+ continue;
+
+ /* Split entry. */
+ next = e + 1;
+ memmove(next, e,
+ (bootp->e820_entries - i) * sizeof(*e));
+ bootp->e820_entries++;
+ next->addr = PAGE_ALIGN(e->addr) + PFN_PHYS(pages);
+ e->size = next->addr - e->addr;
+ next->size -= e->size;
+ }
+ e->type = E820_TYPE_RAM;
+ extra_pages -= pages;
+
+ xen_add_extra_mem(PFN_UP(e->addr), pages);
+ }
+}
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 6092fea7d..39982f955 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -45,7 +45,7 @@ static const typeof(pv_ops) xen_irq_ops __initconst = {
/* Initial interrupt flag handling only called while interrupts off. */
.save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0),
.irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop),
- .irq_enable = __PV_IS_CALLEE_SAVE(paravirt_BUG),
+ .irq_enable = __PV_IS_CALLEE_SAVE(BUG_func),
.safe_halt = xen_safe_halt,
.halt = xen_halt,
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index b6830554f..994aef447 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -34,7 +34,7 @@
* would need to validate the whole pagetable before going on.
* Naturally, this is quite slow. The solution is to "pin" a
* pagetable, which enforces all the constraints on the pagetable even
- * when it is not actively in use. This menas that Xen can be assured
+ * when it is not actively in use. This means that Xen can be assured
* that it is still valid when you do load it into %cr3, and doesn't
* need to revalidate it.
*
@@ -1082,7 +1082,7 @@ static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
pmd_t *pmd_tbl;
int i;
- if (pud_large(*pud)) {
+ if (pud_leaf(*pud)) {
pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PUD_SIZE);
return;
@@ -1863,7 +1863,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
if (!pud_present(pud))
return 0;
pa = pud_val(pud) & PTE_PFN_MASK;
- if (pud_large(pud))
+ if (pud_leaf(pud))
return pa + (vaddr & ~PUD_MASK);
pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b3e379610..380591028 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -38,9 +38,6 @@
#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
-/* Amount of extra memory space we add to the e820 ranges */
-struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
-
/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
@@ -64,18 +61,6 @@ static struct {
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
-/*
- * The maximum amount of extra memory compared to the base size. The
- * main scaling factor is the size of struct page. At extreme ratios
- * of base:extra, all the base memory can be filled with page
- * structures for the extra memory, leaving no space for anything
- * else.
- *
- * 10x seems like a reasonable balance between scaling flexibility and
- * leaving a practically usable system.
- */
-#define EXTRA_MEM_RATIO (10)
-
static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
static void __init xen_parse_512gb(void)
@@ -96,35 +81,6 @@ static void __init xen_parse_512gb(void)
xen_512gb_limit = val;
}
-static void __init xen_add_extra_mem(unsigned long start_pfn,
- unsigned long n_pfns)
-{
- int i;
-
- /*
- * No need to check for zero size, should happen rarely and will only
- * write a new entry regarded to be unused due to zero size.
- */
- for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
- /* Add new region. */
- if (xen_extra_mem[i].n_pfns == 0) {
- xen_extra_mem[i].start_pfn = start_pfn;
- xen_extra_mem[i].n_pfns = n_pfns;
- break;
- }
- /* Append to existing region. */
- if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
- start_pfn) {
- xen_extra_mem[i].n_pfns += n_pfns;
- break;
- }
- }
- if (i == XEN_EXTRA_MEM_MAX_REGIONS)
- printk(KERN_WARNING "Warning: not enough extra memory regions\n");
-
- memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
-}
-
static void __init xen_del_extra_mem(unsigned long start_pfn,
unsigned long n_pfns)
{
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 4b0d6fff8..1fb9a1644 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -65,6 +65,8 @@ int xen_smp_intr_init(unsigned int cpu)
char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
+ if (!resched_name)
+ goto fail_mem;
per_cpu(xen_resched_irq, cpu).name = resched_name;
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
@@ -77,6 +79,8 @@ int xen_smp_intr_init(unsigned int cpu)
per_cpu(xen_resched_irq, cpu).irq = rc;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
+ if (!callfunc_name)
+ goto fail_mem;
per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
@@ -90,6 +94,9 @@ int xen_smp_intr_init(unsigned int cpu)
if (!xen_fifo_events) {
debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
+ if (!debug_name)
+ goto fail_mem;
+
per_cpu(xen_debug_irq, cpu).name = debug_name;
rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu,
xen_debug_interrupt,
@@ -101,6 +108,9 @@ int xen_smp_intr_init(unsigned int cpu)
}
callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
+ if (!callfunc_name)
+ goto fail_mem;
+
per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
cpu,
@@ -114,6 +124,8 @@ int xen_smp_intr_init(unsigned int cpu)
return 0;
+ fail_mem:
+ rc = -ENOMEM;
fail:
xen_smp_intr_free(cpu);
return rc;
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index a87ab3688..79cf93f2c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -163,4 +163,18 @@ void xen_hvm_post_suspend(int suspend_cancelled);
static inline void xen_hvm_post_suspend(int suspend_cancelled) {}
#endif
+/*
+ * The maximum amount of extra memory compared to the base size. The
+ * main scaling factor is the size of struct page. At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ *
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define EXTRA_MEM_RATIO (10)
+
+void xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns);
+
#endif /* XEN_OPS_H */