author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-08-07 13:11:22 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-08-07 13:11:22 +0000
commit    | b20732900e4636a467c0183a47f7396700f5f743 (patch)
tree      | 42f079ff82e701ebcb76829974b4caca3e5b6798 /arch/x86/kernel
parent    | Adding upstream version 6.8.12. (diff)
download  | linux-b20732900e4636a467c0183a47f7396700f5f743.tar.xz, linux-b20732900e4636a467c0183a47f7396700f5f743.zip
Adding upstream version 6.9.7. (tag: upstream/6.9.7)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/kernel')
104 files changed, 3228 insertions(+), 2285 deletions(-)
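Before the per-file hunks, a brief illustration of the MWAIT C-state masking change made in the arch/x86/kernel/acpi/cstate.c hunk below: the added trailing "& MWAIT_CSTATE_MASK" keeps the derived C-state type within the 4-bit hint field even if firmware supplies an out-of-range FFH address. The sketch is a stand-alone user-space reconstruction; the constant values mirror asm/mwait.h from memory and the inputs are made up, so treat both as assumptions rather than authoritative definitions.

```c
#include <stdio.h>
#include <stdint.h>

/* Assumed values, mirroring asm/mwait.h; not part of this diff. */
#define MWAIT_SUBSTATE_SIZE	4
#define MWAIT_SUBSTATE_MASK	0xf
#define MWAIT_CSTATE_MASK	0xf

int main(void)
{
	uint32_t address = 0x30;	/* hypothetical _CST FFH address */
	uint32_t edx = 0x00022220;	/* hypothetical CPUID.5 EDX value */

	/*
	 * New form from the cstate.c hunk: the trailing mask bounds
	 * cstate_type to the 4-bit MWAIT hint C-state field.
	 */
	uint32_t cstate_type = (((address >> MWAIT_SUBSTATE_SIZE) &
				 MWAIT_CSTATE_MASK) + 1) & MWAIT_CSTATE_MASK;
	uint32_t edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
	uint32_t num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;

	printf("cstate_type=%u sub-states=%u\n",
	       (unsigned int)cstate_type, (unsigned int)num_cstate_subtype);
	return 0;
}
```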
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0000325ab9..74077694da 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -33,6 +33,7 @@ KASAN_SANITIZE_sev.o := n KCSAN_SANITIZE := n KMSAN_SANITIZE_head$(BITS).o := n KMSAN_SANITIZE_nmi.o := n +KMSAN_SANITIZE_sev.o := n # If instrumentation of the following files is enabled, boot hangs during # first second. @@ -48,6 +49,7 @@ obj-y += platform-quirks.o obj-y += process_$(BITS).o signal.o signal_$(BITS).o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o +obj-$(CONFIG_X86_FRED) += fred.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o obj-y += setup.o x86_init.o i8259.o irqinit.o @@ -98,11 +100,11 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_RETHOOK) += rethook.o -obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o -obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o +obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o crash.o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_X86_32) += doublefault_32.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 85a3ce2a36..4bf82dbd2a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -164,35 +164,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) return 0; } -/** - * acpi_register_lapic - register a local apic and generates a logic cpu number - * @id: local apic id to register - * @acpiid: ACPI id to register - * @enabled: this cpu is enabled or not - * - * Returns the logic cpu number which maps to the local apic - */ -static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) -{ - int cpu; - - if (id >= MAX_LOCAL_APIC) { - pr_info("skipped apicid that is too big\n"); - return -EINVAL; - } - - if (!enabled) { - ++disabled_cpus; - return -EINVAL; - } - - cpu = generic_processor_info(id); - if (cpu >= 0) - early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; - - return cpu; -} - static bool __init acpi_is_processor_usable(u32 lapic_flags) { if (lapic_flags & ACPI_MADT_ENABLED) @@ -254,7 +225,7 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) return 0; } - acpi_register_lapic(apic_id, processor->uid, enabled); + topology_register_apic(apic_id, processor->uid, enabled); #else pr_warn("x2apic entry ignored\n"); #endif @@ -289,9 +260,9 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. 
*/ - acpi_register_lapic(processor->id, /* APIC ID */ - processor->processor_id, /* ACPI ID */ - processor->lapic_flags & ACPI_MADT_ENABLED); + topology_register_apic(processor->id, /* APIC ID */ + processor->processor_id, /* ACPI ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); has_lapic_cpus = true; return 0; @@ -309,9 +280,9 @@ acpi_parse_sapic(union acpi_subtable_headers *header, const unsigned long end) acpi_table_print_madt_entry(&header->common); - acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */ - processor->processor_id, /* ACPI ID */ - processor->lapic_flags & ACPI_MADT_ENABLED); + topology_register_apic((processor->id << 8) | processor->eid,/* APIC ID */ + processor->processor_id, /* ACPI ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); return 0; } @@ -844,12 +815,10 @@ static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) return 0; } -int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, - int *pcpu) +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu) { - int cpu; + int cpu = topology_hotplug_apic(physid, acpi_id); - cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED); if (cpu < 0) { pr_info("Unable to map lapic to logical cpu number\n"); return cpu; @@ -868,15 +837,11 @@ int acpi_unmap_cpu(int cpu) #ifdef CONFIG_ACPI_NUMA set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); #endif - - per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; - set_cpu_present(cpu, false); - num_processors--; - - return (0); + topology_hotunplug_apic(cpu); + return 0; } EXPORT_SYMBOL(acpi_unmap_cpu); -#endif /* CONFIG_ACPI_HOTPLUG_CPU */ +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) { diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 401808b47a..f3ffd0a3a0 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -131,8 +131,8 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); /* Check whether this particular cx_type (in CST) is supported or not */ - cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) & - MWAIT_CSTATE_MASK) + 1; + cstate_type = (((cx->address >> MWAIT_SUBSTATE_SIZE) & + MWAIT_CSTATE_MASK) + 1) & MWAIT_CSTATE_MASK; edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE); num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index d5d8a352ea..94ff83f3d3 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -17,7 +17,7 @@ * Hooray, we are in Long 64-bit mode (but still running in low memory) */ SYM_FUNC_START(wakeup_long64) - movq saved_magic, %rax + movq saved_magic(%rip), %rax movq $0x123456789abcdef0, %rdx cmpq %rdx, %rax je 2f @@ -33,14 +33,14 @@ SYM_FUNC_START(wakeup_long64) movw %ax, %es movw %ax, %fs movw %ax, %gs - movq saved_rsp, %rsp + movq saved_rsp(%rip), %rsp - movq saved_rbx, %rbx - movq saved_rdi, %rdi - movq saved_rsi, %rsi - movq saved_rbp, %rbp + movq saved_rbx(%rip), %rbx + movq saved_rdi(%rip), %rdi + movq saved_rsi(%rip), %rsi + movq saved_rbp(%rip), %rbp - movq saved_rip, %rax + movq saved_rip(%rip), %rax ANNOTATE_RETPOLINE_SAFE jmp *%rax SYM_FUNC_END(wakeup_long64) @@ -72,11 +72,11 @@ SYM_FUNC_START(do_suspend_lowlevel) movq $.Lresume_point, saved_rip(%rip) - movq %rsp, saved_rsp - movq %rbp, saved_rbp - movq %rbx, saved_rbx - movq %rdi, saved_rdi - movq %rsi, 
saved_rsi + movq %rsp, saved_rsp(%rip) + movq %rbp, saved_rbp(%rip) + movq %rbx, saved_rbx(%rip) + movq %rdi, saved_rdi(%rip) + movq %rsi, saved_rsi(%rip) addq $8, %rsp movl $3, %edi diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1d85cb7071..45a280f216 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(alternatives_patched); #define DA_ENDBR 0x08 #define DA_SMP 0x10 -static unsigned int __initdata_or_module debug_alternative; +static unsigned int debug_alternative; static int __init debug_alt(char *str) { @@ -133,7 +133,7 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and * *jump* over instead of executing long and daft NOPs. */ -static void __init_or_module add_nop(u8 *instr, unsigned int len) +static void add_nop(u8 *instr, unsigned int len) { u8 *target = instr + len; @@ -206,7 +206,7 @@ static int skip_nops(u8 *instr, int offset, int len) * Optimize a sequence of NOPs, possibly preceded by an unconditional jump * to the end of the NOP sequence into a single NOP. */ -static bool __init_or_module +static bool __optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target) { int i = *next - insn->length; @@ -335,8 +335,7 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len) return (target < src || target > src + src_len); } -static void __init_or_module noinline -apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) +void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) { int prev, target = 0; @@ -545,7 +544,7 @@ static inline bool is_jcc32(struct insn *insn) return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; } -#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) +#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL) /* * CALL/JMP *%\reg @@ -709,8 +708,8 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) /* * The compiler is supposed to EMIT an INT3 after every unconditional * JMP instruction due to AMD BTC. However, if the compiler is too old - * or SLS isn't enabled, we still need an INT3 after indirect JMPs - * even on Intel. + * or MITIGATION_SLS isn't enabled, we still need an INT3 after + * indirect JMPs even on Intel. */ if (op == JMP32_INSN_OPCODE && i < insn->length) bytes[i++] = INT3_INSN_OPCODE; @@ -770,7 +769,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) } } -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK /* * Rewrite the compiler generated return thunk tail-calls. @@ -843,14 +842,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) } #else void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } -#endif /* CONFIG_RETHUNK */ +#endif /* CONFIG_MITIGATION_RETHUNK */ -#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ +#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } -#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ +#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT @@ -1805,7 +1804,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) * restoring the previous mm. 
*/ if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) - leave_mm(smp_processor_id()); + leave_mm(); temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); switch_mm_irqs_off(NULL, mm, current); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 053f6dcc6b..027a8c7a2c 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -95,6 +95,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) }, {} @@ -214,7 +215,14 @@ out: int amd_smn_read(u16 node, u32 address, u32 *value) { - return __amd_smn_rw(node, address, value, false); + int err = __amd_smn_rw(node, address, value, false); + + if (PCI_POSSIBLE_ERROR(*value)) { + err = -ENODEV; + *value = 0; + } + + return err; } EXPORT_SYMBOL_GPL(amd_smn_read); @@ -386,7 +394,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res) int amd_get_subcaches(int cpu) { - struct pci_dev *link = node_to_amd_nb(topology_die_id(cpu))->link; + struct pci_dev *link = node_to_amd_nb(topology_amd_node_id(cpu))->link; unsigned int mask; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) @@ -400,7 +408,7 @@ int amd_get_subcaches(int cpu) int amd_set_subcaches(int cpu, unsigned long mask) { static unsigned int reset, ban; - struct amd_northbridge *nb = node_to_amd_nb(topology_die_id(cpu)); + struct amd_northbridge *nb = node_to_amd_nb(topology_amd_node_id(cpu)); unsigned int reg; int cuid; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a04a5163e5..803dcfb0e3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -19,6 +19,7 @@ #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/acpi_pmtmr.h> +#include <linux/bitmap.h> #include <linux/clockchips.h> #include <linux/interrupt.h> #include <linux/memblock.h> @@ -67,10 +68,6 @@ #include "local.h" -unsigned int num_processors; - -unsigned disabled_cpus; - /* Processor that is doing the boot up */ u32 boot_cpu_physical_apicid __ro_after_init = BAD_APICID; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); @@ -78,18 +75,6 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); u8 boot_cpu_apic_version __ro_after_init; /* - * Bitmask of physically existing CPUs: - */ -physid_mask_t phys_cpu_present_map; - -/* - * Processor to be disabled specified by kernel parameter - * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to - * avoid undefined behaviour caused by sending INIT from AP to BSP. - */ -static u32 disabled_cpu_apicid __ro_after_init = BAD_APICID; - -/* * This variable controls which CPUs receive external NMIs. By default, * external NMIs are delivered only to the BSP. 
*/ @@ -108,14 +93,6 @@ static inline bool apic_accessible(void) return x2apic_mode || apic_mmio_base; } -/* - * Map cpu index to physical APIC ID - */ -DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID); -DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); - #ifdef CONFIG_X86_32 /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase __ro_after_init; @@ -261,16 +238,6 @@ u64 native_apic_icr_read(void) return icr1 | ((u64)icr2 << 32); } -#ifdef CONFIG_X86_32 -/** - * get_physical_broadcast - Get number of physical broadcast IDs - */ -int get_physical_broadcast(void) -{ - return modern_apic() ? 0xff : 0xf; -} -#endif - /** * lapic_get_maxlvt - get the maximum number of local vector table entries */ @@ -1549,9 +1516,6 @@ static void setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - /* Validate that the APIC is registered if required */ - BUG_ON(apic->apic_id_registered && !apic->apic_id_registered()); - /* * Intel recommends to set DFR, LDR and TPR before enabling * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel @@ -1690,8 +1654,6 @@ void apic_ap_setup(void) end_local_APIC_setup(); } -static __init void cpu_set_boot_apic(void); - static __init void apic_read_boot_cpu_id(bool x2apic) { /* @@ -1706,7 +1668,8 @@ static __init void apic_read_boot_cpu_id(bool x2apic) boot_cpu_physical_apicid = read_apic_id(); boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } - cpu_set_boot_apic(); + topology_register_boot_apic(boot_cpu_physical_apicid); + x86_32_probe_bigsmp_early(); } #ifdef CONFIG_X86_X2APIC @@ -2096,7 +2059,6 @@ void __init init_apic_mappings(void) pr_info("APIC: disable apic facility\n"); apic_disable(); } - num_processors = 1; } } @@ -2311,155 +2273,6 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT1, value); } -/* - * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated - * contiguously, it equals to current allocated max logical CPU ID plus 1. - * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range, - * so the maximum of nr_logical_cpuids is nr_cpu_ids. - * - * NOTE: Reserve 0 for BSP. - */ -static int nr_logical_cpuids = 1; - -/* - * Used to store mapping between logical CPU IDs and APIC IDs. - */ -u32 cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = BAD_APICID, }; - -bool arch_match_cpu_phys_id(int cpu, u64 phys_id) -{ - return phys_id == (u64)cpuid_to_apicid[cpu]; -} - -#ifdef CONFIG_SMP -static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) -{ - /* Isolate the SMT bit(s) in the APICID and check for 0 */ - u32 mask = (1U << (fls(smp_num_siblings) - 1)) - 1; - - if (smp_num_siblings == 1 || !(apicid & mask)) - cpumask_set_cpu(cpu, &__cpu_primary_thread_mask); -} - -/* - * Due to the utter mess of CPUID evaluation smp_num_siblings is not valid - * during early boot. Initialize the primary thread mask before SMP - * bringup. - */ -static int __init smp_init_primary_thread_mask(void) -{ - unsigned int cpu; - - /* - * XEN/PV provides either none or useless topology information. - * Pretend that all vCPUs are primary threads. 
- */ - if (xen_pv_domain()) { - cpumask_copy(&__cpu_primary_thread_mask, cpu_possible_mask); - return 0; - } - - for (cpu = 0; cpu < nr_logical_cpuids; cpu++) - cpu_mark_primary_thread(cpu, cpuid_to_apicid[cpu]); - return 0; -} -early_initcall(smp_init_primary_thread_mask); -#else -static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { } -#endif - -/* - * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids - * and cpuid_to_apicid[] synchronized. - */ -static int allocate_logical_cpuid(int apicid) -{ - int i; - - /* - * cpuid <-> apicid mapping is persistent, so when a cpu is up, - * check if the kernel has allocated a cpuid for it. - */ - for (i = 0; i < nr_logical_cpuids; i++) { - if (cpuid_to_apicid[i] == apicid) - return i; - } - - /* Allocate a new cpuid. */ - if (nr_logical_cpuids >= nr_cpu_ids) { - WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. " - "Processor %d/0x%x and the rest are ignored.\n", - nr_cpu_ids, nr_logical_cpuids, apicid); - return -EINVAL; - } - - cpuid_to_apicid[nr_logical_cpuids] = apicid; - return nr_logical_cpuids++; -} - -static void cpu_update_apic(int cpu, u32 apicid) -{ -#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) - early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; -#endif - set_cpu_possible(cpu, true); - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - num_processors++; - - if (system_state != SYSTEM_BOOTING) - cpu_mark_primary_thread(cpu, apicid); -} - -static __init void cpu_set_boot_apic(void) -{ - cpuid_to_apicid[0] = boot_cpu_physical_apicid; - cpu_update_apic(0, boot_cpu_physical_apicid); - x86_32_probe_bigsmp_early(); -} - -int generic_processor_info(int apicid) -{ - int cpu, max = nr_cpu_ids; - - /* The boot CPU must be set before MADT/MPTABLE parsing happens */ - if (cpuid_to_apicid[0] == BAD_APICID) - panic("Boot CPU APIC not registered yet\n"); - - if (apicid == boot_cpu_physical_apicid) - return 0; - - if (disabled_cpu_apicid == apicid) { - int thiscpu = num_processors + disabled_cpus; - - pr_warn("APIC: Disabling requested cpu. Processor %d/0x%x ignored.\n", - thiscpu, apicid); - - disabled_cpus++; - return -ENODEV; - } - - if (num_processors >= nr_cpu_ids) { - int thiscpu = max + disabled_cpus; - - pr_warn("APIC: NR_CPUS/possible_cpus limit of %i reached. 
" - "Processor %d/0x%x ignored.\n", max, thiscpu, apicid); - - disabled_cpus++; - return -EINVAL; - } - - cpu = allocate_logical_cpuid(apicid); - if (cpu < 0) { - disabled_cpus++; - return -EINVAL; - } - - cpu_update_apic(cpu, apicid); - return cpu; -} - - void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, bool dmar) { @@ -2502,10 +2315,7 @@ EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); static void __init apic_bsp_up_setup(void) { -#ifdef CONFIG_X86_64 - apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid)); -#endif - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); + reset_phys_cpu_present_map(boot_cpu_physical_apicid); } /** @@ -2851,15 +2661,6 @@ static int __init lapic_insert_resource(void) */ late_initcall(lapic_insert_resource); -static int __init apic_set_disabled_cpu_apicid(char *arg) -{ - if (!arg || !get_option(&arg, &disabled_cpu_apicid)) - return -EINVAL; - - return 0; -} -early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); - static int __init apic_set_extnmi(char *arg) { if (!arg) diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 8a00141073..9ef3be8668 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -18,16 +18,6 @@ u32 apic_flat_calc_apicid(unsigned int cpu) return 1U << cpu; } -bool default_check_apicid_used(physid_mask_t *map, u32 apicid) -{ - return physid_isset(apicid, *map); -} - -void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ - *retmap = *phys_map; -} - u32 default_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) @@ -37,11 +27,6 @@ u32 default_cpu_present_to_apicid(int mps_cpu) } EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid); -bool default_apic_id_registered(void) -{ - return physid_isset(read_apic_id(), phys_cpu_present_map); -} - /* * Set up the logical destination ID when the APIC operates in logical * destination mode. 
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index b295a056a4..f37ad3392f 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -61,16 +61,6 @@ static u32 flat_get_apic_id(u32 x) return (x >> 24) & 0xFF; } -static u32 set_apic_id(u32 id) -{ - return (id & 0xFF) << 24; -} - -static u32 flat_phys_pkg_id(u32 initial_apic_id, int index_msb) -{ - return initial_apic_id >> index_msb; -} - static int flat_probe(void) { return 1; @@ -80,7 +70,6 @@ static struct apic apic_flat __ro_after_init = { .name = "flat", .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, - .apic_id_registered = default_apic_id_registered, .dest_mode_logical = true, @@ -88,11 +77,9 @@ static struct apic apic_flat __ro_after_init = { .init_apic_ldr = default_init_apic_ldr, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = flat_phys_pkg_id, .max_apic_id = 0xFE, .get_apic_id = flat_get_apic_id, - .set_apic_id = set_apic_id, .calc_dest_apicid = apic_flat_calc_apicid, @@ -151,18 +138,15 @@ static struct apic apic_physflat __ro_after_init = { .name = "physical flat", .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, - .apic_id_registered = default_apic_id_registered, .dest_mode_logical = false, .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = flat_phys_pkg_id, .max_apic_id = 0xFE, .get_apic_id = flat_get_apic_id, - .set_apic_id = set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 9f1d553eb4..b5bb7a2e83 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -29,7 +29,6 @@ static void noop_send_IPI_self(int vector) { } static void noop_apic_icr_write(u32 low, u32 id) { } static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { return -1; } static u64 noop_apic_icr_read(void) { return 0; } -static u32 noop_phys_pkg_id(u32 cpuid_apic, int index_msb) { return 0; } static u32 noop_get_apic_id(u32 apicid) { return 0; } static void noop_apic_eoi(void) { } @@ -51,12 +50,8 @@ struct apic apic_noop __ro_after_init = { .disable_esr = 0, - .check_apicid_used = default_check_apicid_used, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = noop_phys_pkg_id, - .max_apic_id = 0xFE, .get_apic_id = noop_get_apic_id, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 7d0c51b9d3..16410f087b 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -38,11 +38,6 @@ static u32 numachip1_get_apic_id(u32 x) return id; } -static u32 numachip1_set_apic_id(u32 id) -{ - return (id & 0xff) << 24; -} - static u32 numachip2_get_apic_id(u32 x) { u64 mcfg; @@ -51,16 +46,6 @@ static u32 numachip2_get_apic_id(u32 x) return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); } -static u32 numachip2_set_apic_id(u32 id) -{ - return id << 24; -} - -static u32 numachip_phys_pkg_id(u32 initial_apic_id, int index_msb) -{ - return initial_apic_id >> index_msb; -} - static void numachip1_apic_icr_write(int apicid, unsigned int val) { write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val); @@ -227,11 +212,9 @@ static const struct apic apic_numachip1 __refconst = { .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = numachip_phys_pkg_id, .max_apic_id = UINT_MAX, 
.get_apic_id = numachip1_get_apic_id, - .set_apic_id = numachip1_set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, @@ -263,11 +246,9 @@ static const struct apic apic_numachip2 __refconst = { .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = numachip_phys_pkg_id, .max_apic_id = UINT_MAX, .get_apic_id = numachip2_get_apic_id, - .set_apic_id = numachip2_set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 5a0d60b38e..9285d500d5 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -18,22 +18,6 @@ static u32 bigsmp_get_apic_id(u32 x) return (x >> 24) & 0xFF; } -static bool bigsmp_check_apicid_used(physid_mask_t *map, u32 apicid) -{ - return false; -} - -static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ - /* For clustered we don't have a good way to do this yet - hack */ - physids_promote(0xFFL, retmap); -} - -static u32 bigsmp_phys_pkg_id(u32 cpuid_apic, int index_msb) -{ - return cpuid_apic >> index_msb; -} - static void bigsmp_send_IPI_allbutself(int vector) { default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -84,14 +68,10 @@ static struct apic apic_bigsmp __ro_after_init = { .disable_esr = 1, - .check_apicid_used = bigsmp_check_apicid_used, - .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = bigsmp_phys_pkg_id, .max_apic_id = 0xFE, .get_apic_id = bigsmp_get_apic_id, - .set_apic_id = NULL, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 40c7cf180c..477b740b2f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1458,20 +1458,20 @@ void restore_boot_irq_mode(void) * * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 */ -void __init setup_ioapic_ids_from_mpc_nocheck(void) +static void __init setup_ioapic_ids_from_mpc_nocheck(void) { + DECLARE_BITMAP(phys_id_present_map, MAX_LOCAL_APIC); + const u32 broadcast_id = 0xF; union IO_APIC_reg_00 reg_00; - physid_mask_t phys_id_present_map; - int ioapic_idx; - int i; unsigned char old_id; unsigned long flags; + int ioapic_idx, i; /* * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. */ - apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); + copy_phys_cpu_present_map(phys_id_present_map); /* * Set the IOAPIC ID to the value stored in the MPC table. @@ -1484,11 +1484,10 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) old_id = mpc_ioapic_id(ioapic_idx); - if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - ioapic_idx, mpc_ioapic_id(ioapic_idx)); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - reg_00.bits.ID); + if (mpc_ioapic_id(ioapic_idx) >= broadcast_id) { + pr_err(FW_BUG "IO-APIC#%d ID is %d in the MPC table!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + pr_err("... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID); ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID; } @@ -1497,23 +1496,21 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) * system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. 
*/ - if (apic->check_apicid_used(&phys_id_present_map, - mpc_ioapic_id(ioapic_idx))) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - ioapic_idx, mpc_ioapic_id(ioapic_idx)); - for (i = 0; i < get_physical_broadcast(); i++) - if (!physid_isset(i, phys_id_present_map)) + if (test_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map)) { + pr_err(FW_BUG "IO-APIC#%d ID %d is already used!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + for (i = 0; i < broadcast_id; i++) + if (!test_bit(i, phys_id_present_map)) break; - if (i >= get_physical_broadcast()) + if (i >= broadcast_id) panic("Max APIC ID exceeded!\n"); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - i); - physid_set(i, phys_id_present_map); + pr_err("... fixing up to %d. (tell your hw vendor)\n", i); + set_bit(i, phys_id_present_map); ioapics[ioapic_idx].mp_config.apicid = i; } else { apic_printk(APIC_VERBOSE, "Setting %d in the phys_id_present_map\n", mpc_ioapic_id(ioapic_idx)); - physid_set(mpc_ioapic_id(ioapic_idx), phys_id_present_map); + set_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map); } /* @@ -2209,7 +2206,7 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { - panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC"); + panic_if_irq_remap(FW_BUG "Timer not connected to IO-APIC"); pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -2354,7 +2351,7 @@ static int mp_irqdomain_create(int ioapic) fwspec.param_count = 1; fwspec.param[0] = mpc_ioapic_id(ioapic); - parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_GENERIC_MSI); if (!parent) { if (!cfg->dev) irq_domain_free_fwnode(fn); @@ -2494,56 +2491,41 @@ unsigned int arch_dynirq_lower_bound(unsigned int from) #ifdef CONFIG_X86_32 static int io_apic_get_unique_id(int ioapic, int apic_id) { + static DECLARE_BITMAP(apic_id_map, MAX_LOCAL_APIC); + const u32 broadcast_id = 0xF; union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map = PHYSID_MASK_NONE; - physid_mask_t tmp; unsigned long flags; int i = 0; - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ - - if (physids_empty(apic_id_map)) - apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); + /* Initialize the ID map */ + if (bitmap_empty(apic_id_map, MAX_LOCAL_APIC)) + copy_phys_cpu_present_map(apic_id_map); raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - if (apic_id >= get_physical_broadcast()) { - printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); + if (apic_id >= broadcast_id) { + pr_warn("IOAPIC[%d]: Invalid apic_id %d, trying %d\n", + ioapic, apic_id, reg_00.bits.ID); apic_id = reg_00.bits.ID; } - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. 
- */ - if (apic->check_apicid_used(&apic_id_map, apic_id)) { - - for (i = 0; i < get_physical_broadcast(); i++) { - if (!apic->check_apicid_used(&apic_id_map, i)) + /* Every APIC in a system must have a unique ID */ + if (test_bit(apic_id, apic_id_map)) { + for (i = 0; i < broadcast_id; i++) { + if (!test_bit(i, apic_id_map)) break; } - if (i == get_physical_broadcast()) + if (i == broadcast_id) panic("Max apic_id exceeded!\n"); - printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - + pr_warn("IOAPIC[%d]: apic_id %d already used, trying %d\n", ioapic, apic_id, i); apic_id = i; } - physid_set_mask_of_physid(apic_id, &tmp); - physids_or(apic_id_map, apic_id_map, tmp); + set_bit(apic_id, apic_id_map); if (reg_00.bits.ID != apic_id) { reg_00.bits.ID = apic_id; @@ -2569,11 +2551,9 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) static u8 io_apic_unique_id(int idx, u8 id) { - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(boot_cpu_apic_version)) + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && !APIC_XAPIC(boot_cpu_apic_version)) return io_apic_get_unique_id(idx, id); - else - return id; + return id; } #else static u8 io_apic_unique_id(int idx, u8 id) diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index 9ea6186ea8..842fe28496 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -16,8 +16,6 @@ /* X2APIC */ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); u32 x2apic_get_apic_id(u32 id); -u32 x2apic_set_apic_id(u32 id); -u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb); void x2apic_send_IPI_all(int vector); void x2apic_send_IPI_allbutself(int vector); @@ -63,9 +61,6 @@ void default_send_IPI_allbutself(int vector); void default_send_IPI_all(int vector); void default_send_IPI_self(int vector); -bool default_apic_id_registered(void); -bool default_check_apicid_used(physid_mask_t *map, u32 apicid); - #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index c0f78059f0..f75ee345c0 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -18,11 +18,6 @@ #include "local.h" -static u32 default_phys_pkg_id(u32 cpuid_apic, int index_msb) -{ - return cpuid_apic >> index_msb; -} - static u32 default_get_apic_id(u32 x) { unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); @@ -43,17 +38,13 @@ static struct apic apic_default __ro_after_init = { .name = "default", .probe = probe_default, - .apic_id_registered = default_apic_id_registered, .dest_mode_logical = true, .disable_esr = 0, - .check_apicid_used = default_check_apicid_used, .init_apic_ldr = default_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = default_phys_pkg_id, .max_apic_id = 0xFE, .get_apic_id = default_get_apic_id, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 185738c727..29cd0543fc 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -1036,7 +1036,8 @@ static void __vector_schedule_cleanup(struct apic_chip_data *apicd) add_timer_on(&cl->timer, cpu); } } else { - apicd->prev_vector = 0; + pr_warn("IRQ %u schedule cleanup for offline CPU %u\n", apicd->irq, cpu); + 
free_moved_vector(apicd); } raw_spin_unlock(&vector_lock); } @@ -1073,6 +1074,7 @@ void irq_complete_move(struct irq_cfg *cfg) */ void irq_force_complete_move(struct irq_desc *desc) { + unsigned int cpu = smp_processor_id(); struct apic_chip_data *apicd; struct irq_data *irqd; unsigned int vector; @@ -1097,10 +1099,11 @@ void irq_force_complete_move(struct irq_desc *desc) goto unlock; /* - * If prev_vector is empty, no action required. + * If prev_vector is empty or the descriptor is neither currently + * nor previously on the outgoing CPU no action required. */ vector = apicd->prev_vector; - if (!vector) + if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu)) goto unlock; /* diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 28a7d3f231..567dbd2fe4 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -231,16 +231,12 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .disable_esr = 0, - .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = x2apic_phys_pkg_id, .max_apic_id = UINT_MAX, .x2apic_set_max_apicid = true, .get_apic_id = x2apic_get_apic_id, - .set_apic_id = x2apic_set_apic_id, .calc_dest_apicid = x2apic_calc_apicid, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 409815a406..12d4c35547 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -129,16 +129,6 @@ u32 x2apic_get_apic_id(u32 id) return id; } -u32 x2apic_set_apic_id(u32 id) -{ - return id; -} - -u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb) -{ - return initial_apicid >> index_msb; -} - static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", @@ -150,12 +140,10 @@ static struct apic apic_x2apic_phys __ro_after_init = { .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = x2apic_phys_pkg_id, .max_apic_id = UINT_MAX, .x2apic_set_max_apicid = true, .get_apic_id = x2apic_get_apic_id, - .set_apic_id = x2apic_set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f1766b18dc..7fef504ca5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -241,54 +241,20 @@ static void __init uv_tsc_check_sync(void) is_uv(UV3) ? 
sname.s3.field : \ undef) -/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ - -#define SMT_LEVEL 0 /* Leaf 0xb SMT level */ -#define INVALID_TYPE 0 /* Leaf 0xb sub-leaf types */ -#define SMT_TYPE 1 -#define CORE_TYPE 2 -#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) -#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) - -static void set_x2apic_bits(void) -{ - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int sid_shift; - - cpuid(0, &eax, &ebx, &ecx, &edx); - if (eax < 0xb) { - pr_info("UV: CPU does not have CPUID.11\n"); - return; - } - - cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) { - pr_info("UV: CPUID.11 not implemented\n"); - return; - } - - sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); - sub_index = 1; - do { - cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); - if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { - sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); - break; - } - sub_index++; - } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); - - uv_cpuid.apicid_shift = 0; - uv_cpuid.apicid_mask = (~(-1 << sid_shift)); - uv_cpuid.socketid_shift = sid_shift; -} - static void __init early_get_apic_socketid_shift(void) { + unsigned int sid_shift = topology_get_domain_shift(TOPO_PKG_DOMAIN); + if (is_uv2_hub() || is_uv3_hub()) uvh_apicid.v = uv_early_read_mmr(UVH_APICID); - set_x2apic_bits(); + if (sid_shift) { + uv_cpuid.apicid_shift = 0; + uv_cpuid.apicid_mask = (~(-1 << sid_shift)); + uv_cpuid.socketid_shift = sid_shift; + } else { + pr_info("UV: CPU does not have valid CPUID.11\n"); + } pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); @@ -779,21 +745,6 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } -static u32 set_apic_id(u32 id) -{ - return id; -} - -static unsigned int uv_read_apic_id(void) -{ - return x2apic_get_apic_id(apic_read(APIC_ID)); -} - -static u32 uv_phys_pkg_id(u32 initial_apicid, int index_msb) -{ - return uv_read_apic_id() >> index_msb; -} - static int uv_probe(void) { return apic == &apic_x2apic_uv_x; @@ -810,11 +761,9 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = uv_phys_pkg_id, .max_apic_id = UINT_MAX, .get_apic_id = x2apic_get_apic_id, - .set_apic_id = set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 76a5ced278..b37ab10957 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1055,35 +1055,6 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) return APM_SUCCESS; } -#if 0 -static int apm_get_battery_status(u_short which, u_short *status, - u_short *bat, u_short *life, u_short *nbat) -{ - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - - if (apm_info.connection_version < 0x0102) { - /* pretend we only have one battery. 
*/ - if (which != 1) - return APM_BAD_DEVICE; - *nbat = 1; - return apm_get_power_status(status, bat, life); - } - - if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax, - &ebx, &ecx, &edx, &esi)) - return (eax >> 8) & 0xff; - *status = ebx; - *bat = ecx; - *life = edx; - *nbat = esi; - return APM_SUCCESS; -} -#endif - /** * apm_engage_power_management - enable PM on a device * @device: identity of device diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 6913b372cc..a98020bf31 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -109,7 +109,7 @@ static void __used common(void) OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); OFFSET(X86_current_task, pcpu_hot, current_task); -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING OFFSET(X86_call_depth, pcpu_hot, call_depth); #endif #if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 64ad2ddea1..e92ff0c11d 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -24,6 +24,8 @@ static int __initdata_or_module debug_callthunks; +#define MAX_PATCH_LEN (255-1) + #define prdbg(fmt, args...) \ do { \ if (debug_callthunks) \ @@ -42,8 +44,8 @@ DEFINE_PER_CPU(u64, __x86_call_count); DEFINE_PER_CPU(u64, __x86_ret_count); DEFINE_PER_CPU(u64, __x86_stuffs_count); DEFINE_PER_CPU(u64, __x86_ctxsw_count); -EXPORT_SYMBOL_GPL(__x86_ctxsw_count); -EXPORT_SYMBOL_GPL(__x86_call_count); +EXPORT_PER_CPU_SYMBOL_GPL(__x86_ctxsw_count); +EXPORT_PER_CPU_SYMBOL_GPL(__x86_call_count); #endif extern s32 __call_sites[], __call_sites_end[]; @@ -179,10 +181,15 @@ static const u8 nops[] = { static void *patch_dest(void *dest, bool direct) { unsigned int tsize = SKL_TMPL_SIZE; + u8 insn_buff[MAX_PATCH_LEN]; u8 *pad = dest - tsize; + memcpy(insn_buff, skl_call_thunk_template, tsize); + apply_relocation(insn_buff, tsize, pad, + skl_call_thunk_template, tsize); + /* Already patched? 
*/ - if (!bcmp(pad, skl_call_thunk_template, tsize)) + if (!bcmp(pad, insn_buff, tsize)) return pad; /* Ensure there are nops */ @@ -192,9 +199,9 @@ static void *patch_dest(void *dest, bool direct) } if (direct) - memcpy(pad, skl_call_thunk_template, tsize); + memcpy(pad, insn_buff, tsize); else - text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true); + text_poke_copy_locked(pad, insn_buff, tsize, true); return pad; } @@ -290,20 +297,27 @@ void *callthunks_translate_call_dest(void *dest) static bool is_callthunk(void *addr) { unsigned int tmpl_size = SKL_TMPL_SIZE; - void *tmpl = skl_call_thunk_template; + u8 insn_buff[MAX_PATCH_LEN]; unsigned long dest; + u8 *pad; dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT); if (!thunks_initialized || skip_addr((void *)dest)) return false; - return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size); + pad = (void *)(dest - tmpl_size); + + memcpy(insn_buff, skl_call_thunk_template, tmpl_size); + apply_relocation(insn_buff, tmpl_size, pad, + skl_call_thunk_template, tmpl_size); + + return !bcmp(pad, insn_buff, tmpl_size); } -int x86_call_depth_emit_accounting(u8 **pprog, void *func) +int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip) { unsigned int tmpl_size = SKL_TMPL_SIZE; - void *tmpl = skl_call_thunk_template; + u8 insn_buff[MAX_PATCH_LEN]; if (!thunks_initialized) return 0; @@ -312,7 +326,11 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func) if (func && is_callthunk(func)) return 0; - memcpy(*pprog, tmpl, tmpl_size); + memcpy(insn_buff, skl_call_thunk_template, tmpl_size); + apply_relocation(insn_buff, tmpl_size, ip, + skl_call_thunk_template, tmpl_size); + + memcpy(*pprog, insn_buff, tmpl_size); *pprog += tmpl_size; return tmpl_size; } diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 93eabf5440..eb4dbcdf41 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -17,7 +17,8 @@ KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. 
KCSAN_SANITIZE_common.o := n -obj-y := cacheinfo.o scattered.o topology.o +obj-y := cacheinfo.o scattered.o +obj-y += topology_common.o topology_ext.o topology_amd.o obj-y += common.o obj-y += rdrand.o obj-y += match.o @@ -25,14 +26,16 @@ obj-y += bugs.o obj-y += aperfmperf.o obj-y += cpuid-deps.o obj-y += umwait.o +obj-y += capflags.o powerflags.o -obj-$(CONFIG_PROC_FS) += proc.o -obj-y += capflags.o powerflags.o +obj-$(CONFIG_X86_LOCAL_APIC) += topology.o -obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o +obj-$(CONFIG_PROC_FS) += proc.o + +obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o ifdef CONFIG_CPU_SUP_INTEL -obj-y += intel.o intel_pconfig.o tsx.o -obj-$(CONFIG_PM) += intel_epb.o +obj-y += intel.o intel_pconfig.o tsx.o +obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index bfeb18fad6..2c5b51aad9 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -26,8 +26,8 @@ static u32 __init acrn_detect(void) static void __init acrn_init_platform(void) { - /* Setup the IDT for ACRN hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); + /* Install system interrupt handler for ACRN hypervisor callback */ + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback); x86_platform.calibrate_tsc = acrn_get_tsc_khz; x86_platform.calibrate_cpu = acrn_get_tsc_khz; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 0838ea579e..307302af0a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -20,6 +20,7 @@ #include <asm/delay.h> #include <asm/debugreg.h> #include <asm/resctrl.h> +#include <asm/sev.h> #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> @@ -27,13 +28,6 @@ #include "cpu.h" -/* - * nodes_per_socket: Stores the number of nodes per socket. - * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX - * Node Identifiers[10:8] - */ -static u32 nodes_per_socket = 1; - static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; @@ -300,97 +294,6 @@ static int nearby_node(int apicid) } #endif -/* - * Fix up topo::core_id for pre-F17h systems to be in the - * [0 .. cores_per_node - 1] range. Not really needed but - * kept so as not to break existing setups. - */ -static void legacy_fixup_core_id(struct cpuinfo_x86 *c) -{ - u32 cus_per_node; - - if (c->x86 >= 0x17) - return; - - cus_per_node = c->x86_max_cores / nodes_per_socket; - c->topo.core_id %= cus_per_node; -} - -/* - * Fixup core topology information for - * (1) AMD multi-node processors - * Assumption: Number of cores in each internal node is the same. - * (2) AMD processors supporting compute units - */ -static void amd_get_topology(struct cpuinfo_x86 *c) -{ - /* get information required for multi-node processors */ - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - int err; - u32 eax, ebx, ecx, edx; - - cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - - c->topo.die_id = ecx & 0xff; - - if (c->x86 == 0x15) - c->topo.cu_id = ebx & 0xff; - - if (c->x86 >= 0x17) { - c->topo.core_id = ebx & 0xff; - - if (smp_num_siblings > 1) - c->x86_max_cores /= smp_num_siblings; - } - - /* - * In case leaf B is available, use it to derive - * topology information. 
- */ - err = detect_extended_topology(c); - if (!err) - c->x86_coreid_bits = get_count_order(c->x86_max_cores); - - cacheinfo_amd_init_llc_id(c); - - } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - c->topo.die_id = value & 7; - c->topo.llc_id = c->topo.die_id; - } else - return; - - if (nodes_per_socket > 1) { - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - legacy_fixup_core_id(c); - } -} - -/* - * On a AMD dual core setup the lower bits of the APIC id distinguish the cores. - * Assumes number of cores is a power of two. - */ -static void amd_detect_cmp(struct cpuinfo_x86 *c) -{ - unsigned bits; - - bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ - c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); - /* Convert the initial APIC ID into the socket ID */ - c->topo.pkg_id = c->topo.initial_apicid >> bits; - /* use socket ID also for last level cache */ - c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; -} - -u32 amd_get_nodes_per_socket(void) -{ - return nodes_per_socket; -} -EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket); - static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA @@ -442,29 +345,25 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -static void early_init_amd_mc(struct cpuinfo_x86 *c) +static void bsp_determine_snp(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP - unsigned bits, ecx; - - /* Multi core CPU? */ - if (c->extended_cpuid_level < 0x80000008) - return; +#ifdef CONFIG_ARCH_HAS_CC_PLATFORM + cc_vendor = CC_VENDOR_AMD; - ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; - - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; + if (cpu_has(c, X86_FEATURE_SEV_SNP)) { + /* + * RMP table entry format is not architectural and is defined by the + * per-processor PPR. Restrict SNP support on the known CPU models + * for which the RMP table entry format is currently defined for. + */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && + c->x86 >= 0x19 && snp_probe_rmptable_info()) { + cc_platform_set(CC_ATTR_HOST_SEV_SNP); + } else { + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); + } } - - c->x86_coreid_bits = bits; #endif } @@ -500,18 +399,6 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_MWAITX)) use_mwaitx_delay(); - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - u32 ecx; - - ecx = cpuid_ecx(0x8000001e); - __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; - } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; - } - if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && !boot_cpu_has(X86_FEATURE_VIRT_SSBD) && c->x86 >= 0x15 && c->x86 <= 0x17) { @@ -586,6 +473,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) break; } + bsp_determine_snp(c); return; warn: @@ -604,8 +492,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) * SME feature (set in scattered.c). * If the kernel has not enabled SME via any means then * don't advertise the SME feature. - * For SEV: If BIOS has not enabled SEV then don't advertise the - * SEV and SEV_ES feature (set in scattered.c). + * For SEV: If BIOS has not enabled SEV then don't advertise SEV and + * any additional functionality based on it. 
* * In all cases, since support for SME and SEV requires long mode, * don't advertise the feature under CONFIG_X86_32. @@ -640,16 +528,14 @@ clear_all: clear_sev: setup_clear_cpu_cap(X86_FEATURE_SEV); setup_clear_cpu_cap(X86_FEATURE_SEV_ES); + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); } } static void early_init_amd(struct cpuinfo_x86 *c) { - u64 value; u32 dummy; - early_init_amd_mc(c); - if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); @@ -715,23 +601,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_detect_mem_encrypt(c); - /* Re-enable TopologyExtensions if switched off by BIOS */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x6f) && - !cpu_has(c, X86_FEATURE_TOPOEXT)) { - - if (msr_set_bit(0xc0011005, 54) > 0) { - rdmsrl(0xc0011005, value); - if (value & BIT_64(54)) { - set_cpu_cap(c, X86_FEATURE_TOPOEXT); - pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); - } - } - } - - if (cpu_has(c, X86_FEATURE_TOPOEXT)) - smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); @@ -940,7 +809,7 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c) void init_spectral_chicken(struct cpuinfo_x86 *c) { -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY u64 value; /* @@ -968,7 +837,6 @@ static void init_amd_zen_common(void) static void init_amd_zen1(struct cpuinfo_x86 *c) { - init_amd_zen_common(); fix_erratum_1386(c); /* Fix up CPUID bits, but only if not virtualised. */ @@ -1022,7 +890,6 @@ static void zen2_zenbleed_check(struct cpuinfo_x86 *c) static void init_amd_zen2(struct cpuinfo_x86 *c) { - init_amd_zen_common(); init_spectral_chicken(c); fix_erratum_1386(c); zen2_zenbleed_check(c); @@ -1030,8 +897,6 @@ static void init_amd_zen2(struct cpuinfo_x86 *c) static void init_amd_zen3(struct cpuinfo_x86 *c) { - init_amd_zen_common(); - if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { /* * Zen3 (Fam19 model < 0x10) parts are not susceptible to @@ -1045,15 +910,12 @@ static void init_amd_zen3(struct cpuinfo_x86 *c) static void init_amd_zen4(struct cpuinfo_x86 *c) { - init_amd_zen_common(); - if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); } static void init_amd_zen5(struct cpuinfo_x86 *c) { - init_amd_zen_common(); } static void init_amd(struct cpuinfo_x86 *c) @@ -1075,9 +937,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_FSRM)) set_cpu_cap(c, X86_FEATURE_FSRS); - /* get apicid instead of initial apic id from cpuid */ - c->topo.apicid = read_apic_id(); - /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) clear_cpu_cap(c, X86_FEATURE_MCE); @@ -1093,6 +952,13 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x16: init_amd_jg(c); break; } + /* + * Save up on some future enablement work and do common Zen + * settings. 
+ */ + if (c->x86 >= 0x17) + init_amd_zen_common(); + if (boot_cpu_has(X86_FEATURE_ZEN1)) init_amd_zen1(c); else if (boot_cpu_has(X86_FEATURE_ZEN2)) @@ -1113,8 +979,6 @@ static void init_amd(struct cpuinfo_x86 *c) cpu_detect_cache_sizes(c); - amd_detect_cmp(c); - amd_get_topology(c); srat_detect_node(c); init_amd_cacheinfo(c); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e7ceee008b..ab18185894 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -56,7 +56,7 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); /* The current value of the SPEC_CTRL MSR with task-specific bits set */ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); -EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); +EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; EXPORT_SYMBOL_GPL(x86_pred_cmd); @@ -731,7 +731,7 @@ enum gds_mitigations { GDS_MITIGATION_HYPERVISOR, }; -#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION) +#if IS_ENABLED(CONFIG_MITIGATION_GDS_FORCE) static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; #else static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; @@ -1042,10 +1042,10 @@ static void __init retbleed_select_mitigation(void) return; case RETBLEED_CMD_UNRET: - if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) { + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { retbleed_mitigation = RETBLEED_MITIGATION_UNRET; } else { - pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n"); goto do_cmd_auto; } break; @@ -1054,24 +1054,24 @@ static void __init retbleed_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_IBPB)) { pr_err("WARNING: CPU does not support IBPB.\n"); goto do_cmd_auto; - } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } else { - pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); goto do_cmd_auto; } break; case RETBLEED_CMD_STUFF: - if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) && + if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } else { - if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING)) + if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); else - pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); goto do_cmd_auto; } @@ -1081,9 +1081,10 @@ do_cmd_auto: case RETBLEED_CMD_AUTO: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB)) + else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && + boot_cpu_has(X86_FEATURE_IBPB)) retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } @@ -1162,7 +1163,7 @@ static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = SPECTRE_V2_USER_NONE; -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE static bool spectre_v2_bad_module; bool retpoline_module_ok(bool 
has_retpoline) @@ -1475,7 +1476,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !IS_ENABLED(CONFIG_RETPOLINE)) { + !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; @@ -1498,7 +1499,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) { + if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; @@ -1529,7 +1530,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) { - if (!IS_ENABLED(CONFIG_RETPOLINE)) { + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { pr_err("Kernel not compiled with retpoline; no mitigation available!"); return SPECTRE_V2_NONE; } @@ -1697,7 +1698,7 @@ static void __init spectre_v2_select_mitigation(void) break; } - if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && + if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && boot_cpu_has_bug(X86_BUG_RETBLEED) && retbleed_cmd != RETBLEED_CMD_OFF && retbleed_cmd != RETBLEED_CMD_STUFF && @@ -2591,7 +2592,7 @@ static void __init srso_select_mitigation(void) break; case SRSO_CMD_SAFE_RET: - if (IS_ENABLED(CONFIG_CPU_SRSO)) { + if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { /* * Enable the return thunk for generated code * like ftrace, static_call, etc. @@ -2611,29 +2612,29 @@ static void __init srso_select_mitigation(void) else srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; } else { - pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); } break; case SRSO_CMD_IBPB: - if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); srso_mitigation = SRSO_MITIGATION_IBPB; } } else { - pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); } break; case SRSO_CMD_IBPB_ON_VMEXIT: - if (IS_ENABLED(CONFIG_CPU_SRSO)) { + if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; } } else { - pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); } break; } @@ -3016,3 +3017,8 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); } #endif + +void __warn_thunk(void) +{ + WARN_ONCE(1, "Unpatched return thunk in use. 
This should not happen!\n"); +} diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index c131c412db..392d09c936 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -301,7 +301,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; + eax->split.num_cores_on_die = topology_num_cores_per_package(); if (assoc == 0xffff) @@ -595,7 +595,7 @@ static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) if (index < 3) return; - node = topology_die_id(smp_processor_id()); + node = topology_amd_node_id(smp_processor_id()); this_leaf->nb = node_to_amd_nb(node); if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) amd_calc_l3_indices(this_leaf->nb); @@ -661,7 +661,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c) +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -672,7 +672,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c) if (c->x86 < 0x17) { /* LLC is at the node level. */ - c->topo.llc_id = c->topo.die_id; + c->topo.llc_id = die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* * LLC is at the core complex level. @@ -1118,15 +1118,16 @@ static void cache_cpu_init(void) unsigned long flags; local_irq_save(flags); - cache_disable(); - if (memory_caching_control & CACHE_MTRR) + if (memory_caching_control & CACHE_MTRR) { + cache_disable(); mtrr_generic_set_state(); + cache_enable(); + } if (memory_caching_control & CACHE_PAT) pat_cpu_init(); - cache_enable(); local_irq_restore(flags); } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 345f7d905d..a3b55db35c 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -128,10 +128,6 @@ static void init_centaur(struct cpuinfo_x86 *c) #endif early_init_centaur(c); init_intel_cacheinfo(c); - detect_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif if (c->cpuid_level > 9) { unsigned int eax = cpuid_eax(10); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 46603c6e40..1982007828 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -61,6 +61,7 @@ #include <asm/microcode.h> #include <asm/intel-family.h> #include <asm/cpu_device_id.h> +#include <asm/fred.h> #include <asm/uv/uv.h> #include <asm/ia32.h> #include <asm/set_memory.h> @@ -70,11 +71,26 @@ #include "cpu.h" +DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); +EXPORT_PER_CPU_SYMBOL(cpu_info); + u32 elf_hwcap2 __read_mostly; /* Number of siblings per CPU package */ -int smp_num_siblings = 1; -EXPORT_SYMBOL(smp_num_siblings); +unsigned int __max_threads_per_core __ro_after_init = 1; +EXPORT_SYMBOL(__max_threads_per_core); + +unsigned int __max_dies_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__max_dies_per_package); + +unsigned int __max_logical_packages __ro_after_init = 1; +EXPORT_SYMBOL(__max_logical_packages); + +unsigned int __num_cores_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__num_cores_per_package); + +unsigned int __num_threads_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__num_threads_per_package); static struct ppin_info { int feature; @@ -382,9 +398,8 @@ out: } /* These bits should not change their value after CPU 
init is finished. */ -static const unsigned long cr4_pinned_mask = - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | - X86_CR4_FSGSBASE | X86_CR4_CET; +static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | + X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED; static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); static unsigned long cr4_pinned_bits __ro_after_init; @@ -790,19 +805,6 @@ static void get_model_name(struct cpuinfo_x86 *c) *(s + 1) = '\0'; } -void detect_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax, ebx, ecx, edx; - - c->x86_max_cores = 1; - if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4) - return; - - cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); - if (eax & 0x1f) - c->x86_max_cores = (eax >> 26) + 1; -} - void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -864,51 +866,6 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); } -int detect_ht_early(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - - if (!cpu_has(c, X86_FEATURE_HT)) - return -1; - - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) - return -1; - - if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) - return -1; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - smp_num_siblings = (ebx & 0xff0000) >> 16; - if (smp_num_siblings == 1) - pr_info_once("CPU0: Hyper-Threading is disabled\n"); -#endif - return 0; -} - -void detect_ht(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - int index_msb, core_bits; - - if (detect_ht_early(c) < 0) - return; - - index_msb = get_count_order(smp_num_siblings); - c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb); - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings); - - core_bits = get_count_order(c->x86_max_cores); - - c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb) & - ((1 << core_bits) - 1); -#endif -} - static void get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; @@ -1096,18 +1053,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) void get_cpu_address_sizes(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; - bool vp_bits_from_cpuid = true; if (!cpu_has(c, X86_FEATURE_CPUID) || - (c->extended_cpuid_level < 0x80000008)) - vp_bits_from_cpuid = false; - - if (vp_bits_from_cpuid) { - cpuid(0x80000008, &eax, &ebx, &ecx, &edx); - - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } else { + (c->extended_cpuid_level < 0x80000008)) { if (IS_ENABLED(CONFIG_X86_64)) { c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1121,7 +1069,17 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c) cpu_has(c, X86_FEATURE_PSE36)) c->x86_phys_bits = 36; } + } else { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + + /* Provide a sane default if not enumerated: */ + if (!c->x86_clflush_size) + c->x86_clflush_size = 32; } + c->x86_cache_bits = c->x86_phys_bits; c->x86_cache_alignment = c->x86_clflush_size; } @@ -1385,8 +1343,13 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) /* * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature * flag and protect from vendor-specific bugs via the whitelist. + * + * Don't use AutoIBRS when SNP is enabled because it degrades host + * userspace indirect branch performance. 
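(An aside on the get_cpu_address_sizes() hunk above: the address widths the kernel parses come from CPUID leaf 0x80000008, EAX[7:0] for physical bits and EAX[15:8] for virtual bits. A minimal user-space sketch, not part of this patch, that reads the same fields through the compiler's <cpuid.h> helper:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 0x80000008: EAX[7:0] = physical bits, EAX[15:8] = virtual bits */
	if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		return 1;	/* leaf absent; the kernel falls back to its defaults */

	printf("phys bits: %u, virt bits: %u\n", eax & 0xff, (eax >> 8) & 0xff);
	return 0;
}

When the leaf is absent, the kernel keeps the conservative defaults set earlier in the same function.)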
*/ - if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) { + if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) || + (cpu_has(c, X86_FEATURE_AUTOIBRS) && + !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO)) @@ -1627,11 +1590,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (have_cpuid_p()) { cpu_detect(c); get_cpu_vendor(c); + intel_unlock_cpuid_leafs(c); get_cpu_cap(c); setup_force_cpu_cap(X86_FEATURE_CPUID); get_cpu_address_sizes(c); cpu_parse_early_param(); + cpu_init_topology(c); + if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -1643,6 +1609,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) } else { setup_clear_cpu_cap(X86_FEATURE_CPUID); get_cpu_address_sizes(c); + cpu_init_topology(c); } setup_force_cpu_cap(X86_FEATURE_ALWAYS); @@ -1783,23 +1750,11 @@ static void generic_identify(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); - + intel_unlock_cpuid_leafs(c); get_cpu_cap(c); get_cpu_address_sizes(c); - if (c->cpuid_level >= 0x00000001) { - c->topo.initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; -#ifdef CONFIG_X86_32 -# ifdef CONFIG_SMP - c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); -# else - c->topo.apicid = c->topo.initial_apicid; -# endif -#endif - c->topo.pkg_id = c->topo.initial_apicid; - } - get_model_name(c); /* Default name */ /* @@ -1821,29 +1776,6 @@ static void generic_identify(struct cpuinfo_x86 *c) } /* - * Validate that ACPI/mptables have the same information about the - * effective APIC id and update the package map. - */ -static void validate_apic_and_package_id(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned int cpu = smp_processor_id(); - u32 apicid; - - apicid = apic->cpu_present_to_apicid(cpu); - - if (apicid != c->topo.apicid) { - pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n", - cpu, apicid, c->topo.initial_apicid); - } - BUG_ON(topology_update_package_map(c->topo.pkg_id, cpu)); - BUG_ON(topology_update_die_map(c->topo.die_id, cpu)); -#else - c->topo.logical_pkg_id = 0; -#endif -} - -/* * This does the hard work of actually picking apart the CPU stuff... */ static void identify_cpu(struct cpuinfo_x86 *c) @@ -1856,11 +1788,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_model = c->x86_stepping = 0; /* So far unknown... */ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_max_cores = 1; - c->x86_coreid_bits = 0; - c->topo.cu_id = 0xff; - c->topo.llc_id = BAD_APICID; - c->topo.l2c_id = BAD_APICID; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1879,17 +1806,14 @@ static void identify_cpu(struct cpuinfo_x86 *c) generic_identify(c); + cpu_parse_topology(c); + if (this_cpu->c_identify) this_cpu->c_identify(c); /* Clear/Set all flags overridden by options, after probe */ apply_forced_caps(c); -#ifdef CONFIG_X86_64 - c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); -#endif - - /* * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and * Hygon will clear it in ->c_init() below. @@ -1943,10 +1867,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86, c->x86_model); } -#ifdef CONFIG_X86_64 - detect_ht(c); -#endif - x86_init_rdrand(c); setup_pku(c); setup_cet(c); @@ -1978,8 +1898,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Init Machine Check Exception if available. 
*/ mcheck_cpu_init(c); - select_idle_routine(c); - #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif @@ -2038,7 +1956,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_32 enable_sep_cpu(); #endif - validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); update_srbds_msr(); if (boot_cpu_has_bug(X86_BUG_GDS)) @@ -2090,6 +2007,7 @@ DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { .top_of_stack = TOP_OF_INIT_STACK, }; EXPORT_PER_CPU_SYMBOL(pcpu_hot); +EXPORT_PER_CPU_SYMBOL(const_pcpu_hot); #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, @@ -2107,10 +2025,8 @@ static void wrmsrl_cstar(unsigned long val) wrmsrl(MSR_CSTAR, val); } -/* May not be marked __init: used by software suspend */ -void syscall_init(void) +static inline void idt_syscall_init(void) { - wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); if (ia32_enabled()) { @@ -2144,6 +2060,23 @@ void syscall_init(void) X86_EFLAGS_AC|X86_EFLAGS_ID); } +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* The default user and kernel segments */ + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); + + /* + * Except the IA32_STAR MSR, there is NO need to setup SYSCALL and + * SYSENTER MSRs for FRED, because FRED uses the ring 3 FRED + * entrypoint for SYSCALL and SYSENTER, and ERETU is the only legit + * instruction to return to ring 3 (both sysexit and sysret cause + * #UD when FRED is enabled). + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + idt_syscall_init(); +} + #else /* CONFIG_X86_64 */ #ifdef CONFIG_STACKPROTECTOR @@ -2247,8 +2180,9 @@ void cpu_init_exception_handling(void) /* paranoid_entry() gets the CPU number from the GDT */ setup_getcpu(cpu); - /* IST vectors need TSS to be set up. */ - tss_setup_ist(tss); + /* For IDT mode, IST vectors need to be set in TSS. */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + tss_setup_ist(tss); tss_setup_io_bitmap(tss); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); @@ -2257,8 +2191,10 @@ void cpu_init_exception_handling(void) /* GHCB needs to be setup to handle #VC. */ setup_ghcb(); - /* Finally load the IDT */ - load_current_idt(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + cpu_init_fred_exceptions(); + else + load_current_idt(); } /* @@ -2381,13 +2317,17 @@ void arch_smt_update(void) void __init arch_cpu_finalize_init(void) { + struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info); + identify_boot_cpu(); + select_idle_routine(); + /* * identify_boot_cpu() initialized SMT support information, let the * core code know. */ - cpu_smt_set_num_threads(smp_num_siblings, smp_num_siblings); + cpu_smt_set_num_threads(__max_threads_per_core, __max_threads_per_core); if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); @@ -2417,6 +2357,13 @@ void __init arch_cpu_finalize_init(void) fpu__init_system(); fpu__init_cpu(); + /* + * Ensure that access to the per CPU representation has the initial + * boot CPU configuration. 
+ */ + *c = boot_cpu_data; + c->initialized = true; + alternative_instructions(); if (IS_ENABLED(CONFIG_X86_64)) { diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 885281ae79..1beccefbaf 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -2,6 +2,11 @@ #ifndef ARCH_X86_CPU_H #define ARCH_X86_CPU_H +#include <asm/cpu.h> +#include <asm/topology.h> + +#include "topology.h" + /* attempt to consolidate cpu attributes */ struct cpu_dev { const char *c_vendor; @@ -56,9 +61,11 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; extern void __init tsx_init(void); void tsx_ap_init(void); +void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c); #else static inline void tsx_init(void) { } static inline void tsx_ap_init(void) { } +static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { } #endif /* CONFIG_CPU_SUP_INTEL */ extern void init_spectral_chicken(struct cpuinfo_x86 *c); @@ -71,14 +78,9 @@ extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c); -extern void detect_num_cpu_cores(struct cpuinfo_x86 *c); -extern int detect_extended_topology_early(struct cpuinfo_x86 *c); -extern int detect_extended_topology(struct cpuinfo_x86 *c); -extern int detect_ht_early(struct cpuinfo_x86 *c); -extern void detect_ht(struct cpuinfo_x86 *c); extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c); +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); @@ -96,4 +98,5 @@ static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) mode == SPECTRE_V2_EIBRS_RETPOLINE || mode == SPECTRE_V2_EIBRS_LFENCE; } + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 6fb6d8a57c..946813d816 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -82,6 +82,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, + { X86_FEATURE_FRED, X86_FEATURE_LKGS }, + { X86_FEATURE_FRED, X86_FEATURE_WRMSRNS }, {} }; diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index 0c179d684b..3baf3e4358 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -5,6 +5,8 @@ #include <asm/apic.h> #include <asm/processor.h> +#include "cpu.h" + static int cpu_debug_show(struct seq_file *m, void *p) { unsigned long cpu = (unsigned long)m->private; @@ -24,9 +26,12 @@ static int cpu_debug_show(struct seq_file *m, void *p) seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); seq_printf(m, "llc_id: %u\n", c->topo.llc_id); seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); - seq_printf(m, "max_cores: %u\n", c->x86_max_cores); - seq_printf(m, "max_die_per_pkg: %u\n", __max_die_per_package); - seq_printf(m, "smp_num_siblings: %u\n", smp_num_siblings); + seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); + seq_printf(m, "amd_nodes_per_pkg: %u\n", topology_amd_nodes_per_pkg()); + seq_printf(m, "num_threads: %u\n", __num_threads_per_package); + seq_printf(m, "num_cores: %u\n", __num_cores_per_package); + seq_printf(m, "max_dies_per_pkg: %u\n", __max_dies_per_package); + seq_printf(m, "max_threads_per_core:%u\n", 
__max_threads_per_core); return 0; } @@ -42,12 +47,48 @@ static const struct file_operations dfs_cpu_ops = { .release = single_release, }; +static int dom_debug_show(struct seq_file *m, void *p) +{ + static const char *domain_names[TOPO_MAX_DOMAIN] = { + [TOPO_SMT_DOMAIN] = "Thread", + [TOPO_CORE_DOMAIN] = "Core", + [TOPO_MODULE_DOMAIN] = "Module", + [TOPO_TILE_DOMAIN] = "Tile", + [TOPO_DIE_DOMAIN] = "Die", + [TOPO_DIEGRP_DOMAIN] = "DieGrp", + [TOPO_PKG_DOMAIN] = "Package", + }; + unsigned int dom, nthreads = 1; + + for (dom = 0; dom < TOPO_MAX_DOMAIN; dom++) { + nthreads *= x86_topo_system.dom_size[dom]; + seq_printf(m, "domain: %-10s shift: %u dom_size: %5u max_threads: %5u\n", + domain_names[dom], x86_topo_system.dom_shifts[dom], + x86_topo_system.dom_size[dom], nthreads); + } + return 0; +} + +static int dom_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, dom_debug_show, inode->i_private); +} + +static const struct file_operations dfs_dom_ops = { + .open = dom_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static __init int cpu_init_debugfs(void) { struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir); unsigned long id; char name[24]; + debugfs_create_file("domains", 0444, base, NULL, &dfs_dom_ops); + dir = debugfs_create_dir("cpus", base); for_each_possible_cpu(id) { sprintf(name, "%lu", id); diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 03851240c3..1640ae7654 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -72,6 +72,8 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD); if (ept & VMX_EPT_1GB_PAGE_BIT) c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB); + if (ept & VMX_EPT_PAGE_WALK_5_BIT) + c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_5LEVEL); /* Synthetic APIC features that are aggregates of multiple features. */ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) && diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index f0cd95502f..c5191b06f9 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -18,14 +18,6 @@ #include "cpu.h" -#define APICID_SOCKET_ID_BIT 6 - -/* - * nodes_per_socket: Stores the number of nodes per socket. - * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8] - */ -static u32 nodes_per_socket = 1; - #ifdef CONFIG_NUMA /* * To workaround broken NUMA config. Read the comment in @@ -49,80 +41,6 @@ static int nearby_node(int apicid) } #endif -static void hygon_get_topology_early(struct cpuinfo_x86 *c) -{ - if (cpu_has(c, X86_FEATURE_TOPOEXT)) - smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; -} - -/* - * Fixup core topology information for - * (1) Hygon multi-node processors - * Assumption: Number of cores in each internal node is the same. - * (2) Hygon processors supporting compute units - */ -static void hygon_get_topology(struct cpuinfo_x86 *c) -{ - /* get information required for multi-node processors */ - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - int err; - u32 eax, ebx, ecx, edx; - - cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - - c->topo.die_id = ecx & 0xff; - - c->topo.core_id = ebx & 0xff; - - if (smp_num_siblings > 1) - c->x86_max_cores /= smp_num_siblings; - - /* - * In case leaf B is available, use it to derive - * topology information. 
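(A quick illustration of the accumulation in the dom_debug_show() hunk of cpu/debugfs.c above: multiplying x86_topo_system.dom_size[] across the domain levels gives the maximum number of threads addressable at each level. A standalone sketch with purely hypothetical sizes, two SMT siblings and eight cores per package:

#include <stdio.h>

int main(void)
{
	/* Hypothetical dom_size[] values, not read from real hardware */
	static const char *name[] = { "Thread", "Core", "Die", "Package" };
	static const unsigned int dom_size[] = { 2, 8, 1, 1 };
	unsigned int dom, nthreads = 1;

	for (dom = 0; dom < 4; dom++) {
		nthreads *= dom_size[dom];	/* cumulative count, as in dom_debug_show() */
		printf("domain: %-8s dom_size: %u max_threads: %u\n",
		       name[dom], dom_size[dom], nthreads);
	}
	return 0;	/* max_threads grows 2, 16, 16, 16 */
}
)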
- */ - err = detect_extended_topology(c); - if (!err) - c->x86_coreid_bits = get_count_order(c->x86_max_cores); - - /* - * Socket ID is ApicId[6] for the processors with model <= 0x3 - * when running on host. - */ - if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3) - c->topo.pkg_id = c->topo.apicid >> APICID_SOCKET_ID_BIT; - - cacheinfo_hygon_init_llc_id(c); - } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - c->topo.die_id = value & 7; - c->topo.llc_id = c->topo.die_id; - } else - return; - - if (nodes_per_socket > 1) - set_cpu_cap(c, X86_FEATURE_AMD_DCM); -} - -/* - * On Hygon setup the lower bits of the APIC id distinguish the cores. - * Assumes number of cores is a power of two. - */ -static void hygon_detect_cmp(struct cpuinfo_x86 *c) -{ - unsigned int bits; - - bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ - c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); - /* Convert the initial APIC ID into the socket ID */ - c->topo.pkg_id = c->topo.initial_apicid >> bits; - /* Use package ID also for last level cache */ - c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; -} - static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA @@ -173,32 +91,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -static void early_init_hygon_mc(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned int bits, ecx; - - /* Multi core CPU? */ - if (c->extended_cpuid_level < 0x80000008) - return; - - ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; - - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - - c->x86_coreid_bits = bits; -#endif -} - static void bsp_init_hygon(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -212,18 +104,6 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_MWAITX)) use_mwaitx_delay(); - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - u32 ecx; - - ecx = cpuid_ecx(0x8000001e); - __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; - } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; - } - if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && !boot_cpu_has(X86_FEATURE_VIRT_SSBD)) { /* @@ -242,8 +122,6 @@ static void early_init_hygon(struct cpuinfo_x86 *c) { u32 dummy; - early_init_hygon_mc(c); - set_cpu_cap(c, X86_FEATURE_K8); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); @@ -284,8 +162,6 @@ static void early_init_hygon(struct cpuinfo_x86 *c) * we can set it unconditionally. 
*/ set_cpu_cap(c, X86_FEATURE_VMMCALL); - - hygon_get_topology_early(c); } static void init_hygon(struct cpuinfo_x86 *c) @@ -302,9 +178,6 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_REP_GOOD); - /* get apicid instead of initial apic id from cpuid */ - c->topo.apicid = read_apic_id(); - /* * XXX someone from Hygon needs to confirm this DTRT * @@ -316,8 +189,6 @@ static void init_hygon(struct cpuinfo_x86 *c) cpu_detect_cache_sizes(c); - hygon_detect_cmp(c); - hygon_get_topology(c); srat_detect_node(c); init_hygon_cacheinfo(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 40dec9b56f..93efd9edc7 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -268,19 +268,26 @@ detect_keyid_bits: c->x86_phys_bits -= keyid_bits; } +void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; + + if (c->x86 < 6 || (c->x86 == 6 && c->x86_model < 0xd)) + return; + + /* + * The BIOS can have limited CPUID to leaf 2, which breaks feature + * enumeration. Unlock it and update the maximum leaf info. + */ + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) + c->cpuid_level = cpuid_eax(0); +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; - /* Unmask CPUID levels if masked: */ - if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - if (msr_clear_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) { - c->cpuid_level = cpuid_eax(0); - get_cpu_cap(c); - } - } - if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); @@ -401,13 +408,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) check_memory_type_self_snoop_errata(c); /* - * Get the number of SMT siblings early from the extended topology - * leaf, if available. Otherwise try the legacy SMT detection. - */ - if (detect_extended_topology_early(c) < 0) - detect_ht_early(c); - - /* * Adjust the number of physical bits early because it affects the * valid bits of the MTRR mask registers. */ @@ -610,24 +610,6 @@ static void init_intel(struct cpuinfo_x86 *c) intel_workarounds(c); - /* - * Detect the extended topology information if available. This - * will reinitialise the initial_apicid which will be used - * in init_intel_cacheinfo() - */ - detect_extended_topology(c); - - if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { - /* - * let's use the legacy cpuid vector 0x1 and 0x4 for topology - * detection. - */ - detect_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif - } - init_intel_cacheinfo(c); if (c->cpuid_level > 9) { diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c index 0771a905b2..5be2b17902 100644 --- a/arch/x86/kernel/cpu/intel_pconfig.c +++ b/arch/x86/kernel/cpu/intel_pconfig.c @@ -7,6 +7,8 @@ * Author: * Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> */ +#include <linux/bug.h> +#include <linux/limits.h> #include <asm/cpufeature.h> #include <asm/intel_pconfig.h> diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index ad6776081e..ae71b8ef90 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -39,9 +39,7 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) const struct x86_cpu_id *m; struct cpuinfo_x86 *c = &boot_cpu_data; - for (m = match; - m->vendor | m->family | m->model | m->steppings | m->feature; - m++) { + for (m = match; m->flags & X86_CPU_ID_FLAG_ENTRY_VALID; m++) { if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) continue; if (m->family != X86_FAMILY_ANY && c->x86 != m->family) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 2b46eb0fdf..9a0133ef7e 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1231,7 +1231,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, return -ENODEV; if (is_shared_bank(bank)) { - nb = node_to_amd_nb(topology_die_id(cpu)); + nb = node_to_amd_nb(topology_amd_node_id(cpu)); /* threshold descriptor already initialized on this node? */ if (nb && nb->bank4) { @@ -1335,7 +1335,7 @@ static void threshold_remove_bank(struct threshold_bank *bank) * The last CPU on this node using the shared bank is going * away, remove that bank now. */ - nb = node_to_amd_nb(topology_die_id(smp_processor_id())); + nb = node_to_amd_nb(topology_amd_node_id(smp_processor_id())); nb->bank4 = NULL; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index c17f9403cf..84d41be6d0 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -46,6 +46,7 @@ #include <linux/hardirq.h> #include <linux/kexec.h> +#include <asm/fred.h> #include <asm/intel-family.h> #include <asm/processor.h> #include <asm/traps.h> @@ -2166,6 +2167,31 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check) exc_machine_check_user(regs); local_db_restore(dr7); } + +#ifdef CONFIG_X86_FRED +/* + * When occurred on different ring level, i.e., from user or kernel + * context, #MCE needs to be handled on different stack: User #MCE + * on current task stack, while kernel #MCE on a dedicated stack. + * + * This is exactly how FRED event delivery invokes an exception + * handler: ring 3 event on level 0 stack, i.e., current task stack; + * ring 0 event on the #MCE dedicated stack specified in the + * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry + * stub doesn't do stack switch. 
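(On the x86_match_cpu() change in cpu/match.c above: an entry now counts as valid only while X86_CPU_ID_FLAG_ENTRY_VALID is set in m->flags, rather than whenever any of the vendor/family/model/steppings/feature fields is non-zero. Callers keep the usual table shape; a hedged sketch of typical use, with an illustrative table only:

#include <asm/cpu_device_id.h>

/*
 * Illustrative table: the X86_MATCH_* initializer macros set
 * X86_CPU_ID_FLAG_ENTRY_VALID, and the empty terminator leaves it clear,
 * which is what now ends the match loop.
 */
static const struct x86_cpu_id demo_cpu_ids[] = {
	X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL),
	X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL),
	{}
};

static bool demo_cpu_supported(void)
{
	return x86_match_cpu(demo_cpu_ids) != NULL;
}
)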
+ */ +DEFINE_FREDENTRY_MCE(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + if (user_mode(regs)) + exc_machine_check_user(regs); + else + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} +#endif #else /* 32bit unified entry point */ DEFINE_IDTENTRY_RAW(exc_machine_check) @@ -2431,7 +2457,7 @@ static void mce_enable_ce(void *all) __mcheck_cpu_init_timer(); } -static struct bus_type mce_subsys = { +static const struct bus_type mce_subsys = { .name = "machinecheck", .dev_name = "machinecheck", }; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 72f0695c3d..94953d7494 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -430,11 +430,9 @@ static void trigger_thr_int(void *info) static u32 get_nbc_for_node(int node_id) { - struct cpuinfo_x86 *c = &boot_cpu_data; u32 cores_per_node; - cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); - + cores_per_node = topology_num_threads_per_package() / topology_amd_nodes_per_pkg(); return cores_per_node * node_id; } @@ -543,8 +541,8 @@ static void do_inject(void) if (boot_cpu_has(X86_FEATURE_AMD_DCM) && b == 4 && boot_cpu_data.x86 < 0x17) { - toggle_nb_mca_mst_cpu(topology_die_id(cpu)); - cpu = get_nbc_for_node(topology_die_id(cpu)); + toggle_nb_mca_mst_cpu(topology_amd_node_id(cpu)); + cpu = get_nbc_for_node(topology_amd_node_id(cpu)); } cpus_read_lock(); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 857e608af6..5f0414452b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -641,7 +641,7 @@ static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c) { u64 llc_size = c->x86_cache_size * 1024ULL; - do_div(llc_size, c->x86_max_cores); + do_div(llc_size, topology_num_cores_per_package()); llc_size_per_core = (unsigned int)llc_size; } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 01fa06dd06..e0fd57a8ba 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -45,70 +45,70 @@ bool hyperv_paravisor_present __ro_after_init; EXPORT_SYMBOL_GPL(hyperv_paravisor_present); #if IS_ENABLED(CONFIG_HYPERV) -static inline unsigned int hv_get_nested_reg(unsigned int reg) +static inline unsigned int hv_get_nested_msr(unsigned int reg) { - if (hv_is_sint_reg(reg)) - return reg - HV_REGISTER_SINT0 + HV_REGISTER_NESTED_SINT0; + if (hv_is_sint_msr(reg)) + return reg - HV_X64_MSR_SINT0 + HV_X64_MSR_NESTED_SINT0; switch (reg) { - case HV_REGISTER_SIMP: - return HV_REGISTER_NESTED_SIMP; - case HV_REGISTER_SIEFP: - return HV_REGISTER_NESTED_SIEFP; - case HV_REGISTER_SVERSION: - return HV_REGISTER_NESTED_SVERSION; - case HV_REGISTER_SCONTROL: - return HV_REGISTER_NESTED_SCONTROL; - case HV_REGISTER_EOM: - return HV_REGISTER_NESTED_EOM; + case HV_X64_MSR_SIMP: + return HV_X64_MSR_NESTED_SIMP; + case HV_X64_MSR_SIEFP: + return HV_X64_MSR_NESTED_SIEFP; + case HV_X64_MSR_SVERSION: + return HV_X64_MSR_NESTED_SVERSION; + case HV_X64_MSR_SCONTROL: + return HV_X64_MSR_NESTED_SCONTROL; + case HV_X64_MSR_EOM: + return HV_X64_MSR_NESTED_EOM; default: return reg; } } -u64 hv_get_non_nested_register(unsigned int reg) +u64 hv_get_non_nested_msr(unsigned int reg) { u64 value; - if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) + if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) hv_ivm_msr_read(reg, &value); else rdmsrl(reg, value); return value; } 
-EXPORT_SYMBOL_GPL(hv_get_non_nested_register); +EXPORT_SYMBOL_GPL(hv_get_non_nested_msr); -void hv_set_non_nested_register(unsigned int reg, u64 value) +void hv_set_non_nested_msr(unsigned int reg, u64 value) { - if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) { + if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) { hv_ivm_msr_write(reg, value); /* Write proxy bit via wrmsl instruction */ - if (hv_is_sint_reg(reg)) + if (hv_is_sint_msr(reg)) wrmsrl(reg, value | 1 << 20); } else { wrmsrl(reg, value); } } -EXPORT_SYMBOL_GPL(hv_set_non_nested_register); +EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); -u64 hv_get_register(unsigned int reg) +u64 hv_get_msr(unsigned int reg) { if (hv_nested) - reg = hv_get_nested_reg(reg); + reg = hv_get_nested_msr(reg); - return hv_get_non_nested_register(reg); + return hv_get_non_nested_msr(reg); } -EXPORT_SYMBOL_GPL(hv_get_register); +EXPORT_SYMBOL_GPL(hv_get_msr); -void hv_set_register(unsigned int reg, u64 value) +void hv_set_msr(unsigned int reg, u64 value) { if (hv_nested) - reg = hv_get_nested_reg(reg); + reg = hv_get_nested_msr(reg); - hv_set_non_nested_register(reg, value); + hv_set_non_nested_msr(reg, value); } -EXPORT_SYMBOL_GPL(hv_set_register); +EXPORT_SYMBOL_GPL(hv_set_msr); static void (*vmbus_handler)(void); static void (*hv_stimer0_handler)(void); @@ -209,7 +209,9 @@ static void hv_machine_shutdown(void) if (kexec_in_progress) hyperv_cleanup(); } +#endif /* CONFIG_KEXEC_CORE */ +#ifdef CONFIG_CRASH_DUMP static void hv_machine_crash_shutdown(struct pt_regs *regs) { if (hv_crash_handler) @@ -221,7 +223,7 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs) /* Disable the hypercall page when there is only 1 active CPU. */ hyperv_cleanup(); } -#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_CRASH_DUMP */ #endif /* CONFIG_HYPERV */ static uint32_t __init ms_hyperv_platform(void) @@ -350,13 +352,24 @@ static void __init reduced_hw_init(void) x86_init.irqs.pre_vector_init = x86_init_noop; } +int hv_get_hypervisor_version(union hv_hypervisor_version_info *info) +{ + unsigned int hv_max_functions; + + hv_max_functions = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS); + if (hv_max_functions < HYPERV_CPUID_VERSION) { + pr_err("%s: Could not detect Hyper-V version\n", __func__); + return -ENODEV; + } + + cpuid(HYPERV_CPUID_VERSION, &info->eax, &info->ebx, &info->ecx, &info->edx); + + return 0; +} + static void __init ms_hyperv_init_platform(void) { int hv_max_functions_eax; - int hv_host_info_eax; - int hv_host_info_ebx; - int hv_host_info_ecx; - int hv_host_info_edx; #ifdef CONFIG_PARAVIRT pv_info.name = "Hyper-V"; @@ -407,21 +420,6 @@ static void __init ms_hyperv_init_platform(void) pr_info("Hyper-V: running on a nested hypervisor\n"); } - /* - * Extract host information. - */ - if (hv_max_functions_eax >= HYPERV_CPUID_VERSION) { - hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION); - hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION); - hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION); - hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION); - - pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n", - hv_host_info_ebx >> 16, hv_host_info_ebx & 0xFFFF, - hv_host_info_eax, hv_host_info_edx & 0xFFFFFF, - hv_host_info_ecx, hv_host_info_edx >> 24); - } - if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; @@ -454,7 +452,7 @@ static void __init ms_hyperv_init_platform(void) /* To be supported: more work is required. 
*/ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE; - /* HV_REGISTER_CRASH_CTL is unsupported. */ + /* HV_MSR_CRASH_CTL is unsupported. */ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; /* Don't trust Hyper-V's TLB-flushing hypercalls. */ @@ -495,10 +493,14 @@ static void __init ms_hyperv_init_platform(void) no_timer_check = 1; #endif -#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE) +#if IS_ENABLED(CONFIG_HYPERV) +#if defined(CONFIG_KEXEC_CORE) machine_ops.shutdown = hv_machine_shutdown; +#endif +#if defined(CONFIG_CRASH_DUMP) machine_ops.crash_shutdown = hv_machine_crash_shutdown; #endif +#endif if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { /* * Writing to synthetic MSR 0x40000118 updates/changes the @@ -539,19 +541,18 @@ static void __init ms_hyperv_init_platform(void) */ x86_platform.apic_post_init = hyperv_init; hyperv_setup_mmu_ops(); - /* Setup the IDT for hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); - /* Setup the IDT for reenlightenment notifications */ + /* Install system interrupt handler for hypervisor callback */ + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); + + /* Install system interrupt handler for reenlightenment notifications */ if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) { - alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, - asm_sysvec_hyperv_reenlightenment); + sysvec_install(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); } - /* Setup the IDT for stimer0 */ + /* Install system interrupt handler for stimer0 */ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) { - alloc_intr_gate(HYPERV_STIMER0_VECTOR, - asm_sysvec_hyperv_stimer0); + sysvec_install(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); } # ifdef CONFIG_SMP @@ -643,6 +644,7 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .init.x2apic_available = ms_hyperv_x2apic_available, .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id, .init.init_platform = ms_hyperv_init_platform, + .init.guest_late_init = ms_hyperv_late_init, #ifdef CONFIG_AMD_MEM_ENCRYPT .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare, .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index d3524778a5..7b29ebda02 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -108,6 +108,9 @@ static inline void k8_check_syscfg_dram_mod_en(void) (boot_cpu_data.x86 >= 0x0f))) return; + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return; + rdmsr(MSR_AMD64_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 26a427fa84..eeac00d209 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -6,6 +6,7 @@ * Authors: Fenghua Yu <fenghua.yu@intel.com>, * H. 
Peter Anvin <hpa@linux.intel.com> */ +#include <linux/printk.h> #include <asm/processor.h> #include <asm/archrandom.h> diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index d04371e851..83e4034158 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -16,6 +16,7 @@ #define pr_fmt(fmt) "resctrl: " fmt +#include <linux/cpu.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/cacheinfo.h> @@ -25,8 +26,15 @@ #include <asm/resctrl.h> #include "internal.h" -/* Mutex to protect rdtgroup access. */ -DEFINE_MUTEX(rdtgroup_mutex); +/* + * rdt_domain structures are kfree()d when their last CPU goes offline, + * and allocated when the first CPU in a new domain comes online. + * The rdt_resource's domain list is updated when this happens. Readers of + * the domain list must either take cpus_read_lock(), or rely on an RCU + * read-side critical section, to avoid observing concurrent modification. + * All writers take this mutex: + */ +static DEFINE_MUTEX(domain_list_lock); /* * The cached resctrl_pqr_state is strictly per CPU and can never be @@ -136,15 +144,15 @@ static inline void cache_alloc_hsw_probe(void) { struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3]; struct rdt_resource *r = &hw_res->r_resctrl; - u32 l, h, max_cbm = BIT_MASK(20) - 1; + u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0; - if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0)) + if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) return; - rdmsr(MSR_IA32_L3_CBM_BASE, l, h); + rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0); /* If all the bits were set in MSR, return success */ - if (l != max_cbm) + if (l3_cbm_0 != max_cbm) return; hw_res->num_closid = 4; @@ -510,6 +518,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) struct rdt_domain *d; int err; + lockdep_assert_held(&domain_list_lock); + d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { pr_warn("Couldn't find cache id for CPU %d\n", cpu); @@ -543,11 +553,12 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; } - list_add_tail(&d->list, add_pos); + list_add_tail_rcu(&d->list, add_pos); err = resctrl_online_domain(r, d); if (err) { - list_del(&d->list); + list_del_rcu(&d->list); + synchronize_rcu(); domain_free(hw_dom); } } @@ -558,6 +569,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) struct rdt_hw_domain *hw_dom; struct rdt_domain *d; + lockdep_assert_held(&domain_list_lock); + d = rdt_find_domain(r, id, NULL); if (IS_ERR_OR_NULL(d)) { pr_warn("Couldn't find cache id for CPU %d\n", cpu); @@ -568,7 +581,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { resctrl_offline_domain(r, d); - list_del(&d->list); + list_del_rcu(&d->list); + synchronize_rcu(); /* * rdt_domain "d" is going to be freed below, so clear @@ -580,73 +594,47 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) return; } - - if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) { - if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { - cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d, 0); - } - if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && - has_busy_rmid(r, d)) { - cancel_delayed_work(&d->cqm_limbo); - cqm_setup_limbo_handler(d, 0); - } - } } static void clear_closid_rmid(int cpu) { struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); - state->default_closid = 0; - state->default_rmid = 0; - state->cur_closid = 0; - 
state->cur_rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, 0); + state->default_closid = RESCTRL_RESERVED_CLOSID; + state->default_rmid = RESCTRL_RESERVED_RMID; + state->cur_closid = RESCTRL_RESERVED_CLOSID; + state->cur_rmid = RESCTRL_RESERVED_RMID; + wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID, + RESCTRL_RESERVED_CLOSID); } -static int resctrl_online_cpu(unsigned int cpu) +static int resctrl_arch_online_cpu(unsigned int cpu) { struct rdt_resource *r; - mutex_lock(&rdtgroup_mutex); + mutex_lock(&domain_list_lock); for_each_capable_rdt_resource(r) domain_add_cpu(cpu, r); - /* The cpu is set in default rdtgroup after online. */ - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + mutex_unlock(&domain_list_lock); + clear_closid_rmid(cpu); - mutex_unlock(&rdtgroup_mutex); + resctrl_online_cpu(cpu); return 0; } -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +static int resctrl_arch_offline_cpu(unsigned int cpu) { - struct rdtgroup *cr; - - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { - break; - } - } -} - -static int resctrl_offline_cpu(unsigned int cpu) -{ - struct rdtgroup *rdtgrp; struct rdt_resource *r; - mutex_lock(&rdtgroup_mutex); + resctrl_offline_cpu(cpu); + + mutex_lock(&domain_list_lock); for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { - clear_childcpus(rdtgrp, cpu); - break; - } - } + mutex_unlock(&domain_list_lock); + clear_closid_rmid(cpu); - mutex_unlock(&rdtgroup_mutex); return 0; } @@ -966,7 +954,8 @@ static int __init resctrl_late_init(void) state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/resctrl/cat:online:", - resctrl_online_cpu, resctrl_offline_cpu); + resctrl_arch_online_cpu, + resctrl_arch_offline_cpu); if (state < 0) return state; @@ -990,8 +979,14 @@ late_initcall(resctrl_late_init); static void __exit resctrl_exit(void) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + cpuhp_remove_state(rdt_online); + rdtgroup_exit(); + + if (r->mon_capable) + rdt_put_mon_l3_config(); } __exitcall(resctrl_exit); diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index beccb0e87b..7997b47743 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -19,6 +19,8 @@ #include <linux/kernfs.h> #include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/tick.h> + #include "internal.h" /* @@ -210,6 +212,9 @@ static int parse_line(char *line, struct resctrl_schema *s, struct rdt_domain *d; unsigned long dom_id; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); @@ -314,6 +319,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) struct rdt_domain *d; u32 idx; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -379,11 +387,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, return -EINVAL; buf[nbytes - 1] = '\0'; - cpus_read_lock(); rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { rdtgroup_kn_unlock(of->kn); - cpus_read_unlock(); return -ENOENT; } rdt_last_cmd_clear(); @@ -445,7 
+451,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, out: rdt_staged_configs_clear(); rdtgroup_kn_unlock(of->kn); - cpus_read_unlock(); return ret ?: nbytes; } @@ -465,6 +470,9 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo bool sep = false; u32 ctrl_val; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(dom, &r->domains, list) { if (sep) @@ -522,12 +530,24 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, return ret; } +static int smp_mon_event_count(void *arg) +{ + mon_event_count(arg); + + return 0; +} + void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first) { + int cpu; + + /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + /* - * setup the parameters to send to the IPI to read the data. + * Setup the parameters to pass to mon_event_count() to read the data. */ rr->rgrp = rdtgrp; rr->evtid = evtid; @@ -535,8 +555,26 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, rr->d = d; rr->val = 0; rr->first = first; + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } + + cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU); + + /* + * cpumask_any_housekeeping() prefers housekeeping CPUs, but + * are all the CPUs nohz_full? If yes, pick a CPU to IPI. + * MPAM's resctrl_arch_rmid_read() is unable to read the + * counters on some platforms if its called in IRQ context. + */ + if (tick_nohz_full_cpu(cpu)) + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); + else + smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 52e7e7deee..1a8687f807 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -7,6 +7,9 @@ #include <linux/kernfs.h> #include <linux/fs_context.h> #include <linux/jump_label.h> +#include <linux/tick.h> + +#include <asm/resctrl.h> #define L3_QOS_CDP_ENABLE 0x01ULL @@ -53,6 +56,47 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) +/** + * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that + * aren't marked nohz_full + * @mask: The mask to pick a CPU from. + * @exclude_cpu:The CPU to avoid picking. + * + * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping + * CPUs that don't use nohz_full, these are preferred. Pass + * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. + * + * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. + */ +static inline unsigned int +cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) +{ + unsigned int cpu, hk_cpu; + + if (exclude_cpu == RESCTRL_PICK_ANY_CPU) + cpu = cpumask_any(mask); + else + cpu = cpumask_any_but(mask, exclude_cpu); + + /* Only continue if tick_nohz_full_mask has been initialized. */ + if (!tick_nohz_full_enabled()) + return cpu; + + /* If the CPU picked isn't marked nohz_full nothing more needs doing. 
*/ + if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) + return cpu; + + /* Try to find a CPU that isn't nohz_full to use in preference */ + hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); + if (hk_cpu == exclude_cpu) + hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); + + if (hk_cpu < nr_cpu_ids) + cpu = hk_cpu; + + return cpu; +} + struct rdt_fs_context { struct kernfs_fs_context kfc; bool enable_cdpl2; @@ -68,9 +112,6 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) return container_of(kfc, struct rdt_fs_context, kfc); } -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); -DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); - /** * struct mon_evt - Entry in the event list of a resource * @evtid: event id @@ -111,12 +152,12 @@ struct rmid_read { bool first; int err; u64 val; + void *arch_mon_ctx; }; -extern bool rdt_alloc_capable; -extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; extern struct list_head resctrl_schema_all; +extern bool resctrl_mounted; enum rdt_group_type { RDTCTRL_GROUP = 0, @@ -424,8 +465,6 @@ extern struct mutex rdtgroup_mutex; extern struct rdt_hw_resource rdt_resources_all[]; extern struct rdtgroup rdtgroup_default; -DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); - extern struct dentry *debugfs_resctrl; enum resctrl_res_level { @@ -541,9 +580,10 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int closids_supported(void); void closid_free(int closid); -int alloc_rmid(void); -void free_rmid(u32 rmid); +int alloc_rmid(u32 closid); +void free_rmid(u32 closid, u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); +void __exit rdt_put_mon_l3_config(void); bool __init rdt_cpu_has(int flag); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); @@ -551,17 +591,21 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first); void mbm_setup_overflow_handler(struct rdt_domain *dom, - unsigned long delay_ms); + unsigned long delay_ms, + int exclude_cpu); void mbm_handle_overflow(struct work_struct *work); void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu); void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +bool has_busy_rmid(struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void __init thread_throttle_mode_init(void); void __init mbm_config_rftype_init(const char *config); void rdt_staged_configs_clear(void); +bool closid_allocated(unsigned int closid); +int resctrl_find_cleanest_closid(void); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 3a6c069614..2ce5f4913c 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -15,6 +15,7 @@ * Software Developer Manual June 2016, volume 3, section 17.17. */ +#include <linux/cpu.h> #include <linux/module.h> #include <linux/sizes.h> #include <linux/slab.h> @@ -24,7 +25,20 @@ #include "internal.h" +/** + * struct rmid_entry - dirty tracking for all RMID. + * @closid: The CLOSID for this entry. 
+ * @rmid: The RMID for this entry. + * @busy: The number of domains with cached data using this RMID. + * @list: Member of the rmid_free_lru list when busy == 0. + * + * Depending on the architecture the correct monitor is accessed using + * both @closid and @rmid, or @rmid only. + * + * Take the rdtgroup_mutex when accessing. + */ struct rmid_entry { + u32 closid; u32 rmid; int busy; struct list_head list; @@ -38,6 +52,13 @@ struct rmid_entry { static LIST_HEAD(rmid_free_lru); /* + * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. + * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. + * Indexed by CLOSID. Protected by rdtgroup_mutex. + */ +static u32 *closid_num_dirty_rmid; + +/* * @rmid_limbo_count - count of currently unused but (potentially) * dirty RMIDs. * This counts RMIDs that no one is currently using but that @@ -136,12 +157,29 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) return val; } -static inline struct rmid_entry *__rmid_entry(u32 rmid) +/* + * x86 and arm64 differ in their handling of monitoring. + * x86's RMID are independent numbers, there is only one source of traffic + * with an RMID value of '1'. + * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of + * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID + * value is no longer unique. + * To account for this, resctrl uses an index. On x86 this is just the RMID, + * on arm64 it encodes the CLOSID and RMID. This gives a unique number. + * + * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code + * must accept an attempt to read every index. + */ +static inline struct rmid_entry *__rmid_entry(u32 idx) { struct rmid_entry *entry; + u32 closid, rmid; + + entry = &rmid_ptrs[idx]; + resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); - entry = &rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); + WARN_ON_ONCE(entry->closid != closid); + WARN_ON_ONCE(entry->rmid != rmid); return entry; } @@ -190,7 +228,8 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid) + u32 unused, u32 rmid, + enum resctrl_event_id eventid) { struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct arch_mbm_state *am; @@ -230,7 +269,8 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) } int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid, u64 *val) + u32 unused, u32 rmid, enum resctrl_event_id eventid, + u64 *val, void *ignored) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); @@ -238,6 +278,8 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, u64 msr_val, chunks; int ret; + resctrl_arch_rmid_read_context_check(); + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) return -EINVAL; @@ -260,6 +302,17 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, return 0; } +static void limbo_release_entry(struct rmid_entry *entry) +{ + lockdep_assert_held(&rdtgroup_mutex); + + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]--; +} + /* * Check the RMIDs that are marked as busy for this domain. 
If the * reported LLC occupancy is below the threshold clear the busy bit and @@ -269,11 +322,20 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, void __check_limbo(struct rdt_domain *d, bool force_free) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry; - u32 crmid = 1, nrmid; + u32 idx, cur_idx = 1; + void *arch_mon_ctx; bool rmid_dirty; u64 val = 0; + arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); + if (IS_ERR(arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(arch_mon_ctx)); + return; + } + /* * Skip RMID 0 and start from RMID 1 and check all the RMIDs that * are marked as busy for occupancy < threshold. If the occupancy @@ -281,53 +343,125 @@ void __check_limbo(struct rdt_domain *d, bool force_free) * RMID and move it to the free list when the counter reaches 0. */ for (;;) { - nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); - if (nrmid >= r->num_rmid) + idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); + if (idx >= idx_limit) break; - entry = __rmid_entry(nrmid); - - if (resctrl_arch_rmid_read(r, d, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val)) { + entry = __rmid_entry(idx); + if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, &val, + arch_mon_ctx)) { rmid_dirty = true; } else { rmid_dirty = (val >= resctrl_rmid_realloc_threshold); } if (force_free || !rmid_dirty) { - clear_bit(entry->rmid, d->rmid_busy_llc); - if (!--entry->busy) { - rmid_limbo_count--; - list_add_tail(&entry->list, &rmid_free_lru); - } + clear_bit(idx, d->rmid_busy_llc); + if (!--entry->busy) + limbo_release_entry(entry); } - crmid = nrmid + 1; + cur_idx = idx + 1; + } + + resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); +} + +bool has_busy_rmid(struct rdt_domain *d) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + + return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; +} + +static struct rmid_entry *resctrl_find_free_rmid(u32 closid) +{ + struct rmid_entry *itr; + u32 itr_idx, cmp_idx; + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); + + list_for_each_entry(itr, &rmid_free_lru, list) { + /* + * Get the index of this free RMID, and the index it would need + * to be if it were used with this CLOSID. + * If the CLOSID is irrelevant on this architecture, the two + * index values are always the same on every entry and thus the + * very first entry will be returned. + */ + itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); + cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); + + if (itr_idx == cmp_idx) + return itr; } + + return ERR_PTR(-ENOSPC); } -bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +/** + * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated + * RMID are clean, or the CLOSID that has + * the most clean RMID. + * + * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID + * may not be able to allocate clean RMID. To avoid this the allocator will + * choose the CLOSID with the most clean RMID. + * + * When the CLOSID and RMID are independent numbers, the first free CLOSID will + * be returned. 
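/*
 * Minimal user-space sketch of the limbo scan above, assuming a single
 * uint64_t word as the busy bitmap and a caller-supplied occupancy
 * callback; the kernel walks d->rmid_busy_llc with find_next_bit() and
 * compares against resctrl_rmid_realloc_threshold.
 */
#include <stdint.h>

static void scan_limbo(uint64_t *busy, unsigned int nbits, uint64_t threshold,
		       uint64_t (*read_occupancy)(unsigned int idx))
{
	for (unsigned int idx = 0; idx < nbits && idx < 64; idx++) {
		if (!(*busy & (1ULL << idx)))
			continue;
		/* Still dirty while the cached occupancy is at or above the threshold. */
		if (read_occupancy(idx) < threshold)
			*busy &= ~(1ULL << idx);	/* dropped below: release the RMID */
	}
}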
+ */ +int resctrl_find_cleanest_closid(void) { - return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; + u32 cleanest_closid = ~0; + int i = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + return -EIO; + + for (i = 0; i < closids_supported(); i++) { + int num_dirty; + + if (closid_allocated(i)) + continue; + + num_dirty = closid_num_dirty_rmid[i]; + if (num_dirty == 0) + return i; + + if (cleanest_closid == ~0) + cleanest_closid = i; + + if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) + cleanest_closid = i; + } + + if (cleanest_closid == ~0) + return -ENOSPC; + + return cleanest_closid; } /* - * As of now the RMIDs allocation is global. - * However we keep track of which packages the RMIDs - * are used to optimize the limbo list management. + * For MPAM the RMID value is not unique, and has to be considered with + * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which + * allows all domains to be managed by a single free list. + * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. */ -int alloc_rmid(void) +int alloc_rmid(u32 closid) { struct rmid_entry *entry; lockdep_assert_held(&rdtgroup_mutex); - if (list_empty(&rmid_free_lru)) - return rmid_limbo_count ? -EBUSY : -ENOSPC; + entry = resctrl_find_free_rmid(closid); + if (IS_ERR(entry)) + return PTR_ERR(entry); - entry = list_first_entry(&rmid_free_lru, - struct rmid_entry, list); list_del(&entry->list); - return entry->rmid; } @@ -335,47 +469,51 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rdt_domain *d; - int cpu, err; - u64 val = 0; + u32 idx; + + lockdep_assert_held(&rdtgroup_mutex); + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); entry->busy = 0; - cpu = get_cpu(); list_for_each_entry(d, &r->domains, list) { - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - err = resctrl_arch_rmid_read(r, d, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, - &val); - if (err || val <= resctrl_rmid_realloc_threshold) - continue; - } - /* * For the first limbo RMID in the domain, * setup up the limbo worker. */ - if (!has_busy_rmid(r, d)) - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); - set_bit(entry->rmid, d->rmid_busy_llc); + if (!has_busy_rmid(d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, + RESCTRL_PICK_ANY_CPU); + set_bit(idx, d->rmid_busy_llc); entry->busy++; } - put_cpu(); - if (entry->busy) - rmid_limbo_count++; - else - list_add_tail(&entry->list, &rmid_free_lru); + rmid_limbo_count++; + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]++; } -void free_rmid(u32 rmid) +void free_rmid(u32 closid, u32 rmid) { + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); struct rmid_entry *entry; - if (!rmid) - return; - lockdep_assert_held(&rdtgroup_mutex); - entry = __rmid_entry(rmid); + /* + * Do not allow the default rmid to be free'd. Comparing by index + * allows architectures that ignore the closid parameter to avoid an + * unnecessary check. 
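/*
 * Minimal sketch of the "cleanest CLOSID" search implemented above,
 * assuming a plain array of per-CLOSID dirty counts and a hypothetical
 * closid_is_allocated() helper; the kernel variant additionally returns
 * -EIO when RMIDs do not depend on the CLOSID and -ENOSPC when nothing
 * is free.
 */
#include <stdbool.h>

static int find_cleanest_closid(const unsigned int *num_dirty, int num_closid,
				bool (*closid_is_allocated)(int closid))
{
	int best = -1;

	for (int i = 0; i < num_closid; i++) {
		if (closid_is_allocated(i))
			continue;
		if (num_dirty[i] == 0)
			return i;			/* entirely clean: use it */
		if (best < 0 || num_dirty[i] < num_dirty[best])
			best = i;			/* remember the least dirty candidate */
	}
	return best;					/* -1 when no CLOSID is free */
}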
+ */ + if (!resctrl_arch_mon_capable() || + idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID)) + return; + + entry = __rmid_entry(idx); if (is_llc_occupancy_enabled()) add_rmid_to_limbo(entry); @@ -383,33 +521,36 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid, - enum resctrl_event_id evtid) +static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid, + u32 rmid, enum resctrl_event_id evtid) { + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + switch (evtid) { case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[rmid]; + return &d->mbm_total[idx]; case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[rmid]; + return &d->mbm_local[idx]; default: return NULL; } } -static int __mon_event_count(u32 rmid, struct rmid_read *rr) +static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { struct mbm_state *m; u64 tval = 0; if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid); - m = get_mbm_state(rr->d, rmid, rr->evtid); + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); return 0; } - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval); + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid, + &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -421,6 +562,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). + * @closid: The closid used to identify the cached mbm_state. * @rmid: The rmid used to identify the cached mbm_state. * @rr: The struct rmid_read populated by __mon_event_count(). * @@ -429,9 +571,10 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) * __mon_event_count() is compared with the chunks value from the previous * invocation. This must be called once per second to maintain values in MBps. */ -static void mbm_bw_count(u32 rmid, struct rmid_read *rr) +static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) { - struct mbm_state *m = &rr->d->mbm_local[rmid]; + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct mbm_state *m = &rr->d->mbm_local[idx]; u64 cur_bw, bytes, cur_bytes; cur_bytes = rr->val; @@ -444,7 +587,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) } /* - * This is called via IPI to read the CQM/MBM counters + * This is scheduled by mon_event_read() to read the CQM/MBM counters * on a domain. 
*/ void mon_event_count(void *info) @@ -456,7 +599,7 @@ void mon_event_count(void *info) rdtgrp = rr->rgrp; - ret = __mon_event_count(rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); /* * For Ctrl groups read data from child monitor groups and @@ -467,7 +610,8 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->mon.rmid, rr) == 0) + if (__mon_event_count(entry->closid, entry->mon.rmid, + rr) == 0) ret = 0; } } @@ -519,9 +663,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) struct mbm_state *pmbm_data, *cmbm_data; struct rdt_resource *r_mba; struct rdt_domain *dom_mba; + u32 cur_bw, user_bw, idx; struct list_head *head; struct rdtgroup *entry; - u32 cur_bw, user_bw; if (!is_mbm_local_enabled()) return; @@ -530,7 +674,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) closid = rgrp->closid; rmid = rgrp->mon.rmid; - pmbm_data = &dom_mbm->mbm_local[rmid]; + idx = resctrl_arch_rmid_idx_encode(closid, rmid); + pmbm_data = &dom_mbm->mbm_local[idx]; dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba); if (!dom_mba) { @@ -578,7 +723,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); } -static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, u32 rmid) { struct rmid_read rr; @@ -593,12 +739,28 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) if (is_mbm_total_enabled()) { rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; rr.val = 0; - __mon_event_count(rmid, &rr); + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + + __mon_event_count(closid, rmid, &rr); + + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } if (is_mbm_local_enabled()) { rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; rr.val = 0; - __mon_event_count(rmid, &rr); + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + + __mon_event_count(closid, rmid, &rr); /* * Call the MBA software controller only for the @@ -606,7 +768,9 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) * the software controller explicitly. 
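/*
 * Minimal sketch of the MBps bookkeeping done by mbm_bw_count(): with the
 * counter sampled once per second, the bandwidth is the byte delta scaled
 * to megabytes. The names below are illustrative, not kernel symbols.
 */
#include <stdint.h>

#define MBYTES (1024u * 1024u)

struct bw_state {
	uint64_t prev_bytes;	/* bytes seen at the previous 1s sample */
	uint64_t cur_mbps;	/* most recent bandwidth estimate */
};

static void update_bw(struct bw_state *s, uint64_t cur_bytes)
{
	s->cur_mbps = (cur_bytes - s->prev_bytes) / MBYTES;
	s->prev_bytes = cur_bytes;
}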
*/ if (is_mba_sc(NULL)) - mbm_bw_count(rmid, &rr); + mbm_bw_count(closid, rmid, &rr); + + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } } @@ -617,106 +781,193 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) void cqm_handle_limbo(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - int cpu = smp_processor_id(); - struct rdt_resource *r; struct rdt_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; d = container_of(work, struct rdt_domain, cqm_limbo.work); __check_limbo(d, false); - if (has_busy_rmid(r, d)) - schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + if (has_busy_rmid(d)) { + d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, + delay); + } mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this + * domain. + * @dom: The domain the limbo handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - cpu = cpumask_any(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); dom->cqm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); } void mbm_handle_overflow(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); struct rdtgroup *prgrp, *crgrp; - int cpu = smp_processor_id(); struct list_head *head; struct rdt_resource *r; struct rdt_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - if (!static_branch_likely(&rdt_mon_enable_key)) + /* + * If the filesystem has been unmounted this work no longer needs to + * run. + */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) goto out_unlock; r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; d = container_of(work, struct rdt_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); } - schedule_delayed_work_on(cpu, &d->mbm_over, delay); + /* + * Re-check for housekeeping CPUs. This allows the overflow handler to + * move off a nohz_full CPU quickly. + */ + d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); out_unlock: mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } -void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this + * domain. + * @dom: The domain the overflow handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. 
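/*
 * Minimal sketch of the CPU selection used by the limbo/overflow handlers:
 * prefer a CPU from the domain mask that is not nohz_full and is not the
 * excluded CPU (e.g. one going offline). Plain uint64_t masks stand in for
 * cpumasks here; the kernel uses cpumask_any_housekeeping().
 */
#include <stdint.h>

#define NO_CPU 64	/* sentinel: nothing suitable found (cf. nr_cpu_ids) */

static unsigned int pick_housekeeping_cpu(uint64_t domain_mask,
					  uint64_t nohz_full_mask,
					  unsigned int exclude_cpu)
{
	uint64_t preferred = domain_mask & ~nohz_full_mask;

	for (unsigned int cpu = 0; cpu < 64; cpu++) {
		if (cpu == exclude_cpu)
			continue;
		if (preferred & (1ULL << cpu))
			return cpu;		/* housekeeping CPU in the domain */
	}
	for (unsigned int cpu = 0; cpu < 64; cpu++) {
		if (cpu == exclude_cpu)
			continue;
		if (domain_mask & (1ULL << cpu))
			return cpu;		/* fall back to any CPU in the domain */
	}
	return NO_CPU;
}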
+ */ +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - if (!static_branch_likely(&rdt_mon_enable_key)) + /* + * When a domain comes online there is no guarantee the filesystem is + * mounted. If not, there is no need to catch counter overflow. + */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) return; - cpu = cpumask_any(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); dom->mbm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); + + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); } static int dom_data_init(struct rdt_resource *r) { + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + u32 num_closid = resctrl_arch_get_num_closid(r); struct rmid_entry *entry = NULL; - int i, nr_rmids; + int err = 0, i; + u32 idx; + + mutex_lock(&rdtgroup_mutex); + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + u32 *tmp; + + /* + * If the architecture hasn't provided a sanitised value here, + * this may result in larger arrays than necessary. Resctrl will + * use a smaller system wide value based on the resources in + * use. + */ + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto out_unlock; + } - nr_rmids = r->num_rmid; - rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) - return -ENOMEM; + closid_num_dirty_rmid = tmp; + } - for (i = 0; i < nr_rmids; i++) { + rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) { + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + err = -ENOMEM; + goto out_unlock; + } + + for (i = 0; i < idx_limit; i++) { entry = &rmid_ptrs[i]; INIT_LIST_HEAD(&entry->list); - entry->rmid = i; + resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); list_add_tail(&entry->list, &rmid_free_lru); } /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. + * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and + * are always allocated. These are used for the rdtgroup_default + * control group, which will be setup later in rdtgroup_init(). 
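/*
 * Minimal sketch of the free-pool initialisation above, assuming a simple
 * "allocated" flag array indexed by the (CLOSID, RMID) index instead of
 * the kernel's rmid_free_lru list; the reserved index backing the default
 * group is taken out of the pool up front.
 */
#include <stdbool.h>
#include <stdlib.h>

static bool *init_rmid_pool(unsigned int idx_limit, unsigned int reserved_idx)
{
	bool *allocated = calloc(idx_limit, sizeof(*allocated));

	if (!allocated)
		return NULL;
	if (reserved_idx < idx_limit)
		allocated[reserved_idx] = true;	/* never handed out to a new group */
	return allocated;
}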
*/ - entry = __rmid_entry(0); + idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + entry = __rmid_entry(idx); list_del(&entry->list); - return 0; +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +static void __exit dom_data_exit(void) +{ + mutex_lock(&rdtgroup_mutex); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + + kfree(rmid_ptrs); + rmid_ptrs = NULL; + + mutex_unlock(&rdtgroup_mutex); } static struct mon_evt llc_occupancy_event = { @@ -814,6 +1065,11 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } +void __exit rdt_put_mon_l3_config(void) +{ + dom_data_exit(); +} + void __init intel_rdt_mbm_apply_quirk(void) { int cf_index; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 8f559eeae0..884b88e251 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -581,7 +581,7 @@ static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) if (ret) goto err_cpus; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); if (ret) goto err_cpus_list; @@ -628,7 +628,7 @@ static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) if (ret) goto err_cpus; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); if (ret) goto err_cpus_list; @@ -752,7 +752,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) * anymore when this group would be used for pseudo-locking. This * is safe to call on platforms not capable of monitoring. */ - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); ret = 0; goto out; @@ -776,8 +776,8 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) { int ret; - if (rdt_mon_capable) { - ret = alloc_rmid(); + if (resctrl_arch_mon_capable()) { + ret = alloc_rmid(rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Out of RMIDs\n"); return ret; @@ -787,7 +787,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) ret = rdtgroup_locksetup_user_restore(rdtgrp); if (ret) { - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); return ret; } @@ -844,6 +844,9 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) struct rdt_domain *d_i; bool ret = false; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) return true; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 2b69e560b0..011e17efb1 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -35,6 +35,10 @@ DEFINE_STATIC_KEY_FALSE(rdt_enable_key); DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); @@ -42,6 +46,9 @@ LIST_HEAD(rdt_all_groups); /* list of entries for the schemata file */ LIST_HEAD(resctrl_schema_all); +/* The filesystem can only be mounted once. 
*/ +bool resctrl_mounted; + /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; @@ -102,7 +109,7 @@ void rdt_staged_configs_clear(void) * * Using a global CLOSID across all resources has some advantages and * some drawbacks: - * + We can simply set "current->closid" to assign a task to a resource + * + We can simply set current's closid to assign a task to a resource * group. * + Context switch code can avoid extra memory references deciding which * CLOSID to load into the PQR_ASSOC MSR @@ -111,7 +118,7 @@ void rdt_staged_configs_clear(void) * - Our choices on how to configure each resource become progressively more * limited as the number of resources grows. */ -static int closid_free_map; +static unsigned long closid_free_map; static int closid_free_map_len; int closids_supported(void) @@ -130,26 +137,39 @@ static void closid_init(void) closid_free_map = BIT_MASK(rdt_min_closid) - 1; - /* CLOSID 0 is always reserved for the default group */ - closid_free_map &= ~1; + /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ + __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map); closid_free_map_len = rdt_min_closid; } static int closid_alloc(void) { - u32 closid = ffs(closid_free_map); + int cleanest_closid; + u32 closid; - if (closid == 0) - return -ENOSPC; - closid--; - closid_free_map &= ~(1 << closid); + lockdep_assert_held(&rdtgroup_mutex); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + cleanest_closid = resctrl_find_cleanest_closid(); + if (cleanest_closid < 0) + return cleanest_closid; + closid = cleanest_closid; + } else { + closid = ffs(closid_free_map); + if (closid == 0) + return -ENOSPC; + closid--; + } + __clear_bit(closid, &closid_free_map); return closid; } void closid_free(int closid) { - closid_free_map |= 1 << closid; + lockdep_assert_held(&rdtgroup_mutex); + + __set_bit(closid, &closid_free_map); } /** @@ -159,9 +179,11 @@ void closid_free(int closid) * Return: true if @closid is currently associated with a resource group, * false if @closid is free */ -static bool closid_allocated(unsigned int closid) +bool closid_allocated(unsigned int closid) { - return (closid_free_map & (1 << closid)) == 0; + lockdep_assert_held(&rdtgroup_mutex); + + return !test_bit(closid, &closid_free_map); } /** @@ -559,14 +581,26 @@ static void update_task_closid_rmid(struct task_struct *t) _update_task_closid_rmid(t); } +static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) +{ + u32 closid, rmid = rdtgrp->mon.rmid; + + if (rdtgrp->type == RDTCTRL_GROUP) + closid = rdtgrp->closid; + else if (rdtgrp->type == RDTMON_GROUP) + closid = rdtgrp->mon.parent->closid; + else + return false; + + return resctrl_arch_match_closid(tsk, closid) && + resctrl_arch_match_rmid(tsk, closid, rmid); +} + static int __rdtgroup_move_task(struct task_struct *tsk, struct rdtgroup *rdtgrp) { /* If the task is already in rdtgrp, no need to move the task. */ - if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid && - tsk->rmid == rdtgrp->mon.rmid) || - (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid && - tsk->closid == rdtgrp->mon.parent->closid)) + if (task_in_rdtgroup(tsk, rdtgrp)) return 0; /* @@ -577,19 +611,19 @@ static int __rdtgroup_move_task(struct task_struct *tsk, * For monitor groups, can move the tasks only from * their parent CTRL group. 
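/*
 * Minimal sketch of the CLOSID free-map handling above, using a plain
 * unsigned long as the bitmap: bit N set means CLOSID N is free. The
 * reserved CLOSID for the default group is cleared at init time, much as
 * closid_init() does with __clear_bit().
 */
static unsigned long closid_map;

static void closid_map_init(unsigned int num_closid, unsigned int reserved)
{
	closid_map = (num_closid >= sizeof(closid_map) * 8) ?
			~0UL : (1UL << num_closid) - 1;
	closid_map &= ~(1UL << reserved);
}

static int closid_map_alloc(void)
{
	for (unsigned int i = 0; i < sizeof(closid_map) * 8; i++) {
		if (closid_map & (1UL << i)) {
			closid_map &= ~(1UL << i);
			return i;
		}
	}
	return -1;	/* cf. -ENOSPC in the kernel */
}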
*/ - - if (rdtgrp->type == RDTCTRL_GROUP) { - WRITE_ONCE(tsk->closid, rdtgrp->closid); - WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); - } else if (rdtgrp->type == RDTMON_GROUP) { - if (rdtgrp->mon.parent->closid == tsk->closid) { - WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); - } else { - rdt_last_cmd_puts("Can't move task to different control group\n"); - return -EINVAL; - } + if (rdtgrp->type == RDTMON_GROUP && + !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { + rdt_last_cmd_puts("Can't move task to different control group\n"); + return -EINVAL; } + if (rdtgrp->type == RDTMON_GROUP) + resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, + rdtgrp->mon.rmid); + else + resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, + rdtgrp->mon.rmid); + /* * Ensure the task's closid and rmid are written before determining if * the task is current that will decide if it will be interrupted. @@ -611,14 +645,15 @@ static int __rdtgroup_move_task(struct task_struct *tsk, static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) { - return (rdt_alloc_capable && - (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); + return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && + resctrl_arch_match_closid(t, r->closid)); } static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) { - return (rdt_mon_capable && - (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); + return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && + resctrl_arch_match_rmid(t, r->mon.parent->closid, + r->mon.rmid)); } /** @@ -853,7 +888,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, mutex_lock(&rdtgroup_mutex); /* Return empty if resctrl has not been mounted. */ - if (!static_branch_unlikely(&rdt_enable_key)) { + if (!resctrl_mounted) { seq_puts(s, "res:\nmon:\n"); goto unlock; } @@ -869,7 +904,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, rdtg->mode != RDT_MODE_EXCLUSIVE) continue; - if (rdtg->closid != tsk->closid) + if (!resctrl_arch_match_closid(tsk, rdtg->closid)) continue; seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? 
"/" : "", @@ -877,7 +912,8 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, seq_puts(s, "mon:"); list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, mon.crdtgrp_list) { - if (tsk->rmid != crg->mon.rmid) + if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, + crg->mon.rmid)) continue; seq_printf(s, "%s", crg->kn->name); break; @@ -982,6 +1018,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, bool sep = false; u32 ctrl_val; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->domains, list) { @@ -1042,6 +1079,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, } seq_putc(seq, '\n'); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return 0; } @@ -1297,6 +1335,9 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) struct rdt_domain *d; u32 ctrl; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) @@ -1561,6 +1602,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid struct rdt_domain *dom; bool sep = false; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); list_for_each_entry(dom, &r->domains, list) { @@ -1577,6 +1619,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid seq_puts(s, "\n"); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return 0; } @@ -1614,11 +1657,10 @@ static void mon_event_config_write(void *info) wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0); } -static int mbm_config_write_domain(struct rdt_resource *r, - struct rdt_domain *d, u32 evtid, u32 val) +static void mbm_config_write_domain(struct rdt_resource *r, + struct rdt_domain *d, u32 evtid, u32 val) { struct mon_config_info mon_info = {0}; - int ret = 0; /* * Read the current config value first. If both are the same then @@ -1627,7 +1669,7 @@ static int mbm_config_write_domain(struct rdt_resource *r, mon_info.evtid = evtid; mondata_config_read(d, &mon_info); if (mon_info.mon_config == val) - goto out; + return; mon_info.mon_config = val; @@ -1650,9 +1692,6 @@ static int mbm_config_write_domain(struct rdt_resource *r, * mbm_local and mbm_total counts for all the RMIDs. 
*/ resctrl_arch_reset_rmid_all(r, d); - -out: - return ret; } static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) @@ -1661,7 +1700,9 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) char *dom_str = NULL, *id_str; unsigned long dom_id, val; struct rdt_domain *d; - int ret = 0; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); next: if (!tok || tok[0] == '\0') @@ -1690,9 +1731,7 @@ next: list_for_each_entry(d, &r->domains, list) { if (d->id == dom_id) { - ret = mbm_config_write_domain(r, d, evtid, val); - if (ret) - return -EINVAL; + mbm_config_write_domain(r, d, evtid, val); goto next; } } @@ -1711,6 +1750,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, if (nbytes == 0 || buf[nbytes - 1] != '\n') return -EINVAL; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1720,6 +1760,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return ret ?: nbytes; } @@ -1735,6 +1776,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, if (nbytes == 0 || buf[nbytes - 1] != '\n') return -EINVAL; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1744,6 +1786,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return ret ?: nbytes; } @@ -2220,6 +2263,9 @@ static int set_cache_qos_cfg(int level, bool enable) struct rdt_domain *d; int cpu; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (level == RDT_RESOURCE_L3) update = l3_qos_cfg_update; else if (level == RDT_RESOURCE_L2) @@ -2419,6 +2465,7 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) rdtgroup_kn_get(rdtgrp, kn); + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* Was this group deleted while we waited? */ @@ -2436,6 +2483,8 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) return; mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + rdtgroup_kn_put(rdtgrp, kn); } @@ -2586,7 +2635,7 @@ static int rdt_get_tree(struct fs_context *fc) /* * resctrl file system can only be mounted once. 
*/ - if (static_branch_unlikely(&rdt_enable_key)) { + if (resctrl_mounted) { ret = -EBUSY; goto out; } @@ -2607,7 +2656,7 @@ static int rdt_get_tree(struct fs_context *fc) closid_init(); - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) flags |= RFTYPE_MON; ret = rdtgroup_add_files(rdtgroup_default.kn, flags); @@ -2620,7 +2669,7 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_schemata_free; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = mongroup_create_dir(rdtgroup_default.kn, &rdtgroup_default, "mon_groups", &kn_mongrp); @@ -2642,18 +2691,19 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_psl; - if (rdt_alloc_capable) - static_branch_enable_cpuslocked(&rdt_alloc_enable_key); - if (rdt_mon_capable) - static_branch_enable_cpuslocked(&rdt_mon_enable_key); + if (resctrl_arch_alloc_capable()) + resctrl_arch_enable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_enable_mon(); - if (rdt_alloc_capable || rdt_mon_capable) - static_branch_enable_cpuslocked(&rdt_enable_key); + if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) + resctrl_mounted = true; if (is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; list_for_each_entry(dom, &r->domains, list) - mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); } goto out; @@ -2661,10 +2711,10 @@ static int rdt_get_tree(struct fs_context *fc) out_psl: rdt_pseudo_lock_release(); out_mondata: - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) kernfs_remove(kn_mondata); out_mongrp: - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) kernfs_remove(kn_mongrp); out_info: kernfs_remove(kn_info); @@ -2767,6 +2817,9 @@ static int reset_all_ctrls(struct rdt_resource *r) struct rdt_domain *d; int i; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -2812,8 +2865,8 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, for_each_process_thread(p, t) { if (!from || is_closid_match(t, from) || is_rmid_match(t, from)) { - WRITE_ONCE(t->closid, to->closid); - WRITE_ONCE(t->rmid, to->mon.rmid); + resctrl_arch_set_closid_rmid(t, to->closid, + to->mon.rmid); /* * Order the closid/rmid stores above before the loads @@ -2844,7 +2897,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) head = &rdtgrp->mon.crdtgrp_list; list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { - free_rmid(sentry->mon.rmid); + free_rmid(sentry->closid, sentry->mon.rmid); list_del(&sentry->mon.crdtgrp_list); if (atomic_read(&sentry->waitcount) != 0) @@ -2884,7 +2937,7 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); @@ -2919,9 +2972,11 @@ static void rdt_kill_sb(struct super_block *sb) rdtgroup_default.mode = RDT_MODE_SHAREABLE; schemata_list_destroy(); rdtgroup_destroy_root(); - static_branch_disable_cpuslocked(&rdt_alloc_enable_key); - static_branch_disable_cpuslocked(&rdt_mon_enable_key); - static_branch_disable_cpuslocked(&rdt_enable_key); + if (resctrl_arch_alloc_capable()) + resctrl_arch_disable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_disable_mon(); + resctrl_mounted = false; kernfs_kill_sb(sb); 
mutex_unlock(&rdtgroup_mutex); cpus_read_unlock(); @@ -3049,6 +3104,9 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, struct rdt_domain *dom; int ret; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + list_for_each_entry(dom, &r->domains, list) { ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); if (ret) @@ -3295,6 +3353,36 @@ out: return ret; } +static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) +{ + int ret; + + if (!resctrl_arch_mon_capable()) + return 0; + + ret = alloc_rmid(rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("Out of RMIDs\n"); + return ret; + } + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + return ret; + } + + return 0; +} + +static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) +{ + if (resctrl_arch_mon_capable()) + free_rmid(rgrp->closid, rgrp->mon.rmid); +} + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, const char *name, umode_t mode, enum rdt_group_type rtype, struct rdtgroup **r) @@ -3355,7 +3443,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, if (rtype == RDTCTRL_GROUP) { files = RFTYPE_BASE | RFTYPE_CTRL; - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) files |= RFTYPE_MON; } else { files = RFTYPE_BASE | RFTYPE_MON; @@ -3367,29 +3455,11 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; } - if (rdt_mon_capable) { - ret = alloc_rmid(); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - goto out_destroy; - } - rdtgrp->mon.rmid = ret; - - ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - goto out_idfree; - } - } - kernfs_activate(kn); - /* * The caller unlocks the parent_kn upon success. */ return 0; -out_idfree: - free_rmid(rdtgrp->mon.rmid); out_destroy: kernfs_put(rdtgrp->kn); kernfs_remove(rdtgrp->kn); @@ -3403,7 +3473,6 @@ out_unlock: static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) { kernfs_remove(rgrp->kn); - free_rmid(rgrp->mon.rmid); rdtgroup_remove(rgrp); } @@ -3425,12 +3494,21 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, prgrp = rdtgrp->mon.parent; rdtgrp->closid = prgrp->closid; + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) { + mkdir_rdt_prepare_clean(rdtgrp); + goto out_unlock; + } + + kernfs_activate(rdtgrp->kn); + /* * Add the rdtgrp to the list of rdtgrps the parent * ctrl_mon group has to track. */ list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); +out_unlock: rdtgroup_kn_unlock(parent_kn); return ret; } @@ -3461,13 +3539,20 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, ret = 0; rdtgrp->closid = closid; + + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) + goto out_closid_free; + + kernfs_activate(rdtgrp->kn); + ret = rdtgroup_init_alloc(rdtgrp); if (ret < 0) - goto out_id_free; + goto out_rmid_free; list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { /* * Create an empty mon_groups directory to hold the subset * of tasks and cpus to monitor. 
@@ -3483,7 +3568,9 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, out_del_list: list_del(&rdtgrp->rdtgroup_list); -out_id_free: +out_rmid_free: + mkdir_rdt_prepare_rmid_free(rdtgrp); +out_closid_free: closid_free(closid); out_common_fail: mkdir_rdt_prepare_clean(rdtgrp); @@ -3520,14 +3607,14 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, * allocation is supported, add a control and monitoring * subdirectory */ - if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); /* * If RDT monitoring is supported and the parent directory is a valid * "mon_groups" directory, add a monitoring subdirectory. */ - if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name)) return rdtgroup_mkdir_mon(parent_kn, name, mode); return -EPERM; @@ -3552,7 +3639,7 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); /* * Remove the rdtgrp from the parent ctrl_mon group's list @@ -3598,8 +3685,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); update_closid_rmid(tmpmask, NULL); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); closid_free(rdtgrp->closid); - free_rmid(rdtgrp->mon.rmid); rdtgroup_ctrl_remove(rdtgrp); @@ -3831,8 +3918,8 @@ static void __init rdtgroup_setup_default(void) { mutex_lock(&rdtgroup_mutex); - rdtgroup_default.closid = 0; - rdtgroup_default.mon.rmid = 0; + rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; + rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; rdtgroup_default.type = RDTCTRL_GROUP; INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); @@ -3850,24 +3937,24 @@ static void domain_destroy_mon_state(struct rdt_domain *d) void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) { - lockdep_assert_held(&rdtgroup_mutex); + mutex_lock(&rdtgroup_mutex); if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) mba_sc_domain_destroy(r, d); if (!r->mon_capable) - return; + goto out_unlock; /* * If resctrl is mounted, remove all the * per domain monitor data directories. */ - if (static_branch_unlikely(&rdt_mon_enable_key)) + if (resctrl_mounted && resctrl_arch_mon_capable()) rmdir_mondata_subdir_allrdtgrp(r, d->id); if (is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + if (is_llc_occupancy_enabled() && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. 
There is no way to know @@ -3881,20 +3968,24 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) } domain_destroy_mon_state(d); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); } static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) { + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize; if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); + d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } if (is_mbm_total_enabled()) { tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_total) { bitmap_free(d->rmid_busy_llc); return -ENOMEM; @@ -3902,7 +3993,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) } if (is_mbm_local_enabled()) { tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_local) { bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); @@ -3915,34 +4006,97 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) { - int err; + int err = 0; - lockdep_assert_held(&rdtgroup_mutex); + mutex_lock(&rdtgroup_mutex); - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { /* RDT_RESOURCE_MBA is never mon_capable */ - return mba_sc_domain_allocate(r, d); + err = mba_sc_domain_allocate(r, d); + goto out_unlock; + } if (!r->mon_capable) - return 0; + goto out_unlock; err = domain_setup_mon_state(r, d); if (err) - return err; + goto out_unlock; if (is_mbm_enabled()) { INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); } if (is_llc_occupancy_enabled()) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - /* If resctrl is mounted, add per domain monitor data directories. */ - if (static_branch_unlikely(&rdt_mon_enable_key)) + /* + * If the filesystem is not mounted then only the default resource group + * exists. Creation of its directories is deferred until mount time + * by rdt_get_tree() calling mkdir_mondata_all(). + * If resctrl is mounted, add per domain monitor data directories. + */ + if (resctrl_mounted && resctrl_arch_mon_capable()) mkdir_mondata_subdir_allrdtgrp(r, d); - return 0; +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +void resctrl_online_cpu(unsigned int cpu) +{ + mutex_lock(&rdtgroup_mutex); + /* The CPU is set in default rdtgroup after online. 
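/*
 * Minimal sketch of how the per-domain monitor state above is sized: the
 * arrays are dimensioned by the number of (CLOSID, RMID) index values
 * rather than by num_rmid. The MPAM-style product below is an assumption
 * for illustration; on x86 resctrl_arch_system_num_rmid_idx() is simply
 * the number of RMIDs.
 */
static inline unsigned int num_rmid_idx(unsigned int num_closid,
					unsigned int num_rmid,
					int rmid_depends_on_closid)
{
	return rmid_depends_on_closid ? num_closid * num_rmid : num_rmid;
}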
*/ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + mutex_unlock(&rdtgroup_mutex); +} + +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) + break; + } +} + +void resctrl_offline_cpu(unsigned int cpu) +{ + struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + struct rdtgroup *rdtgrp; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); + break; + } + } + + if (!l3->mon_capable) + goto out_unlock; + + d = get_domain_from_cpu(cpu, l3); + if (d) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0, cpu); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0, cpu); + } + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); } /* diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 0ebca40dfd..af5aa2c754 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -50,6 +50,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index dc13670356..621a151ccf 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -1,167 +1,571 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only /* - * Check for extended topology enumeration cpuid leaf 0xb and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. + * CPU/APIC topology + * + * The APIC IDs describe the system topology in multiple domain levels. + * The CPUID topology parser provides the information which part of the + * APIC ID is associated to the individual levels: + * + * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD] + * + * The root space contains the package (socket) IDs. + * + * Not enumerated levels consume 0 bits space, but conceptually they are + * always represented. If e.g. only CORE and THREAD levels are enumerated + * then the DIE, MODULE and TILE have the same physical ID as the PACKAGE. + * + * If SMT is not supported, then the THREAD domain is still used. It then + * has the same physical ID as the CORE domain and is the only child of + * the core domain. + * + * This allows a unified view on the system independent of the enumerated + * domain levels without requiring any conditionals in the code. 
*/ - +#define pr_fmt(fmt) "CPU topo: " fmt #include <linux/cpu.h> + +#include <xen/xen.h> + #include <asm/apic.h> -#include <asm/memtype.h> -#include <asm/processor.h> +#include <asm/hypervisor.h> +#include <asm/io_apic.h> +#include <asm/mpspec.h> +#include <asm/smp.h> #include "cpu.h" -/* leaf 0xb SMT level */ -#define SMT_LEVEL 0 +/* + * Map cpu index to physical APIC ID + */ +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); -/* extended topology sub-leaf types */ -#define INVALID_TYPE 0 -#define SMT_TYPE 1 -#define CORE_TYPE 2 -#define DIE_TYPE 5 +/* Bitmap of physically present CPUs. */ +DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly; -#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) -#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) -#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) +/* Used for CPU number allocation and parallel CPU bringup */ +u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, }; -unsigned int __max_die_per_package __read_mostly = 1; -EXPORT_SYMBOL(__max_die_per_package); +/* Bitmaps to mark registered APICs at each topology domain */ +static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init; -#ifdef CONFIG_SMP /* - * Check if given CPUID extended topology "leaf" is implemented + * Keep track of assigned, disabled and rejected CPUs. Present assigned + * with 1 as CPU #0 is reserved for the boot CPU. */ -static int check_extended_topology_leaf(int leaf) -{ - unsigned int eax, ebx, ecx, edx; +static struct { + unsigned int nr_assigned_cpus; + unsigned int nr_disabled_cpus; + unsigned int nr_rejected_cpus; + u32 boot_cpu_apic_id; + u32 real_bsp_apic_id; +} topo_info __ro_after_init = { + .nr_assigned_cpus = 1, + .boot_cpu_apic_id = BAD_APICID, + .real_bsp_apic_id = BAD_APICID, +}; - cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); +#define domain_weight(_dom) bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC) - if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) - return -1; +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == (u64)cpuid_to_apicid[cpu]; +} - return 0; +#ifdef CONFIG_SMP +static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) +{ + if (!(apicid & (__max_threads_per_core - 1))) + cpumask_set_cpu(cpu, &__cpu_primary_thread_mask); } +#else +static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { } +#endif + /* - * Return best CPUID Extended Topology Leaf supported + * Convert the APIC ID to a domain level ID by masking out the low bits + * below the domain level @dom. 
*/ -static int detect_extended_topology_leaf(struct cpuinfo_x86 *c) +static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom) { - if (c->cpuid_level >= 0x1f) { - if (check_extended_topology_leaf(0x1f) == 0) - return 0x1f; - } + if (dom == TOPO_SMT_DOMAIN) + return apicid; + return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]); +} - if (c->cpuid_level >= 0xb) { - if (check_extended_topology_leaf(0xb) == 0) - return 0xb; - } +static int topo_lookup_cpuid(u32 apic_id) +{ + int i; - return -1; + /* CPU# to APICID mapping is persistent once it is established */ + for (i = 0; i < topo_info.nr_assigned_cpus; i++) { + if (cpuid_to_apicid[i] == apic_id) + return i; + } + return -ENODEV; } -#endif -int detect_extended_topology_early(struct cpuinfo_x86 *c) +static __init int topo_get_cpunr(u32 apic_id) { -#ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx; - int leaf; + int cpu = topo_lookup_cpuid(apic_id); - leaf = detect_extended_topology_leaf(c); - if (leaf < 0) - return -1; + if (cpu >= 0) + return cpu; - set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + return topo_info.nr_assigned_cpus++; +} - cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - /* - * initial apic id, which also represents 32-bit extended x2apic id. - */ - c->topo.initial_apicid = edx; - smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); +static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) + early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id; + early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id; #endif - return 0; + set_cpu_present(cpu, true); } -/* - * Check for extended topology enumeration cpuid leaf, and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. - */ -int detect_extended_topology(struct cpuinfo_x86 *c) +static __init bool check_for_real_bsp(u32 apic_id) { -#ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; - unsigned int core_select_mask, core_level_siblings; - unsigned int die_select_mask, die_level_siblings; - unsigned int pkg_mask_width; - bool die_level_present = false; - int leaf; - - leaf = detect_extended_topology_leaf(c); - if (leaf < 0) - return -1; + bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6; + u64 msr; /* - * Populate HT related information from sub-leaf level 0. + * There is no real good way to detect whether this a kdump() + * kernel, but except on the Voyager SMP monstrosity which is not + * longer supported, the real BSP APIC ID is the first one which is + * enumerated by firmware. That allows to detect whether the boot + * CPU is the real BSP. If it is not, then do not register the APIC + * because sending INIT to the real BSP would reset the whole + * system. + * + * The first APIC ID which is enumerated by firmware is detectable + * because the boot CPU APIC ID is registered before that without + * invoking this code. 
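/*
 * Minimal sketch of the domain-level masking done by topo_apicid():
 * everything below the shift recorded for the previous domain level is
 * cleared, so the result is the ID of the enclosing unit. The shift table
 * here is a hypothetical stand-in for x86_topo_system.dom_shifts[].
 */
#include <stdint.h>
#include <limits.h>

enum topo_dom { DOM_SMT, DOM_CORE, DOM_PKG, DOM_MAX };

static const unsigned int dom_shift[DOM_MAX] = { 1, 5, 8 };	/* hypothetical widths */

static uint32_t apicid_at_level(uint32_t apicid, enum topo_dom dom)
{
	if (dom == DOM_SMT)
		return apicid;
	return apicid & (UINT_MAX << dom_shift[dom - 1]);
}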
*/ - cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - c->topo.initial_apicid = edx; - core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); - core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - - sub_index = 1; - while (true) { - cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); + if (topo_info.real_bsp_apic_id != BAD_APICID) + return false; + /* + * Check whether the enumeration order is broken by evaluating the + * BSP bit in the APICBASE MSR. If the CPU does not have the + * APICBASE MSR then the BSP detection is not possible and the + * kernel must rely on the firmware enumeration order. + */ + if (has_apic_base) { + rdmsrl(MSR_IA32_APICBASE, msr); + is_bsp = !!(msr & MSR_IA32_APICBASE_BSP); + } + + if (apic_id == topo_info.boot_cpu_apic_id) { /* - * Check for the Core type in the implemented sub leaves. + * If the boot CPU has the APIC BSP bit set then the + * firmware enumeration is agreeing. If the CPU does not + * have the APICBASE MSR then the only choice is to trust + * the enumeration order. */ - if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { - core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - die_level_siblings = core_level_siblings; - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - } - if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) { - die_level_present = true; - die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + if (is_bsp || !has_apic_base) { + topo_info.real_bsp_apic_id = apic_id; + return false; } + /* + * If the boot APIC is enumerated first, but the APICBASE + * MSR does not have the BSP bit set, then there is no way + * to discover the real BSP here. Assume a crash kernel and + * limit the number of CPUs to 1 as an INIT to the real BSP + * would reset the machine. + */ + pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id); + pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n"); + set_nr_cpu_ids(1); + goto fwbug; + } - if (LEAFB_SUBTYPE(ecx) != INVALID_TYPE) - pkg_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - else - break; + pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n", + topo_info.boot_cpu_apic_id, apic_id); - sub_index++; + if (is_bsp) { + /* + * The boot CPU has the APIC BSP bit set. Use it and complain + * about the broken firmware enumeration. + */ + topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id; + goto fwbug; } - core_select_mask = (~(-1 << pkg_mask_width)) >> ht_mask_width; - die_select_mask = (~(-1 << die_plus_mask_width)) >> - core_plus_mask_width; + pr_warn("Crash kernel detected. 
Disabling real BSP to prevent machine INIT\n"); + + topo_info.real_bsp_apic_id = apic_id; + return true; + +fwbug: + pr_warn(FW_BUG "APIC enumeration order not specification compliant\n"); + return false; +} + +static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level, + unsigned long *map) +{ + unsigned int id, end, cnt = 0; + + /* Calculate the exclusive end */ + end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]); + + /* Unfortunately there is no bitmap_weight_range() */ + for (id = find_next_bit(map, end, lvlid); id < end; id = find_next_bit(map, end, ++id)) + cnt++; + return cnt; +} + +static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present) +{ + int cpu, dom; + + if (present) { + set_bit(apic_id, phys_cpu_present_map); + + /* + * Double registration is valid in case of the boot CPU + * APIC because that is registered before the enumeration + * of the APICs via firmware parsers or VM guest + * mechanisms. + */ + if (apic_id == topo_info.boot_cpu_apic_id) + cpu = 0; + else + cpu = topo_get_cpunr(apic_id); + + cpuid_to_apicid[cpu] = apic_id; + topo_set_cpuids(cpu, apic_id, acpi_id); + } else { + u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN); - c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, - ht_mask_width) & core_select_mask; + /* + * Check for present APICs in the same package when running + * on bare metal. Allow the bogosity in a guest. + */ + if (hypervisor_is_type(X86_HYPER_NATIVE) && + topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) { + pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n", + apic_id); + topo_info.nr_rejected_cpus++; + return; + } - if (die_level_present) { - c->topo.die_id = apic->phys_pkg_id(c->topo.initial_apicid, - core_plus_mask_width) & die_select_mask; + topo_info.nr_disabled_cpus++; } - c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, pkg_mask_width); /* - * Reinit the apicid, now that we have extended initial_apicid. + * Register present and possible CPUs in the domain + * maps. cpu_possible_map will be updated in + * topology_init_possible_cpus() after enumeration is done. */ - c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); + for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) + set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map); +} + +/** + * topology_register_apic - Register an APIC in early topology maps + * @apic_id: The APIC ID to set up + * @acpi_id: The ACPI ID associated to the APIC + * @present: True if the corresponding CPU is present + */ +void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present) +{ + if (apic_id >= MAX_LOCAL_APIC) { + pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1); + topo_info.nr_rejected_cpus++; + return; + } + + if (check_for_real_bsp(apic_id)) { + topo_info.nr_rejected_cpus++; + return; + } + + /* CPU numbers exhausted? */ + if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) { + pr_warn_once("CPU limit of %d reached. 
Ignoring further CPUs\n", nr_cpu_ids); + topo_info.nr_rejected_cpus++; + return; + } + + topo_register_apic(apic_id, acpi_id, present); +} + +/** + * topology_register_boot_apic - Register the boot CPU APIC + * @apic_id: The APIC ID to set up + * + * Separate so CPU #0 can be assigned + */ +void __init topology_register_boot_apic(u32 apic_id) +{ + WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID); + + topo_info.boot_cpu_apic_id = apic_id; + topo_register_apic(apic_id, CPU_ACPIID_INVALID, true); +} - c->x86_max_cores = (core_level_siblings / smp_num_siblings); - __max_die_per_package = (die_level_siblings / core_level_siblings); +/** + * topology_get_logical_id - Retrieve the logical ID at a given topology domain level + * @apicid: The APIC ID for which to lookup the logical ID + * @at_level: The topology domain level to use + * + * @apicid must be a full APIC ID, not the normalized variant. It's valid to have + * all bits below the domain level specified by @at_level to be clear. So both + * real APIC IDs and backshifted normalized APIC IDs work correctly. + * + * Returns: + * - >= 0: The requested logical ID + * - -ERANGE: @apicid is out of range + * - -ENODEV: @apicid is not registered + */ +int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level) +{ + /* Remove the bits below @at_level to get the proper level ID of @apicid */ + unsigned int lvlid = topo_apicid(apicid, at_level); + + if (lvlid >= MAX_LOCAL_APIC) + return -ERANGE; + if (!test_bit(lvlid, apic_maps[at_level].map)) + return -ENODEV; + /* Get the number of set bits before @lvlid. */ + return bitmap_weight(apic_maps[at_level].map, lvlid); +} +EXPORT_SYMBOL_GPL(topology_get_logical_id); + +/** + * topology_unit_count - Retrieve the count of specified units at a given topology domain level + * @apicid: The APIC ID which specifies the search range + * @which_units: The domain level specifying the units to count + * @at_level: The domain level at which @which_units have to be counted + * + * This returns the number of possible units according to the enumerated + * information. + * + * E.g. topology_count_units(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN) + * counts the number of possible cores in the package to which @apicid + * belongs. + * + * @at_level must obviously be greater than @which_level to produce useful + * results. If @at_level is equal to @which_units the result is + * unsurprisingly 1. If @at_level is less than @which_units the results + * is by definition undefined and the function returns 0. + */ +unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units, + enum x86_topology_domains at_level) +{ + /* Remove the bits below @at_level to get the proper level ID of @apicid */ + unsigned int lvlid = topo_apicid(apicid, at_level); + + if (lvlid >= MAX_LOCAL_APIC) + return 0; + if (!test_bit(lvlid, apic_maps[at_level].map)) + return 0; + if (which_units > at_level) + return 0; + if (which_units == at_level) + return 1; + return topo_unit_count(lvlid, at_level, apic_maps[which_units].map); +} + +#ifdef CONFIG_ACPI_HOTPLUG_CPU +/** + * topology_hotplug_apic - Handle a physical hotplugged APIC after boot + * @apic_id: The APIC ID to set up + * @acpi_id: The ACPI ID associated to the APIC + */ +int topology_hotplug_apic(u32 apic_id, u32 acpi_id) +{ + int cpu; + + if (apic_id >= MAX_LOCAL_APIC) + return -EINVAL; + + /* Reject if the APIC ID was not registered during enumeration. 
*/ + if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map)) + return -ENODEV; + + cpu = topo_lookup_cpuid(apic_id); + if (cpu < 0) + return -ENOSPC; + + set_bit(apic_id, phys_cpu_present_map); + topo_set_cpuids(cpu, apic_id, acpi_id); + cpu_mark_primary_thread(cpu, apic_id); + return cpu; +} + +/** + * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot + * @cpu: The CPU number for which the APIC ID is removed + */ +void topology_hotunplug_apic(unsigned int cpu) +{ + u32 apic_id = cpuid_to_apicid[cpu]; + + if (apic_id == BAD_APICID) + return; + + per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; + clear_bit(apic_id, phys_cpu_present_map); + set_cpu_present(cpu, false); +} #endif + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned int max_possible_cpus __initdata = NR_CPUS; + +/** + * topology_apply_cmdline_limits_early - Apply topology command line limits early + * + * Ensure that command line limits are in effect before firmware parsing + * takes place. + */ +void __init topology_apply_cmdline_limits_early(void) +{ + unsigned int possible = nr_cpu_ids; + + /* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' 'noapic' */ + if (!setup_max_cpus || ioapic_is_disabled || apic_is_disabled) + possible = 1; + + /* 'possible_cpus=N' */ + possible = min_t(unsigned int, max_possible_cpus, possible); + + if (possible < nr_cpu_ids) { + pr_info("Limiting to %u possible CPUs\n", possible); + set_nr_cpu_ids(possible); + } +} + +static __init bool restrict_to_up(void) +{ + if (!smp_found_config || ioapic_is_disabled) + return true; + /* + * XEN PV is special as it does not advertise the local APIC + * properly, but provides a fake topology for it so that the + * infrastructure works. So don't apply the restrictions vs. APIC + * here. + */ + if (xen_pv_domain()) + return false; + + return apic_is_disabled; +} + +void __init topology_init_possible_cpus(void) +{ + unsigned int assigned = topo_info.nr_assigned_cpus; + unsigned int disabled = topo_info.nr_disabled_cpus; + unsigned int cnta, cntb, cpu, allowed = 1; + unsigned int total = assigned + disabled; + u32 apicid, firstid; + + /* + * If there was no APIC registered, then fake one so that the + * topology bitmap is populated. That ensures that the code below + * is valid and the various query interfaces can be used + * unconditionally. This does not affect the actual APIC code in + * any way because either the local APIC address has not been + * registered or the local APIC was disabled on the command line. + */ + if (topo_info.boot_cpu_apic_id == BAD_APICID) + topology_register_boot_apic(0); + + if (!restrict_to_up()) { + if (WARN_ON_ONCE(assigned > nr_cpu_ids)) { + disabled += assigned - nr_cpu_ids; + assigned = nr_cpu_ids; + } + allowed = min_t(unsigned int, total, nr_cpu_ids); + } + + if (total > allowed) + pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed); + + assigned = min_t(unsigned int, allowed, assigned); + disabled = allowed - assigned; + + topo_info.nr_assigned_cpus = assigned; + topo_info.nr_disabled_cpus = disabled; + + total_cpus = allowed; + set_nr_cpu_ids(allowed); + + cnta = domain_weight(TOPO_PKG_DOMAIN); + cntb = domain_weight(TOPO_DIE_DOMAIN); + __max_logical_packages = cnta; + __max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta)); + + pr_info("Max. logical packages: %3u\n", cnta); + pr_info("Max. logical dies: %3u\n", cntb); + pr_info("Max. 
dies per package: %3u\n", __max_dies_per_package); + + cnta = domain_weight(TOPO_CORE_DOMAIN); + cntb = domain_weight(TOPO_SMT_DOMAIN); + /* + * Can't use order delta here as order(cnta) can be equal + * order(cntb) even if cnta != cntb. + */ + __max_threads_per_core = DIV_ROUND_UP(cntb, cnta); + pr_info("Max. threads per core: %3u\n", __max_threads_per_core); + + firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC); + __num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN); + pr_info("Num. cores per package: %3u\n", __num_cores_per_package); + __num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN); + pr_info("Num. threads per package: %3u\n", __num_threads_per_package); + + pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled); + if (topo_info.nr_rejected_cpus) + pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus); + + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); + + /* Assign CPU numbers to non-present CPUs */ + for (apicid = 0; disabled; disabled--, apicid++) { + apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map, + MAX_LOCAL_APIC, apicid); + if (apicid >= MAX_LOCAL_APIC) + break; + cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid; + } + + for (cpu = 0; cpu < allowed; cpu++) { + apicid = cpuid_to_apicid[cpu]; + + set_cpu_possible(cpu, true); + + if (apicid == BAD_APICID) + continue; + + cpu_mark_primary_thread(cpu, apicid); + set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map)); + } +} + +/* + * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed. + */ +void __init topology_reset_possible_cpus_up(void) +{ + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); + + bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC); + if (topo_info.boot_cpu_apic_id != BAD_APICID) + set_bit(topo_info.boot_cpu_apic_id, phys_cpu_present_map); +} + +static int __init setup_possible_cpus(char *str) +{ + get_option(&str, &max_possible_cpus); return 0; } +early_param("possible_cpus", setup_possible_cpus); +#endif diff --git a/arch/x86/kernel/cpu/topology.h b/arch/x86/kernel/cpu/topology.h new file mode 100644 index 0000000000..37326297f8 --- /dev/null +++ b/arch/x86/kernel/cpu/topology.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_X86_TOPOLOGY_H +#define ARCH_X86_TOPOLOGY_H + +struct topo_scan { + struct cpuinfo_x86 *c; + unsigned int dom_shifts[TOPO_MAX_DOMAIN]; + unsigned int dom_ncpus[TOPO_MAX_DOMAIN]; + + /* Legacy CPUID[1]:EBX[23:16] number of logical processors */ + unsigned int ebx1_nproc_shift; + + /* AMD specific node ID which cannot be mapped into APIC space. 
*/ + u16 amd_nodes_per_pkg; + u16 amd_node_id; +}; + +void cpu_init_topology(struct cpuinfo_x86 *c); +void cpu_parse_topology(struct cpuinfo_x86 *c); +void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + unsigned int shift, unsigned int ncpus); +bool cpu_parse_topology_ext(struct topo_scan *tscan); +void cpu_parse_topology_amd(struct topo_scan *tscan); +void cpu_topology_fixup_amd(struct topo_scan *tscan); + +static inline u32 topo_shift_apicid(u32 apicid, enum x86_topology_domains dom) +{ + if (dom == TOPO_SMT_DOMAIN) + return apicid; + return apicid >> x86_topo_system.dom_shifts[dom - 1]; +} + +static inline u32 topo_relative_domain_id(u32 apicid, enum x86_topology_domains dom) +{ + if (dom != TOPO_SMT_DOMAIN) + apicid >>= x86_topo_system.dom_shifts[dom - 1]; + return apicid & (x86_topo_system.dom_size[dom] - 1); +} + +static inline u32 topo_domain_mask(enum x86_topology_domains dom) +{ + return (1U << x86_topo_system.dom_shifts[dom]) - 1; +} + +/* + * Update a domain level after the fact without propagating. Used to fixup + * broken CPUID enumerations. + */ +static inline void topology_update_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + unsigned int shift, unsigned int ncpus) +{ + tscan->dom_shifts[dom] = shift; + tscan->dom_ncpus[dom] = ncpus; +} + +#ifdef CONFIG_X86_LOCAL_APIC +unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units, + enum x86_topology_domains at_level); +#else +static inline unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units, + enum x86_topology_domains at_level) +{ + return 1; +} +#endif + +#endif /* ARCH_X86_TOPOLOGY_H */ diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c new file mode 100644 index 0000000000..5ee6373d4d --- /dev/null +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpu.h> + +#include <asm/apic.h> +#include <asm/memtype.h> +#include <asm/processor.h> + +#include "cpu.h" + +static bool parse_8000_0008(struct topo_scan *tscan) +{ + struct { + // ecx + u32 cpu_nthreads : 8, // Number of physical threads - 1 + : 4, // Reserved + apicid_coreid_len : 4, // Number of thread core ID bits (shift) in APIC ID + perf_tsc_len : 2, // Performance time-stamp counter size + : 14; // Reserved + } ecx; + unsigned int sft; + + if (tscan->c->extended_cpuid_level < 0x80000008) + return false; + + cpuid_leaf_reg(0x80000008, CPUID_ECX, &ecx); + + /* If the thread bits are 0, then get the shift value from ecx.cpu_nthreads */ + sft = ecx.apicid_coreid_len; + if (!sft) + sft = get_count_order(ecx.cpu_nthreads + 1); + + /* + * cpu_nthreads describes the number of threads in the package + * sft is the number of APIC ID bits per package + * + * As the number of actual threads per core is not described in + * this leaf, just set the CORE domain shift and let the later + * parsers set SMT shift. Assume one thread per core by default + * which is correct if there are no other CPUID leafs to parse. + */ + topology_update_dom(tscan, TOPO_SMT_DOMAIN, 0, 1); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, sft, ecx.cpu_nthreads + 1); + return true; +} + +static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) +{ + /* + * Starting with Fam 17h the DIE domain could probably be used to + * retrieve the node info on AMD/HYGON. Analysis of CPUID dumps + * suggests it's the topmost bit(s) of the CPU cores area, but + * that's guess work and neither enumerated nor documented. 
+ * + * Up to Fam 16h this does not work at all and the legacy node ID + * has to be used. + */ + tscan->amd_nodes_per_pkg = nr_nodes; + tscan->amd_node_id = node_id; +} + +static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) +{ + struct { + // eax + u32 ext_apic_id : 32; // Extended APIC ID + // ebx + u32 core_id : 8, // Unique per-socket logical core unit ID + core_nthreads : 8, // #Threads per core (zero-based) + : 16; // Reserved + // ecx + u32 node_id : 8, // Node (die) ID of invoking logical CPU + nnodes_per_socket : 3, // #nodes in invoking logical CPU's package/socket + : 21; // Reserved + // edx + u32 : 32; // Reserved + } leaf; + + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) + return false; + + cpuid_leaf(0x8000001e, &leaf); + + tscan->c->topo.initial_apicid = leaf.ext_apic_id; + + /* + * If leaf 0xb is available, then the domain shifts are set + * already and nothing to do here. Only valid for family >= 0x17. + */ + if (!has_0xb && tscan->c->x86 >= 0x17) { + /* + * Leaf 0x80000008 set the CORE domain shift already. + * Update the SMT domain, but do not propagate it. + */ + unsigned int nthreads = leaf.core_nthreads + 1; + + topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads); + } + + store_node(tscan, leaf.nnodes_per_socket + 1, leaf.node_id); + + if (tscan->c->x86_vendor == X86_VENDOR_AMD) { + if (tscan->c->x86 == 0x15) + tscan->c->topo.cu_id = leaf.core_id; + + cacheinfo_amd_init_llc_id(tscan->c, leaf.node_id); + } else { + /* + * Package ID is ApicId[6..] on certain Hygon CPUs. See + * commit e0ceeae708ce for explanation. The topology info + * is screwed up: The package shift is always 6 and the + * node ID is bit [4:5]. + */ + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && tscan->c->x86_model <= 0x3) { + topology_set_dom(tscan, TOPO_CORE_DOMAIN, 6, + tscan->dom_ncpus[TOPO_CORE_DOMAIN]); + } + cacheinfo_hygon_init_llc_id(tscan->c); + } + return true; +} + +static void parse_fam10h_node_id(struct topo_scan *tscan) +{ + union { + struct { + u64 node_id : 3, + nodes_per_pkg : 3, + unused : 58; + }; + u64 msr; + } nid; + + if (!boot_cpu_has(X86_FEATURE_NODEID_MSR)) + return; + + rdmsrl(MSR_FAM10H_NODE_ID, nid.msr); + store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id); + tscan->c->topo.llc_id = nid.node_id; +} + +static void legacy_set_llc(struct topo_scan *tscan) +{ + unsigned int apicid = tscan->c->topo.initial_apicid; + + /* If none of the parsers set LLC ID then use the die ID for it. */ + if (tscan->c->topo.llc_id == BAD_APICID) + tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN]; +} + +static void topoext_fixup(struct topo_scan *tscan) +{ + struct cpuinfo_x86 *c = tscan->c; + u64 msrval; + + /* Try to re-enable TopologyExtensions if switched off by BIOS */ + if (cpu_has(c, X86_FEATURE_TOPOEXT) || c->x86_vendor != X86_VENDOR_AMD || + c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) + return; + + if (msr_set_bit(0xc0011005, 54) <= 0) + return; + + rdmsrl(0xc0011005, msrval); + if (msrval & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } +} + +static void parse_topology_amd(struct topo_scan *tscan) +{ + bool has_0xb = false; + + /* + * If the extended topology leaf 0x8000_001e is available + * try to get SMT and CORE shift from leaf 0xb first, then + * try to get the CORE shift from leaf 0x8000_0008. 
+ */ + if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) + has_0xb = cpu_parse_topology_ext(tscan); + + if (!has_0xb && !parse_8000_0008(tscan)) + return; + + /* Prefer leaf 0x8000001e if available */ + if (parse_8000_001e(tscan, has_0xb)) + return; + + /* Try the NODEID MSR */ + parse_fam10h_node_id(tscan); +} + +void cpu_parse_topology_amd(struct topo_scan *tscan) +{ + tscan->amd_nodes_per_pkg = 1; + topoext_fixup(tscan); + parse_topology_amd(tscan); + legacy_set_llc(tscan); + + if (tscan->amd_nodes_per_pkg > 1) + set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM); +} + +void cpu_topology_fixup_amd(struct topo_scan *tscan) +{ + struct cpuinfo_x86 *c = tscan->c; + + /* + * Adjust the core_id relative to the node when there is more than + * one node. + */ + if (tscan->c->x86 < 0x17 && tscan->amd_nodes_per_pkg > 1) + c->topo.core_id %= tscan->dom_ncpus[TOPO_CORE_DOMAIN] / tscan->amd_nodes_per_pkg; +} diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c new file mode 100644 index 0000000000..9a6069e713 --- /dev/null +++ b/arch/x86/kernel/cpu/topology_common.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpu.h> + +#include <xen/xen.h> + +#include <asm/apic.h> +#include <asm/processor.h> +#include <asm/smp.h> + +#include "cpu.h" + +struct x86_topology_system x86_topo_system __ro_after_init; +EXPORT_SYMBOL_GPL(x86_topo_system); + +unsigned int __amd_nodes_per_pkg __ro_after_init; +EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg); + +void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + unsigned int shift, unsigned int ncpus) +{ + topology_update_dom(tscan, dom, shift, ncpus); + + /* Propagate to the upper levels */ + for (dom++; dom < TOPO_MAX_DOMAIN; dom++) { + tscan->dom_shifts[dom] = tscan->dom_shifts[dom - 1]; + tscan->dom_ncpus[dom] = tscan->dom_ncpus[dom - 1]; + } +} + +static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c) +{ + struct { + u32 cache_type : 5, + unused : 21, + ncores : 6; + } eax; + + if (c->cpuid_level < 4) + return 1; + + cpuid_subleaf_reg(4, 0, CPUID_EAX, &eax); + if (!eax.cache_type) + return 1; + + return eax.ncores + 1; +} + +static void parse_legacy(struct topo_scan *tscan) +{ + unsigned int cores, core_shift, smt_shift = 0; + struct cpuinfo_x86 *c = tscan->c; + + cores = parse_num_cores_legacy(c); + core_shift = get_count_order(cores); + + if (cpu_has(c, X86_FEATURE_HT)) { + if (!WARN_ON_ONCE(tscan->ebx1_nproc_shift < core_shift)) + smt_shift = tscan->ebx1_nproc_shift - core_shift; + /* + * The parser expects leaf 0xb/0x1f format, which means + * the number of logical processors at core level is + * counting threads. + */ + core_shift += smt_shift; + cores <<= smt_shift; + } + + topology_set_dom(tscan, TOPO_SMT_DOMAIN, smt_shift, 1U << smt_shift); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, core_shift, cores); +} + +static bool fake_topology(struct topo_scan *tscan) +{ + /* + * Preset the CORE level shift for CPUID less systems and XEN_PV, + * which has useless CPUID information. 
+ */ + topology_set_dom(tscan, TOPO_SMT_DOMAIN, 0, 1); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, 0, 1); + + return tscan->c->cpuid_level < 1; +} + +static void parse_topology(struct topo_scan *tscan, bool early) +{ + const struct cpuinfo_topology topo_defaults = { + .cu_id = 0xff, + .llc_id = BAD_APICID, + .l2c_id = BAD_APICID, + }; + struct cpuinfo_x86 *c = tscan->c; + struct { + u32 unused0 : 16, + nproc : 8, + apicid : 8; + } ebx; + + c->topo = topo_defaults; + + if (fake_topology(tscan)) + return; + + /* Preset Initial APIC ID from CPUID leaf 1 */ + cpuid_leaf_reg(1, CPUID_EBX, &ebx); + c->topo.initial_apicid = ebx.apicid; + + /* + * The initial invocation from early_identify_cpu() happens before + * the APIC is mapped or X2APIC enabled. For establishing the + * topology, that's not required. Use the initial APIC ID. + */ + if (early) + c->topo.apicid = c->topo.initial_apicid; + else + c->topo.apicid = read_apic_id(); + + /* The above is sufficient for UP */ + if (!IS_ENABLED(CONFIG_SMP)) + return; + + tscan->ebx1_nproc_shift = get_count_order(ebx.nproc); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + if (IS_ENABLED(CONFIG_CPU_SUP_AMD)) + cpu_parse_topology_amd(tscan); + break; + case X86_VENDOR_CENTAUR: + case X86_VENDOR_ZHAOXIN: + parse_legacy(tscan); + break; + case X86_VENDOR_INTEL: + if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan)) + parse_legacy(tscan); + break; + case X86_VENDOR_HYGON: + if (IS_ENABLED(CONFIG_CPU_SUP_HYGON)) + cpu_parse_topology_amd(tscan); + break; + } +} + +static void topo_set_ids(struct topo_scan *tscan, bool early) +{ + struct cpuinfo_x86 *c = tscan->c; + u32 apicid = c->topo.apicid; + + c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN); + c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN); + + if (!early) { + c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); + c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); + } + + /* Package relative core ID */ + c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >> + x86_topo_system.dom_shifts[TOPO_SMT_DOMAIN]; + + c->topo.amd_node_id = tscan->amd_node_id; + + if (c->x86_vendor == X86_VENDOR_AMD) + cpu_topology_fixup_amd(tscan); +} + +void cpu_parse_topology(struct cpuinfo_x86 *c) +{ + unsigned int dom, cpu = smp_processor_id(); + struct topo_scan tscan = { .c = c, }; + + parse_topology(&tscan, false); + + if (IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { + if (c->topo.initial_apicid != c->topo.apicid) { + pr_err(FW_BUG "CPU%4u: APIC ID mismatch. CPUID: 0x%04x APIC: 0x%04x\n", + cpu, c->topo.initial_apicid, c->topo.apicid); + } + + if (c->topo.apicid != cpuid_to_apicid[cpu]) { + pr_err(FW_BUG "CPU%4u: APIC ID mismatch. Firmware: 0x%04x APIC: 0x%04x\n", + cpu, cpuid_to_apicid[cpu], c->topo.apicid); + } + } + + for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) { + if (tscan.dom_shifts[dom] == x86_topo_system.dom_shifts[dom]) + continue; + pr_err(FW_BUG "CPU%d: Topology domain %u shift %u != %u\n", cpu, dom, + tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]); + } + + topo_set_ids(&tscan, false); +} + +void __init cpu_init_topology(struct cpuinfo_x86 *c) +{ + struct topo_scan tscan = { .c = c, }; + unsigned int dom, sft; + + parse_topology(&tscan, true); + + /* Copy the shift values and calculate the unit sizes. 
*/ + memcpy(x86_topo_system.dom_shifts, tscan.dom_shifts, sizeof(x86_topo_system.dom_shifts)); + + dom = TOPO_SMT_DOMAIN; + x86_topo_system.dom_size[dom] = 1U << x86_topo_system.dom_shifts[dom]; + + for (dom++; dom < TOPO_MAX_DOMAIN; dom++) { + sft = x86_topo_system.dom_shifts[dom] - x86_topo_system.dom_shifts[dom - 1]; + x86_topo_system.dom_size[dom] = 1U << sft; + } + + topo_set_ids(&tscan, true); + + /* + * AMD systems have Nodes per package which cannot be mapped to + * APIC ID. + */ + __amd_nodes_per_pkg = tscan.amd_nodes_per_pkg; +} diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c new file mode 100644 index 0000000000..e477228cd5 --- /dev/null +++ b/arch/x86/kernel/cpu/topology_ext.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpu.h> + +#include <asm/apic.h> +#include <asm/memtype.h> +#include <asm/processor.h> + +#include "cpu.h" + +enum topo_types { + INVALID_TYPE = 0, + SMT_TYPE = 1, + CORE_TYPE = 2, + MAX_TYPE_0B = 3, + MODULE_TYPE = 3, + TILE_TYPE = 4, + DIE_TYPE = 5, + DIEGRP_TYPE = 6, + MAX_TYPE_1F = 7, +}; + +/* + * Use a lookup table for the case that there are future types > 6 which + * describe an intermediate domain level which does not exist today. + */ +static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = { + [SMT_TYPE] = TOPO_SMT_DOMAIN, + [CORE_TYPE] = TOPO_CORE_DOMAIN, + [MODULE_TYPE] = TOPO_MODULE_DOMAIN, + [TILE_TYPE] = TOPO_TILE_DOMAIN, + [DIE_TYPE] = TOPO_DIE_DOMAIN, + [DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN, +}; + +static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf, + unsigned int *last_dom) +{ + unsigned int dom, maxtype; + const unsigned int *map; + struct { + // eax + u32 x2apic_shift : 5, // Number of bits to shift APIC ID right + // for the topology ID at the next level + : 27; // Reserved + // ebx + u32 num_processors : 16, // Number of processors at current level + : 16; // Reserved + // ecx + u32 level : 8, // Current topology level. Same as sub leaf number + type : 8, // Level type. If 0, invalid + : 16; // Reserved + // edx + u32 x2apic_id : 32; // X2APIC ID of the current logical processor + } sl; + + switch (leaf) { + case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break; + case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break; + default: return false; + } + + cpuid_subleaf(leaf, subleaf, &sl); + + if (!sl.num_processors || sl.type == INVALID_TYPE) + return false; + + if (sl.type >= maxtype) { + pr_err_once("Topology: leaf 0x%x:%d Unknown domain type %u\n", + leaf, subleaf, sl.type); + /* + * It really would have been too obvious to make the domain + * type space sparse and leave a few reserved types between + * the points which might change instead of following the + * usual "this can be fixed in software" principle. 
+ */ + dom = *last_dom + 1; + } else { + dom = map[sl.type]; + *last_dom = dom; + } + + if (!dom) { + tscan->c->topo.initial_apicid = sl.x2apic_id; + } else if (tscan->c->topo.initial_apicid != sl.x2apic_id) { + pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf %d APIC ID mismatch %x != %x\n", + leaf, subleaf, tscan->c->topo.initial_apicid, sl.x2apic_id); + } + + topology_set_dom(tscan, dom, sl.x2apic_shift, sl.num_processors); + return true; +} + +static bool parse_topology_leaf(struct topo_scan *tscan, u32 leaf) +{ + unsigned int last_dom; + u32 subleaf; + + /* Read all available subleafs and populate the levels */ + for (subleaf = 0, last_dom = 0; topo_subleaf(tscan, leaf, subleaf, &last_dom); subleaf++); + + /* If subleaf 0 failed to parse, give up */ + if (!subleaf) + return false; + + /* + * There are machines in the wild which have shift 0 in the subleaf + * 0, but advertise 2 logical processors at that level. They are + * truly SMT. + */ + if (!tscan->dom_shifts[TOPO_SMT_DOMAIN] && tscan->dom_ncpus[TOPO_SMT_DOMAIN] > 1) { + unsigned int sft = get_count_order(tscan->dom_ncpus[TOPO_SMT_DOMAIN]); + + pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf 0 has shift level 0 but %u CPUs. Fixing it up.\n", + leaf, tscan->dom_ncpus[TOPO_SMT_DOMAIN]); + topology_update_dom(tscan, TOPO_SMT_DOMAIN, sft, tscan->dom_ncpus[TOPO_SMT_DOMAIN]); + } + + set_cpu_cap(tscan->c, X86_FEATURE_XTOPOLOGY); + return true; +} + +bool cpu_parse_topology_ext(struct topo_scan *tscan) +{ + /* Intel: Try leaf 0x1F first. */ + if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f)) + return true; + + /* Intel/AMD: Fall back to leaf 0xB if available */ + return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b); +} diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 415564a652..90eba7eb53 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -71,10 +71,6 @@ static void init_zhaoxin(struct cpuinfo_x86 *c) { early_init_zhaoxin(c); init_intel_cacheinfo(c); - detect_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif if (c->cpuid_level > 9) { unsigned int eax = cpuid_eax(10); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index b6b044356f..e74d0c4286 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -26,6 +26,7 @@ #include <linux/vmalloc.h> #include <linux/memblock.h> +#include <asm/bootparam.h> #include <asm/processor.h> #include <asm/hardirq.h> #include <asm/nmi.h> @@ -40,6 +41,7 @@ #include <asm/intel_pt.h> #include <asm/crash.h> #include <asm/cmdline.h> +#include <asm/sev.h> /* Used while preparing memory map entries for second kernel */ struct crash_memmap_data { @@ -59,6 +61,8 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) */ cpu_emergency_stop_pt(); + kdump_sev_callback(); + disable_local_APIC(); } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index afd0992409..003e0298f4 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -136,7 +136,7 @@ static void __init dtb_cpu_setup(void) pr_warn("%pOF: missing local APIC ID\n", dn); continue; } - generic_processor_info(apic_id); + topology_register_apic(apic_id, CPU_ACPIID_INVALID, true); } } @@ -283,26 +283,28 @@ void __init x86_flattree_get_config(void) u32 size, map_len; void *dt; - if (!initial_dtb) - return; + if (initial_dtb) { + map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); - map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); + dt = 
early_memremap(initial_dtb, map_len); + size = fdt_totalsize(dt); + if (map_len < size) { + early_memunmap(dt, map_len); + dt = early_memremap(initial_dtb, size); + map_len = size; + } - dt = early_memremap(initial_dtb, map_len); - size = fdt_totalsize(dt); - if (map_len < size) { - early_memunmap(dt, map_len); - dt = early_memremap(initial_dtb, size); - map_len = size; + early_init_dt_verify(dt); } - early_init_dt_verify(dt); unflatten_and_copy_device_tree(); - early_memunmap(dt, map_len); + + if (initial_dtb) + early_memunmap(dt, map_len); } #endif -void __init x86_dtb_init(void) +void __init x86_dtb_parse_smp_config(void) { if (!of_have_populated_dt()) return; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f18ca44c90..44a91ef5a2 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -410,7 +410,7 @@ static void __die_header(const char *str, struct pt_regs *regs, long err) IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", - IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? + IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) ? (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); } NOKPROBE_SYMBOL(__die_header); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b66f540de0..68b09f718f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -532,9 +532,10 @@ u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum return __e820__range_update(e820_table, start, size, old_type, new_type); } -static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size, + enum e820_type old_type, enum e820_type new_type) { - return __e820__range_update(e820_table_kexec, start, size, old_type, new_type); + return __e820__range_update(t, start, size, old_type, new_type); } /* Remove a range of memory from the E820 table: */ @@ -806,7 +807,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) addr = memblock_phys_alloc(size, align); if (addr) { - e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_kexec, addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n"); e820__update_table_kexec(); } @@ -1016,17 +1017,6 @@ void __init e820__reserve_setup_data(void) e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - /* - * SETUP_EFI, SETUP_IMA and SETUP_RNG_SEED are supplied by - * kexec and do not need to be reserved. 
- */ - if (data->type != SETUP_EFI && - data->type != SETUP_IMA && - data->type != SETUP_RNG_SEED) - e820__range_update_kexec(pa_data, - sizeof(*data) + data->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - if (data->type == SETUP_INDIRECT) { len += data->len; early_memunmap(data, sizeof(*data)); @@ -1038,12 +1028,9 @@ void __init e820__reserve_setup_data(void) indirect = (struct setup_indirect *)data->data; - if (indirect->type != SETUP_INDIRECT) { + if (indirect->type != SETUP_INDIRECT) e820__range_update(indirect->addr, indirect->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - e820__range_update_kexec(indirect->addr, indirect->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - } } pa_data = pa_next; @@ -1051,7 +1038,6 @@ void __init e820__reserve_setup_data(void) } e820__update_table(e820_table); - e820__update_table(e820_table_kexec); pr_info("extended physical RAM map:\n"); e820__print_table("reserve setup_data"); diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 16f9814c9b..6726e0473d 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -106,6 +106,10 @@ void __init init_espfix_bsp(void) pgd_t *pgd; p4d_t *p4d; + /* FRED systems always restore the full value of %rsp */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) + return; + /* Install the espfix pud into the kernel page directory */ pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)]; p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); @@ -129,6 +133,10 @@ void init_espfix_ap(int cpu) void *stack_page; pteval_t ptemask; + /* FRED systems always restore the full value of %rsp */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) + return; + /* We only have to do this once... */ if (likely(per_cpu(espfix_stack, cpu))) return; /* Already initialized */ diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index a06b876bbf..edbafc5940 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -2,6 +2,8 @@ /* * x86 FPU bug checks: */ +#include <linux/printk.h> + #include <asm/cpufeature.h> #include <asm/fpu/api.h> diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c new file mode 100644 index 0000000000..4bcd8791ad --- /dev/null +++ b/arch/x86/kernel/fred.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/kernel.h> + +#include <asm/desc.h> +#include <asm/fred.h> +#include <asm/tlbflush.h> +#include <asm/traps.h> + +/* #DB in the kernel would imply the use of a kernel debugger. */ +#define FRED_DB_STACK_LEVEL 1UL +#define FRED_NMI_STACK_LEVEL 2UL +#define FRED_MC_STACK_LEVEL 2UL +/* + * #DF is the highest level because a #DF means "something went wrong + * *while delivering an exception*." The number of cases for which that + * can happen with FRED is drastically reduced and basically amounts to + * "the stack you pointed me to is broken." Thus, always change stacks + * on #DF, which means it should be at the highest level. 
+ */ +#define FRED_DF_STACK_LEVEL 3UL + +#define FRED_STKLVL(vector, lvl) ((lvl) << (2 * (vector))) + +void cpu_init_fred_exceptions(void) +{ + /* When FRED is enabled by default, remove this log message */ + pr_info("Initialize FRED on CPU%d\n", smp_processor_id()); + + wrmsrl(MSR_IA32_FRED_CONFIG, + /* Reserve for CALL emulation */ + FRED_CONFIG_REDZONE | + FRED_CONFIG_INT_STKLVL(0) | + FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user)); + + /* + * The purpose of separate stacks for NMI, #DB and #MC *in the kernel* + * (remember that user space faults are always taken on stack level 0) + * is to avoid overflowing the kernel stack. + */ + wrmsrl(MSR_IA32_FRED_STKLVLS, + FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL)); + + /* The FRED equivalents to IST stacks... */ + wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB)); + wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI)); + wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF)); + + /* Enable FRED */ + cr4_set_bits(X86_CR4_FRED); + /* Any further IDT use is a bug */ + idt_invalidate(); + + /* Use int $0x80 for 32-bit system calls in FRED mode */ + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); +} diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 12df54ff0e..70139d9d2e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -307,7 +307,8 @@ union ftrace_op_code_union { } __attribute__((packed)); }; -#define RET_SIZE (IS_ENABLED(CONFIG_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_SLS)) +#define RET_SIZE \ + (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_MITIGATION_SLS)) static unsigned long create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index dc09560679..a817ed0724 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -22,6 +22,8 @@ #include <linux/cc_platform.h> #include <linux/pgtable.h> +#include <asm/asm.h> +#include <asm/page_64.h> #include <asm/processor.h> #include <asm/proto.h> #include <asm/smp.h> @@ -67,42 +69,11 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; EXPORT_SYMBOL(vmemmap_base); #endif -/* - * GDT used on the boot CPU before switching to virtual addresses. - */ -static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = { - [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff), - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff), - [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff), -}; - -/* - * Address needs to be set at runtime because it references the startup_gdt - * while the kernel still uses a direct mapping. 
- */ -static struct desc_ptr startup_gdt_descr __initdata = { - .size = sizeof(startup_gdt)-1, - .address = 0, -}; - -static void __head *fixup_pointer(void *ptr, unsigned long physaddr) -{ - return ptr - (void *)_text + (void *)physaddr; -} - -static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr) +static inline bool check_la57_support(void) { - return fixup_pointer(ptr, physaddr); -} - -#ifdef CONFIG_X86_5LEVEL -static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) -{ - return fixup_pointer(ptr, physaddr); -} + if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + return false; -static bool __head check_la57_support(unsigned long physaddr) -{ /* * 5-level paging is detected and enabled at kernel decompression * stage. Only check if it has been enabled there. @@ -110,21 +81,15 @@ static bool __head check_la57_support(unsigned long physaddr) if (!(native_read_cr4() & X86_CR4_LA57)) return false; - *fixup_int(&__pgtable_l5_enabled, physaddr) = 1; - *fixup_int(&pgdir_shift, physaddr) = 48; - *fixup_int(&ptrs_per_p4d, physaddr) = 512; - *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5; - *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5; - *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5; + RIP_REL_REF(__pgtable_l5_enabled) = 1; + RIP_REL_REF(pgdir_shift) = 48; + RIP_REL_REF(ptrs_per_p4d) = 512; + RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5; + RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5; + RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5; return true; } -#else -static bool __head check_la57_support(unsigned long physaddr) -{ - return false; -} -#endif static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd) { @@ -173,23 +138,22 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv * doesn't have to generate PC-relative relocations when accessing globals from * that function. Clang actually does not generate them, which leads to * boot-time crashes. To work around this problem, every global pointer must - * be adjusted using fixup_pointer(). + * be accessed using RIP_REL_REF(). */ unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { - unsigned long load_delta, *p; + pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts); unsigned long pgtable_flags; + unsigned long load_delta; pgdval_t *pgd; p4dval_t *p4d; pudval_t *pud; pmdval_t *pmd, pmd_entry; - pteval_t *mask_ptr; bool la57; int i; - unsigned int *next_pgt_ptr; - la57 = check_la57_support(physaddr); + la57 = check_la57_support(); /* Is the address too large? */ if (physaddr >> MAX_PHYSMEM_BITS) @@ -200,6 +164,7 @@ unsigned long __head __startup_64(unsigned long physaddr, * and the address I am actually running at. */ load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); + RIP_REL_REF(phys_base) = load_delta; /* Is the address not 2M aligned? 
*/ if (load_delta & ~PMD_MASK) @@ -210,26 +175,21 @@ unsigned long __head __startup_64(unsigned long physaddr, /* Fixup the physical addresses in the page table */ - pgd = fixup_pointer(early_top_pgt, physaddr); - p = pgd + pgd_index(__START_KERNEL_map); - if (la57) - *p = (unsigned long)level4_kernel_pgt; - else - *p = (unsigned long)level3_kernel_pgt; - *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta; + pgd = &RIP_REL_REF(early_top_pgt)->pgd; + pgd[pgd_index(__START_KERNEL_map)] += load_delta; if (la57) { - p4d = fixup_pointer(level4_kernel_pgt, physaddr); - p4d[511] += load_delta; + p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt); + p4d[MAX_PTRS_PER_P4D - 1] += load_delta; + + pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE; } - pud = fixup_pointer(level3_kernel_pgt, physaddr); - pud[510] += load_delta; - pud[511] += load_delta; + RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta; + RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta; - pmd = fixup_pointer(level2_fixmap_pgt, physaddr); for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) - pmd[i] += load_delta; + RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta; /* * Set up the identity mapping for the switchover. These @@ -238,15 +198,14 @@ unsigned long __head __startup_64(unsigned long physaddr, * it avoids problems around wraparound. */ - next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); - pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); - pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + pud = &early_pgts[0]->pmd; + pmd = &early_pgts[1]->pmd; + RIP_REL_REF(next_early_pgt) = 2; pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); if (la57) { - p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], - physaddr); + p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd; i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; @@ -267,8 +226,7 @@ unsigned long __head __startup_64(unsigned long physaddr, pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; /* Filter out unsupported __PAGE_KERNEL_* bits: */ - mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr); - pmd_entry &= *mask_ptr; + pmd_entry &= RIP_REL_REF(__supported_pte_mask); pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; @@ -294,7 +252,7 @@ unsigned long __head __startup_64(unsigned long physaddr, * error, causing the BIOS to halt the system. */ - pmd = fixup_pointer(level2_kernel_pgt, physaddr); + pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd; /* invalidate pages before the kernel image */ for (i = 0; i < pmd_index((unsigned long)_text); i++) @@ -309,12 +267,6 @@ unsigned long __head __startup_64(unsigned long physaddr, for (; i < PTRS_PER_PMD; i++) pmd[i] &= ~_PAGE_PRESENT; - /* - * Fixup phys_base - remove the memory encryption mask to obtain - * the true physical address. 
- */ - *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask(); - return sme_postprocess_startup(bp, pmd); } @@ -569,62 +521,52 @@ void __init __noreturn x86_64_start_reservations(char *real_mode_data) */ static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; -static struct desc_ptr bringup_idt_descr = { - .size = (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1, - .address = 0, /* Set at runtime */ -}; - -static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler) +/* This may run while still in the direct mapping */ +static void __head startup_64_load_idt(void *vc_handler) { -#ifdef CONFIG_AMD_MEM_ENCRYPT + struct desc_ptr desc = { + .address = (unsigned long)&RIP_REL_REF(bringup_idt_table), + .size = sizeof(bringup_idt_table) - 1, + }; struct idt_data data; - gate_desc desc; - - init_idt_data(&data, n, handler); - idt_init_desc(&desc, &data); - native_write_idt_entry(idt, n, &desc); -#endif -} + gate_desc idt_desc; -/* This runs while still in the direct mapping */ -static void __head startup_64_load_idt(unsigned long physbase) -{ - struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase); - gate_desc *idt = fixup_pointer(bringup_idt_table, physbase); - - - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { - void *handler; - - /* VMM Communication Exception */ - handler = fixup_pointer(vc_no_ghcb, physbase); - set_bringup_idt_handler(idt, X86_TRAP_VC, handler); + /* @vc_handler is set only for a VMM Communication Exception */ + if (vc_handler) { + init_idt_data(&data, X86_TRAP_VC, vc_handler); + idt_init_desc(&idt_desc, &data); + native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc); } - desc->address = (unsigned long)idt; - native_load_idt(desc); + native_load_idt(&desc); } /* This is used when running on kernel addresses */ void early_setup_idt(void) { - /* VMM Communication Exception */ + void *handler = NULL; + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { setup_ghcb(); - set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + handler = vc_boot_ghcb; } - bringup_idt_descr.address = (unsigned long)bringup_idt_table; - native_load_idt(&bringup_idt_descr); + startup_64_load_idt(handler); } /* * Setup boot CPU state needed before kernel switches to virtual addresses. 
*/ -void __head startup_64_setup_env(unsigned long physbase) +void __head startup_64_setup_gdt_idt(void) { + void *handler = NULL; + + struct desc_ptr startup_gdt_descr = { + .address = (unsigned long)&RIP_REL_REF(init_per_cpu_var(gdt_page.gdt)), + .size = GDT_SIZE - 1, + }; + /* Load GDT */ - startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase); native_load_gdt(&startup_gdt_descr); /* New GDT is live - reload data segment registers */ @@ -632,5 +574,8 @@ void __head startup_64_setup_env(unsigned long physbase) "movl %%eax, %%ss\n" "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); - startup_64_load_idt(physbase); + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + handler = &RIP_REL_REF(vc_no_ghcb); + + startup_64_load_idt(handler); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 487ac57e2c..b50f3641c4 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -414,7 +414,7 @@ __REFDATA .align 4 SYM_DATA(initial_code, .long i386_start_kernel) -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION #define PGD_ALIGN (2 * PAGE_SIZE) #define PTI_USER_PGD_FILL 1024 #else @@ -474,7 +474,7 @@ SYM_DATA_START(initial_page_table) # endif .align PAGE_SIZE /* needs to be page-sized too */ -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * PTI needs another page so sync_initial_pagetable() works correctly * and does not scribble over the data which is placed behind the diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d4918d03ef..d8198fbd70 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -26,6 +26,7 @@ #include <asm/apicdef.h> #include <asm/fixmap.h> #include <asm/smp.h> +#include <asm/thread_info.h> /* * We are not able to switch in one step to the final KERNEL ADDRESS SPACE @@ -39,7 +40,6 @@ L4_START_KERNEL = l4_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) - .text __HEAD .code64 SYM_CODE_START_NOALIGN(startup_64) @@ -66,9 +66,7 @@ SYM_CODE_START_NOALIGN(startup_64) mov %rsi, %r15 /* Set up the stack for verify_cpu() */ - leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp - - leaq _text(%rip), %rdi + leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp /* Setup GSBASE to allow stack canary access for C code */ movl $MSR_GS_BASE, %ecx @@ -77,7 +75,7 @@ SYM_CODE_START_NOALIGN(startup_64) shrq $32, %rdx wrmsr - call startup_64_setup_env + call startup_64_setup_gdt_idt /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS @@ -113,13 +111,11 @@ SYM_CODE_START_NOALIGN(startup_64) call __startup_64 /* Form the CR3 value being sure to include the CR3 modifier */ - addq $(early_top_pgt - __START_KERNEL_map), %rax + leaq early_top_pgt(%rip), %rcx + addq %rcx, %rax #ifdef CONFIG_AMD_MEM_ENCRYPT mov %rax, %rdi - mov %rax, %r14 - - addq phys_base(%rip), %rdi /* * For SEV guests: Verify that the C-bit is correct. A malicious @@ -128,17 +124,23 @@ SYM_CODE_START_NOALIGN(startup_64) * the next RET instruction. */ call sev_verify_cbit +#endif /* - * Restore CR3 value without the phys_base which will be added - * below, before writing %cr3. + * Switch to early_top_pgt which still has the identity mappings + * present. 
*/ - mov %r14, %rax -#endif + movq %rax, %cr3 - jmp 1f + /* Branch to the common startup code at its kernel virtual address */ + ANNOTATE_RETPOLINE_SAFE + jmp *0f(%rip) SYM_CODE_END(startup_64) + __INITRODATA +0: .quad common_startup_64 + + .text SYM_CODE_START(secondary_startup_64) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR @@ -171,22 +173,39 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) ANNOTATE_NOENDBR /* Clear %R15 which holds the boot_params pointer on the boot CPU */ - xorq %r15, %r15 + xorl %r15d, %r15d + + /* Derive the runtime physical address of init_top_pgt[] */ + movq phys_base(%rip), %rax + addq $(init_top_pgt - __START_KERNEL_map), %rax /* * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. */ #ifdef CONFIG_AMD_MEM_ENCRYPT - movq sme_me_mask, %rax -#else - xorq %rax, %rax + addq sme_me_mask(%rip), %rax #endif + /* + * Switch to the init_top_pgt here, away from the trampoline_pgd and + * unmap the identity mapped ranges. + */ + movq %rax, %cr3 - /* Form the CR3 value being sure to include the CR3 modifier */ - addq $(init_top_pgt - __START_KERNEL_map), %rax -1: +SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL) + UNWIND_HINT_END_OF_STACK + ANNOTATE_NOENDBR + /* + * Create a mask of CR4 bits to preserve. Omit PGE in order to flush + * global 1:1 translations from the TLBs. + * + * From the SDM: + * "If CR4.PGE is changing from 0 to 1, there were no global TLB + * entries before the execution; if CR4.PGE is changing from 1 to 0, + * there will be no global TLB entries after the execution." + */ + movl $(X86_CR4_PAE | X86_CR4_LA57), %edx #ifdef CONFIG_X86_MCE /* * Preserve CR4.MCE if the kernel will enable #MC support. @@ -195,52 +214,20 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * configured will crash the system regardless of the CR4.MCE value set * here. */ - movq %cr4, %rcx - andl $X86_CR4_MCE, %ecx -#else - movl $0, %ecx + orl $X86_CR4_MCE, %edx #endif + movq %cr4, %rcx + andl %edx, %ecx - /* Enable PAE mode, PSE, PGE and LA57 */ - orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx -#ifdef CONFIG_X86_5LEVEL - testb $1, __pgtable_l5_enabled(%rip) - jz 1f - orl $X86_CR4_LA57, %ecx -1: -#endif + /* Even if ignored in long mode, set PSE uniformly on all logical CPUs. */ + btsl $X86_CR4_PSE_BIT, %ecx movq %rcx, %cr4 - /* Setup early boot stage 4-/5-level pagetables. */ - addq phys_base(%rip), %rax - /* - * Switch to new page-table - * - * For the boot CPU this switches to early_top_pgt which still has the - * identity mappings present. The secondary CPUs will switch to the - * init_top_pgt here, away from the trampoline_pgd and unmap the - * identity mapped ranges. + * Set CR4.PGE to re-enable global translations. */ - movq %rax, %cr3 - - /* - * Do a global TLB flush after the CR3 switch to make sure the TLB - * entries from the identity mapping are flushed. 
- */ - movq %cr4, %rcx - movq %rcx, %rax - xorq $X86_CR4_PGE, %rcx + btsl $X86_CR4_PGE_BIT, %ecx movq %rcx, %cr4 - movq %rax, %cr4 - - /* Ensure I am executing from virtual addresses */ - movq $1f, %rax - ANNOTATE_RETPOLINE_SAFE - jmp *%rax -1: - UNWIND_HINT_END_OF_STACK - ANNOTATE_NOENDBR // above #ifdef CONFIG_SMP /* @@ -297,7 +284,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) .Llookup_AP: /* EAX contains the APIC ID of the current CPU */ - xorq %rcx, %rcx + xorl %ecx, %ecx leaq cpuid_to_apicid(%rip), %rbx .Lfind_cpunr: @@ -428,39 +415,10 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) movq %r15, %rdi .Ljump_to_C_code: - /* - * Jump to run C code and to be on a real kernel address. - * Since we are running on identity-mapped space we have to jump - * to the full 64bit address, this is only possible as indirect - * jump. In addition we need to ensure %cs is set so we make this - * a far return. - * - * Note: do not change to far jump indirect with 64bit offset. - * - * AMD does not support far jump indirect with 64bit offset. - * AMD64 Architecture Programmer's Manual, Volume 3: states only - * JMP FAR mem16:16 FF /5 Far jump indirect, - * with the target specified by a far pointer in memory. - * JMP FAR mem16:32 FF /5 Far jump indirect, - * with the target specified by a far pointer in memory. - * - * Intel64 does support 64bit offset. - * Software Developer Manual Vol 2: states: - * FF /5 JMP m16:16 Jump far, absolute indirect, - * address given in m16:16 - * FF /5 JMP m16:32 Jump far, absolute indirect, - * address given in m16:32. - * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, - * address given in m16:64. - */ - pushq $.Lafter_lret # put return address on stack for unwinder xorl %ebp, %ebp # clear frame pointer - movq initial_code(%rip), %rax - pushq $__KERNEL_CS # set correct cs - pushq %rax # target address in negative space - lretq -.Lafter_lret: - ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + callq *initial_code(%rip) + ud2 SYM_CODE_END(secondary_startup_64) #include "verify_cpu.S" @@ -477,7 +435,7 @@ SYM_CODE_START(soft_restart_cpu) UNWIND_HINT_END_OF_STACK /* Find the idle task stack */ - movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx + movq PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx movq TASK_threadsp(%rcx), %rsp jmp .Ljump_to_C_code @@ -622,7 +580,7 @@ SYM_CODE_END(vc_no_ghcb) #define SYM_DATA_START_PAGE_ALIGNED(name) \ SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Each PGD needs to be 8k long and 8k aligned. 
We do not * ever go out to userspace with these, so we do not @@ -655,7 +613,8 @@ SYM_CODE_END(vc_no_ghcb) .balign 4 SYM_DATA_START_PTI_ALIGNED(early_top_pgt) - .fill 512,8,0 + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 SYM_DATA_END(early_top_pgt) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index a38d0c93a6..c96ae8fee9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -568,7 +568,7 @@ static struct irq_domain *hpet_create_irq_domain(int hpet_id) fwspec.param_count = 1; fwspec.param[0] = hpet_id; - parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_GENERIC_MSI); if (!parent) { irq_domain_free_fwnode(fn); kfree(domain_info); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 660b601f1d..fc37c8d83d 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -153,7 +153,7 @@ static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_X86_LOCAL_APIC INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), -# ifdef CONFIG_HAVE_KVM +# if IS_ENABLED(CONFIG_KVM) INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi), INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), @@ -337,7 +337,7 @@ void idt_invalidate(void) load_idt(&idt); } -void __init alloc_intr_gate(unsigned int n, const void *addr) +void __init idt_install_sysvec(unsigned int n, const void *function) { if (WARN_ON(n < FIRST_SYSTEM_VECTOR)) return; @@ -346,5 +346,5 @@ void __init alloc_intr_gate(unsigned int n, const void *addr) return; if (!WARN_ON(test_and_set_bit(n, system_vectors))) - set_intr_gate(n, addr); + set_intr_gate(n, function); } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 11761c1245..35fde01079 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -164,7 +164,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); #endif -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) seq_printf(p, "%*s: ", prec, "PIN"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); @@ -290,7 +290,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) } #endif -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) static void dummy_handler(void) {} static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c683666876..f79c5edc0b 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -28,6 +28,7 @@ #include <asm/setup.h> #include <asm/i8259.h> #include <asm/traps.h> +#include <asm/fred.h> #include <asm/prom.h> /* @@ -96,7 +97,11 @@ void __init native_init_IRQ(void) /* Execute any quirks before the call gates are initialised: */ x86_init.irqs.pre_vector_init(); - idt_setup_apic_and_irq_gates(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_complete_exception_setup(); + else + idt_setup_apic_and_irq_gates(); + lapic_assign_system_vectors(); if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) { diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 578d16fc04..df33786061 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -89,7 +89,7 @@ static void __init jailhouse_x2apic_init(void) 
#endif } -static void __init jailhouse_get_smp_config(unsigned int early) +static void __init jailhouse_parse_smp_config(void) { struct ioapic_domain_cfg ioapic_cfg = { .type = IOAPIC_DOMAIN_STRICT, @@ -102,7 +102,7 @@ static void __init jailhouse_get_smp_config(unsigned int early) register_lapic_address(0xfee00000); for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) - generic_processor_info(setup_data.v1.cpu_ids[cpu]); + topology_register_apic(setup_data.v1.cpu_ids[cpu], CPU_ACPIID_INVALID, true); smp_found_config = 1; @@ -201,21 +201,23 @@ static void __init jailhouse_init_platform(void) struct setup_data header; void *mapping; - x86_init.irqs.pre_vector_init = x86_init_noop; - x86_init.timers.timer_init = jailhouse_timer_init; - x86_init.mpparse.get_smp_config = jailhouse_get_smp_config; - x86_init.pci.arch_init = jailhouse_pci_arch_init; + x86_init.irqs.pre_vector_init = x86_init_noop; + x86_init.timers.timer_init = jailhouse_timer_init; + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; + x86_init.mpparse.parse_smp_cfg = jailhouse_parse_smp_config; + x86_init.pci.arch_init = jailhouse_pci_arch_init; - x86_platform.calibrate_cpu = jailhouse_get_tsc; - x86_platform.calibrate_tsc = jailhouse_get_tsc; - x86_platform.get_wallclock = jailhouse_get_wallclock; - x86_platform.legacy.rtc = 0; - x86_platform.legacy.warm_reset = 0; - x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; + x86_platform.calibrate_cpu = jailhouse_get_tsc; + x86_platform.calibrate_tsc = jailhouse_get_tsc; + x86_platform.get_wallclock = jailhouse_get_wallclock; + x86_platform.legacy.rtc = 0; + x86_platform.legacy.warm_reset = 0; + x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; - legacy_pic = &null_legacy_pic; + legacy_pic = &null_legacy_pic; - machine_ops.emergency_restart = jailhouse_no_restart; + machine_ops.emergency_restart = jailhouse_no_restart; while (pa_data) { mapping = early_memremap(pa_data, sizeof(header)); diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 2a422e00ed..68530fad05 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -263,11 +263,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, memset(&params->hd0_info, 0, sizeof(params->hd0_info)); memset(&params->hd1_info, 0, sizeof(params->hd1_info)); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { ret = crash_setup_memmap_entries(image, params); if (ret) return ret; } else +#endif setup_e820_entries(params); nr_e820_entries = params->e820_entries; @@ -433,12 +435,14 @@ static void *bzImage64_load(struct kimage *image, char *kernel, return ERR_PTR(-EINVAL); } +#ifdef CONFIG_CRASH_DUMP /* Allocate and load backup region */ if (image->type == KEXEC_TYPE_CRASH) { ret = crash_load_segments(image); if (ret) return ERR_PTR(ret); } +#endif /* * Load purgatory.
For 64bit entry point, purgatory code can be @@ -503,7 +507,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel, kbuf.bufsz = kernel_len - kern16_size; kbuf.memsz = PAGE_ALIGN(header->init_size); kbuf.buf_align = header->kernel_alignment; - kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; + if (header->pref_address < MIN_KERNEL_LOAD_ADDR) + kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; + else + kbuf.buf_min = header->pref_address; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; ret = kexec_add_buffer(&kbuf); if (ret) diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index c993521d49..e772276f5a 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -78,7 +78,7 @@ #endif /* Ensure if the instruction can be boostable */ -extern int can_boost(struct insn *insn, void *orig_addr); +extern bool can_boost(struct insn *insn, void *orig_addr); /* Recover instruction if given address is probed */ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index a6a3475e1d..d0e49bd7c6 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -137,14 +137,14 @@ NOKPROBE_SYMBOL(synthesize_relcall); * Returns non-zero if INSN is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode */ -int can_boost(struct insn *insn, void *addr) +bool can_boost(struct insn *insn, void *addr) { kprobe_opcode_t opcode; insn_byte_t prefix; int i; if (search_exception_tables((unsigned long)addr)) - return 0; /* Page fault may occur on this address. */ + return false; /* Page fault may occur on this address. */ /* 2nd-byte opcode */ if (insn->opcode.nbytes == 2) @@ -152,7 +152,7 @@ int can_boost(struct insn *insn, void *addr) (unsigned long *)twobyte_is_boostable); if (insn->opcode.nbytes != 1) - return 0; + return false; for_each_insn_prefix(insn, i, prefix) { insn_attr_t attr; @@ -160,7 +160,7 @@ int can_boost(struct insn *insn, void *addr) attr = inat_get_opcode_attribute(prefix); /* Can't boost Address-size override prefix and CS override prefix */ if (prefix == 0x2e || inat_is_address_size_prefix(attr)) - return 0; + return false; } opcode = insn->opcode.bytes[0]; @@ -169,24 +169,35 @@ int can_boost(struct insn *insn, void *addr) case 0x62: /* bound */ case 0x70 ... 0x7f: /* Conditional jumps */ case 0x9a: /* Call far */ - case 0xc0 ... 0xc1: /* Grp2 */ case 0xcc ... 0xce: /* software exceptions */ - case 0xd0 ... 0xd3: /* Grp2 */ case 0xd6: /* (UD) */ case 0xd8 ... 0xdf: /* ESC */ case 0xe0 ... 0xe3: /* LOOP*, JCXZ */ case 0xe8 ... 0xe9: /* near Call, JMP */ case 0xeb: /* Short JMP */ case 0xf0 ... 0xf4: /* LOCK/REP, HLT */ + /* ... are not boostable */ + return false; + case 0xc0 ... 0xc1: /* Grp2 */ + case 0xd0 ... 0xd3: /* Grp2 */ + /* + * AMD uses nnn == 110 as SHL/SAL, but Intel makes it reserved. + */ + return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b110; case 0xf6 ... 0xf7: /* Grp3 */ + /* AMD uses nnn == 001 as TEST, but Intel makes it reserved. */ + return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b001; case 0xfe: /* Grp4 */ - /* ... 
are not boostable */ - return 0; + /* Only INC and DEC are boostable */ + return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 || + X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001; case 0xff: /* Grp5 */ - /* Only indirect jmp is boostable */ - return X86_MODRM_REG(insn->modrm.bytes[0]) == 4; + /* Only INC, DEC, and indirect JMP are boostable */ + return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 || + X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001 || + X86_MODRM_REG(insn->modrm.bytes[0]) == 0b100; default: - return 1; + return true; } } @@ -252,21 +263,40 @@ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long add return __recover_probed_insn(buf, addr); } -/* Check if paddr is at an instruction boundary */ -static int can_probe(unsigned long paddr) +/* Check if insn is INT or UD */ +static inline bool is_exception_insn(struct insn *insn) +{ + /* UD uses 0f escape */ + if (insn->opcode.bytes[0] == 0x0f) { + /* UD0 / UD1 / UD2 */ + return insn->opcode.bytes[1] == 0xff || + insn->opcode.bytes[1] == 0xb9 || + insn->opcode.bytes[1] == 0x0b; + } + + /* INT3 / INT n / INTO / INT1 */ + return insn->opcode.bytes[0] == 0xcc || + insn->opcode.bytes[0] == 0xcd || + insn->opcode.bytes[0] == 0xce || + insn->opcode.bytes[0] == 0xf1; +} + +/* + * Check if paddr is at an instruction boundary and that instruction can + * be probed + */ +static bool can_probe(unsigned long paddr) { unsigned long addr, __addr, offset = 0; struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) - return 0; + return false; /* Decode instructions */ addr = paddr - offset; while (addr < paddr) { - int ret; - /* * Check if the instruction has been modified by another * kprobe, in which case we replace the breakpoint by the @@ -277,11 +307,10 @@ static int can_probe(unsigned long paddr) */ __addr = recover_probed_instruction(buf, addr); if (!__addr) - return 0; + return false; - ret = insn_decode_kernel(&insn, (void *)__addr); - if (ret < 0) - return 0; + if (insn_decode_kernel(&insn, (void *)__addr) < 0) + return false; #ifdef CONFIG_KGDB /* @@ -290,10 +319,26 @@ static int can_probe(unsigned long paddr) */ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && kgdb_has_hit_break(addr)) - return 0; + return false; #endif addr += insn.length; } + + /* Check if paddr is at an instruction boundary */ + if (addr != paddr) + return false; + + __addr = recover_probed_instruction(buf, addr); + if (!__addr) + return false; + + if (insn_decode_kernel(&insn, (void *)__addr) < 0) + return false; + + /* INT and UD are special and should not be kprobed */ + if (is_exception_insn(&insn)) + return false; + if (IS_ENABLED(CONFIG_CFI_CLANG)) { /* * The compiler generates the following instruction sequence @@ -308,13 +353,6 @@ static int can_probe(unsigned long paddr) * Also, these movl and addl are used for showing expected * type. So those must not be touched. */ - __addr = recover_probed_instruction(buf, addr); - if (!__addr) - return 0; - - if (insn_decode_kernel(&insn, (void *)__addr) < 0) - return 0; - if (insn.opcode.value == 0xBA) offset = 12; else if (insn.opcode.value == 0x3) @@ -324,11 +362,11 @@ static int can_probe(unsigned long paddr) /* This movl/addl is used for decoding CFI. */ if (is_cfi_trap(addr + offset)) - return 0; + return false; } out: - return (addr == paddr); + return true; } /* If x86 supports IBT (ENDBR) it must be skipped. 
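The reworked can_boost() checks above hinge on the ModRM reg field (bits 5:3), which selects the member of an opcode group such as 0xFF. A minimal user-space sketch of that decoding, not kernel code; the sample encodings are chosen only for the demo:

#include <stdio.h>

/* ModRM layout: mod[7:6] reg[5:3] rm[2:0]; reg picks the group member. */
static unsigned int modrm_reg(unsigned char modrm)
{
	return (modrm >> 3) & 0x7;
}

int main(void)
{
	unsigned char jmp_rax = 0xe0;	/* "ff e0" = jmp *%rax, reg field /4 */
	unsigned char inc_eax = 0xc0;	/* "ff c0" = inc %eax,  reg field /0 */

	printf("jmp: /%u  inc: /%u\n", modrm_reg(jmp_rax), modrm_reg(inc_eax));
	return 0;
}

With that field in hand, the new code can keep INC/DEC and indirect JMP boostable while rejecting the reserved or vendor-divergent encodings.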
*/ diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index dd2ec14adb..15af7e98e1 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -21,6 +21,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 517821b483..36d6809c6c 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -324,7 +324,7 @@ static int can_optimize(unsigned long paddr) * However, the kernel built with retpolines or IBT has jump * tables disabled so the check can be skipped altogether. */ - if (!IS_ENABLED(CONFIG_RETPOLINE) && + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && !IS_ENABLED(CONFIG_X86_KERNEL_IBT) && insn_is_indirect_jump(&insn)) return 0; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 428ee74002..7f0732bc0c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -65,6 +65,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); +static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled); static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; @@ -244,7 +245,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void) { u32 flags = 0; - if (__this_cpu_read(apf_reason.enabled)) { + if (__this_cpu_read(async_pf_enabled)) { flags = __this_cpu_read(apf_reason.flags); __this_cpu_write(apf_reason.flags, 0); } @@ -295,7 +296,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) inc_irq_stat(irq_hv_callback_count); - if (__this_cpu_read(apf_reason.enabled)) { + if (__this_cpu_read(async_pf_enabled)) { token = __this_cpu_read(apf_reason.token); kvm_async_pf_task_wake(token); __this_cpu_write(apf_reason.token, 0); @@ -362,7 +363,7 @@ static void kvm_guest_cpu_init(void) wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); - __this_cpu_write(apf_reason.enabled, 1); + __this_cpu_write(async_pf_enabled, true); pr_debug("setup async PF for cpu %d\n", smp_processor_id()); } @@ -383,11 +384,11 @@ static void kvm_guest_cpu_init(void) static void kvm_pv_disable_apf(void) { - if (!__this_cpu_read(apf_reason.enabled)) + if (!__this_cpu_read(async_pf_enabled)) return; wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); - __this_cpu_write(apf_reason.enabled, 0); + __this_cpu_write(async_pf_enabled, false); pr_debug("disable async PF for cpu %d\n", smp_processor_id()); } @@ -770,7 +771,7 @@ static struct notifier_block kvm_pv_reboot_nb = { * won't be valid. In cases like kexec, in which you install a new kernel, this * means a random memory location will be kept being written. 
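The kvm.c hunks above swap a flag stored inside the shared, hypervisor-visible apf_reason structure for a plain per-CPU boolean. A loose user-space analogue of that pattern, with _Thread_local standing in for DEFINE_PER_CPU_READ_MOSTLY (illustrative only, assumed names):

#include <stdbool.h>
#include <stdio.h>

/* One private flag per execution context instead of a field in shared state. */
static _Thread_local bool async_pf_enabled;

static void guest_cpu_init(void) { async_pf_enabled = true;  }
static void pv_disable_apf(void) { async_pf_enabled = false; }

int main(void)
{
	guest_cpu_init();
	printf("enabled=%d\n", async_pf_enabled);
	pv_disable_apf();
	printf("enabled=%d\n", async_pf_enabled);
	return 0;
}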
*/ -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP static void kvm_crash_shutdown(struct pt_regs *regs) { kvm_guest_cpu_offline(true); @@ -830,7 +831,7 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { static_branch_enable(&kvm_async_pf_enabled); - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt); + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt); } #ifdef CONFIG_SMP @@ -853,7 +854,7 @@ static void __init kvm_guest_init(void) kvm_guest_cpu_init(); #endif -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = kvm_crash_shutdown; #endif diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 5bb395551c..5b2c15214a 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -154,15 +154,15 @@ static int kvm_cs_enable(struct clocksource *cs) return 0; } -struct clocksource kvm_clock = { +static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .id = CSID_X86_KVM_CLK, .enable = kvm_cs_enable, }; -EXPORT_SYMBOL_GPL(kvm_clock); static void kvm_register_clock(char *txt) { diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 7a814b4140..0f19ef355f 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -184,7 +184,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) return new_ldt; } -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static void do_sanity_check(struct mm_struct *mm, bool had_kernel_mapping, @@ -377,7 +377,7 @@ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false); } -#else /* !CONFIG_PAGE_TABLE_ISOLATION */ +#else /* !CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) @@ -388,11 +388,11 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) { } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static void free_ldt_pgtables(struct mm_struct *mm) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION struct mmu_gather tlb; unsigned long start = LDT_BASE_ADDR; unsigned long end = LDT_END_ADDR; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index bc0a5348b4..cc0f7f70b1 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -295,8 +295,15 @@ void machine_kexec_cleanup(struct kimage *image) void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; - void *control_page; + unsigned int host_mem_enc_active; int save_ftrace_enabled; + void *control_page; + + /* + * This must be done before load_segments() since if call depth tracking + * is used then GS must be valid to make any function calls. 
+ */ + host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -358,7 +365,7 @@ void machine_kexec(struct kimage *image) (unsigned long)page_list, image->start, image->preserve_context, - cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)); + host_mem_enc_active); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -508,6 +515,8 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) } #endif /* CONFIG_KEXEC_FILE */ +#ifdef CONFIG_CRASH_DUMP + static int kexec_mark_range(unsigned long start, unsigned long end, bool protect) { @@ -552,6 +561,7 @@ void arch_kexec_unprotect_crashkres(void) { kexec_mark_crashkres(false); } +#endif /* * During a traditional boot under SME, SME will encrypt the kernel, diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b223922248..e89171b034 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -36,6 +36,8 @@ * Checksum an MP configuration block. */ +static unsigned int num_procs __initdata; + static int __init mpf_checksum(unsigned char *mp, int len) { int sum = 0; @@ -50,16 +52,15 @@ static void __init MP_processor_info(struct mpc_cpu *m) { char *bootup_cpu = ""; - if (!(m->cpuflag & CPU_ENABLED)) { - disabled_cpus++; + topology_register_apic(m->apicid, CPU_ACPIID_INVALID, m->cpuflag & CPU_ENABLED); + if (!(m->cpuflag & CPU_ENABLED)) return; - } if (m->cpuflag & CPU_BOOTPROCESSOR) bootup_cpu = " (Bootup-CPU)"; pr_info("Processor #%d%s\n", m->apicid, bootup_cpu); - generic_processor_info(m->apicid); + num_procs++; } #ifdef CONFIG_X86_IO_APIC @@ -196,12 +197,12 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (!smp_check_mpc(mpc, oem, str)) return 0; - /* Initialize the lapic mapping */ - if (!acpi_lapic) - register_lapic_address(mpc->lapic); - - if (early) + if (early) { + /* Initialize the lapic mapping */ + if (!acpi_lapic) + register_lapic_address(mpc->lapic); return 1; + } /* Now process the configuration blocks. */ while (count < mpc->length) { @@ -236,9 +237,9 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) } } - if (!num_processors) + if (!num_procs && !acpi_lapic) pr_err("MPTABLE: no processors registered!\n"); - return num_processors; + return num_procs || acpi_lapic; } #ifdef CONFIG_X86_IO_APIC @@ -473,7 +474,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) /* * Scan the memory blocks for an SMP configuration block. */ -void __init default_get_smp_config(unsigned int early) +static __init void mpparse_get_smp_config(unsigned int early) { struct mpf_intel *mpf; @@ -529,8 +530,8 @@ void __init default_get_smp_config(unsigned int early) } else BUG(); - if (!early) - pr_info("Processors: %d\n", num_processors); + if (!early && !acpi_lapic) + pr_info("Processors: %d\n", num_procs); /* * Only use the first configuration found. 
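For context on the mpparse.c changes above: an MP configuration block is only accepted when all of its bytes sum to zero modulo 256, which is what mpf_checksum() verifies. A standalone sketch of that rule; the block layout and checksum slot below are invented for the demo, not the real struct mpf_intel:

#include <stdio.h>

static int mp_block_ok(const unsigned char *p, int len)
{
	unsigned char sum = 0;

	while (len--)
		sum += *p++;
	return sum == 0;	/* valid when all bytes cancel out */
}

int main(void)
{
	unsigned char blk[16] = { '_', 'M', 'P', '_' };
	unsigned char sum = 0;
	int i;

	for (i = 0; i < 16; i++)
		sum += blk[i];
	blk[15] = (unsigned char)-sum;	/* hypothetical checksum slot */

	printf("%s\n", mp_block_ok(blk, 16) ? "valid" : "invalid");
	return 0;
}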
*/ @@ -538,6 +539,16 @@ out: early_memunmap(mpf, sizeof(*mpf)); } +void __init mpparse_parse_early_smp_config(void) +{ + mpparse_get_smp_config(true); +} + +void __init mpparse_parse_smp_config(void) +{ + mpparse_get_smp_config(false); +} + static void __init smp_reserve_memory(struct mpf_intel *mpf) { memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr)); @@ -587,7 +598,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) return ret; } -void __init default_find_smp_config(void) +void __init mpparse_find_mptable(void) { unsigned int address; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 6da2cfa23c..ed163c8c86 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -35,6 +35,7 @@ #include <asm/nospec-branch.h> #include <asm/microcode.h> #include <asm/sev.h> +#include <asm/fred.h> #define CREATE_TRACE_POINTS #include <trace/events/nmi.h> @@ -303,13 +304,13 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) __this_cpu_add(nmi_stats.unknown, 1); - pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); + pr_emerg_ratelimited("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); if (unknown_nmi_panic || panic_on_unrecovered_nmi) nmi_panic(regs, "NMI: Not continuing"); - pr_emerg("Dazed and confused, but trying to continue\n"); + pr_emerg_ratelimited("Dazed and confused, but trying to continue\n"); } NOKPROBE_SYMBOL(unknown_nmi_error); @@ -502,7 +503,7 @@ DEFINE_IDTENTRY_RAW(exc_nmi) if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) raw_atomic_long_inc(&nsp->idt_calls); - if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) { + if (arch_cpu_is_offline(smp_processor_id())) { if (microcode_nmi_handler_enabled()) microcode_offline_nmi_handler(); return; @@ -579,7 +580,7 @@ EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx); static char *nmi_check_stall_msg[] = { /* */ -/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler. */ +/* +--------- nmi_seq & 0x1: CPU is currently in NMI handler. */ /* | +------ cpu_is_offline(cpu) */ /* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls): */ /* | | | NMI handler has been invoked. 
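The column comments above describe a small packing trick used in the hunk that follows: three independent facts are folded into a 3-bit index that picks an entry of nmi_check_stall_msg[]. A reduced sketch of that packing; the flag names and message strings are shortened for the demo:

#include <stdio.h>

static const char *msg[8] = {
	[0] = "idle", [1] = "handler invoked", [2] = "offline",
	/* remaining combinations elided for the demo */
	[4] = "in NMI handler",
};

int main(void)
{
	int in_nmi = 1, offline = 0, calls_changed = 0;
	int idx = ((in_nmi & 1) << 2) | ((offline & 1) << 1) | (calls_changed & 1);

	printf("%s\n", msg[idx] ? msg[idx] : "?");
	return 0;
}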
*/ @@ -627,27 +628,72 @@ void nmi_backtrace_stall_check(const struct cpumask *btp) nmi_seq = READ_ONCE(nsp->idt_nmi_seq); if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) { msgp = "CPU entered NMI handler function, but has not exited"; - } else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) { - msgp = "CPU is handling NMIs"; - } else { - idx = ((nsp->idt_seq_snap & 0x1) << 2) | + } else if (nsp->idt_nmi_seq_snap == nmi_seq || + nsp->idt_nmi_seq_snap + 1 == nmi_seq) { + idx = ((nmi_seq & 0x1) << 2) | (cpu_is_offline(cpu) << 1) | (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls)); msgp = nmi_check_stall_msg[idx]; if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1)) modp = ", but OK because ignore_nmis was set"; - if (nmi_seq & 0x1) - msghp = " (CPU currently in NMI handler function)"; - else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq) + if (nsp->idt_nmi_seq_snap + 1 == nmi_seq) msghp = " (CPU exited one NMI handler function)"; + else if (nmi_seq & 0x1) + msghp = " (CPU currently in NMI handler function)"; + else + msghp = " (CPU was never in an NMI handler function)"; + } else { + msgp = "CPU is handling NMIs"; } - pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n", - __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies)); + pr_alert("%s: CPU %d: %s%s%s\n", __func__, cpu, msgp, modp, msghp); + pr_alert("%s: last activity: %lu jiffies ago.\n", + __func__, j - READ_ONCE(nsp->recv_jiffies)); } } #endif +#ifdef CONFIG_X86_FRED +/* + * With FRED, CR2/DR6 is pushed to #PF/#DB stack frame during FRED + * event delivery, i.e., there is no problem of transient states. + * And NMI unblocking only happens when the stack frame indicates + * that so should happen. + * + * Thus, the NMI entry stub for FRED is really straightforward and + * as simple as most exception handlers. As such, #DB is allowed + * during NMI handling. + */ +DEFINE_FREDENTRY_NMI(exc_nmi) +{ + irqentry_state_t irq_state; + + if (arch_cpu_is_offline(smp_processor_id())) { + if (microcode_nmi_handler_enabled()) + microcode_offline_nmi_handler(); + return; + } + + /* + * Save CR2 for eventual restore to cover the case where the NMI + * hits the VMENTER/VMEXIT region where guest CR2 is life. This + * prevents guest state corruption in case that the NMI handler + * takes a page fault. + */ + this_cpu_write(nmi_cr2, read_cr2()); + + irq_state = irqentry_nmi_enter(regs); + + inc_irq_stat(__nmi_count); + default_do_nmi(regs); + + irqentry_nmi_exit(regs, irq_state); + + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) + write_cr2(this_cpu_read(nmi_cr2)); +} +#endif + void stop_nmi(void) { ignore_nmis++; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ab49ade31b..b8441147eb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -846,31 +846,6 @@ void __noreturn stop_this_cpu(void *dummy) } /* - * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power - * states (local apic timer and TSC stop). - * - * XXX this function is completely buggered vs RCU and tracing. - */ -static void amd_e400_idle(void) -{ - /* - * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E - * gets set after static_cpu_has() places have been converted via - * alternatives. 
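One detail worth noting in the new DEFINE_FREDENTRY_NMI() above: CR2 is snapshotted before the handlers run and only written back if a nested page fault changed it. The shape of that save/conditionally-restore pattern, with ordinary variables standing in for CR2 and the per-CPU slot (sketch only):

#include <stdio.h>

static unsigned long cr2;	/* stands in for the real CR2 */
static unsigned long saved_cr2;	/* stands in for this_cpu(nmi_cr2) */

static void handle_nmi(void)
{
	cr2 = 0xdead000;	/* pretend a nested #PF clobbered CR2 */
}

int main(void)
{
	cr2 = 0x1000;		/* value the interrupted context still needs */

	saved_cr2 = cr2;	/* save before running handlers */
	handle_nmi();
	if (saved_cr2 != cr2)	/* restore only if something changed it */
		cr2 = saved_cr2;

	printf("cr2=%#lx\n", cr2);
	return 0;
}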
- */ - if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { - default_idle(); - return; - } - - tick_broadcast_enter(); - - default_idle(); - - tick_broadcast_exit(); -} - -/* * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf * exists and whenever MONITOR/MWAIT extensions are present there is at * least one C1 substate. @@ -878,21 +853,22 @@ static void amd_e400_idle(void) * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait * is passed to kernel commandline parameter. */ -static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) +static __init bool prefer_mwait_c1_over_halt(void) { + const struct cpuinfo_x86 *c = &boot_cpu_data; u32 eax, ebx, ecx, edx; - /* User has disallowed the use of MWAIT. Fallback to HALT */ - if (boot_option_idle_override == IDLE_NOMWAIT) - return 0; + /* If override is enforced on the command line, fall back to HALT. */ + if (boot_option_idle_override != IDLE_NO_OVERRIDE) + return false; /* MWAIT is not supported on this platform. Fallback to HALT */ if (!cpu_has(c, X86_FEATURE_MWAIT)) - return 0; + return false; - /* Monitor has a bug. Fallback to HALT */ - if (boot_cpu_has_bug(X86_BUG_MONITOR)) - return 0; + /* Monitor has a bug or APIC stops in C1E. Fallback to HALT */ + if (boot_cpu_has_bug(X86_BUG_MONITOR) || boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) + return false; cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); @@ -901,13 +877,13 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) * with EAX=0, ECX=0. */ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) - return 1; + return true; /* * If MWAIT extensions are available, there should be at least one * MWAIT C1 substate present. */ - return (edx & MWAIT_C1_SUBSTATE_MASK); + return !!(edx & MWAIT_C1_SUBSTATE_MASK); } /* @@ -933,26 +909,27 @@ static __cpuidle void mwait_idle(void) __current_clr_polling(); } -void select_idle_routine(const struct cpuinfo_x86 *c) +void __init select_idle_routine(void) { -#ifdef CONFIG_SMP - if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1) - pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); -#endif - if (x86_idle_set() || boot_option_idle_override == IDLE_POLL) + if (boot_option_idle_override == IDLE_POLL) { + if (IS_ENABLED(CONFIG_SMP) && __max_threads_per_core > 1) + pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); + return; + } + + /* Required to guard against xen_set_default_idle() */ + if (x86_idle_set()) return; - if (boot_cpu_has_bug(X86_BUG_AMD_E400)) { - pr_info("using AMD E400 aware idle routine\n"); - static_call_update(x86_idle, amd_e400_idle); - } else if (prefer_mwait_c1_over_halt(c)) { + if (prefer_mwait_c1_over_halt()) { pr_info("using mwait in idle threads\n"); static_call_update(x86_idle, mwait_idle); } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { pr_info("using TDX aware idle routine\n"); static_call_update(x86_idle, tdx_safe_halt); - } else + } else { static_call_update(x86_idle, default_idle); + } } void amd_e400_c1e_apic_setup(void) @@ -985,7 +962,10 @@ void __init arch_post_acpi_subsys_init(void) if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); - pr_info("System has AMD C1E enabled\n"); + + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE)) + static_branch_enable(&arch_needs_tick_broadcast); + pr_info("System has AMD C1E erratum E400. 
Workaround enabled.\n"); } static int __init idle_setup(char *str) @@ -998,24 +978,14 @@ static int __init idle_setup(char *str) boot_option_idle_override = IDLE_POLL; cpu_idle_poll_ctrl(true); } else if (!strcmp(str, "halt")) { - /* - * When the boot option of idle=halt is added, halt is - * forced to be used for CPU idle. In such case CPU C2/C3 - * won't be used again. - * To continue to load the CPU idle driver, don't touch - * the boot_option_idle_override. - */ - static_call_update(x86_idle, default_idle); + /* 'idle=halt' HALT for idle. C-states are disabled. */ boot_option_idle_override = IDLE_HALT; } else if (!strcmp(str, "nomwait")) { - /* - * If the boot option of "idle=nomwait" is added, - * it means that mwait will be disabled for CPU C1/C2/C3 - * states. - */ + /* 'idle=nomwait' disables MWAIT for idle */ boot_option_idle_override = IDLE_NOMWAIT; - } else - return -1; + } else { + return -EINVAL; + } return 0; } @@ -1030,7 +1000,10 @@ unsigned long arch_align_stack(unsigned long sp) unsigned long arch_randomize_brk(struct mm_struct *mm) { - return randomize_page(mm->brk, 0x02000000); + if (mmap_is_ia32()) + return randomize_page(mm->brk, SZ_32M); + + return randomize_page(mm->brk, SZ_1G); } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 708c87b88c..0917c7f257 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -156,13 +156,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; - struct fpu *prev_fpu = &prev->fpu; int cpu = smp_processor_id(); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_fpu, cpu); + if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_p, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -209,7 +208,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(pcpu_hot.current_task, next_p); - switch_fpu_finish(); + switch_fpu_finish(next_p); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(next_p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4989095ab7..6d3d20e3e4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -56,6 +56,7 @@ #include <asm/resctrl.h> #include <asm/unistd.h> #include <asm/fsgsbase.h> +#include <asm/fred.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include <asm/unistd_32_ia32.h> @@ -117,7 +118,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", log_lvl, fs, fsindex, gs, gsindex, shadowgs); - printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n", + printk("%sCS: %04x DS: %04x ES: %04x CR0: %016lx\n", log_lvl, regs->cs, ds, es, cr0); printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", log_lvl, cr2, cr3, cr4); @@ -166,7 +167,29 @@ static noinstr unsigned long __rdgsbase_inactive(void) lockdep_assert_irqs_disabled(); - if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { + /* + * SWAPGS is no longer needed thus NOT allowed with FRED because + * FRED transitions ensure that an operating system can _always_ + * operate with its own GS base address: + * - For events that occur in ring 3, FRED event delivery swaps + * the GS base address with the IA32_KERNEL_GS_BASE MSR. 
+ * - ERETU (the FRED transition that returns to ring 3) also swaps + * the GS base address with the IA32_KERNEL_GS_BASE MSR. + * + * And the operating system can still setup the GS segment for a + * user thread without the need of loading a user thread GS with: + * - Using LKGS, available with FRED, to modify other attributes + * of the GS segment without compromising its ability always to + * operate with its own GS base address. + * - Accessing the GS segment base address for a user thread as + * before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR. + * + * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE + * MSR instead of the GS segment’s descriptor cache. As such, the + * operating system never changes its runtime GS base address. + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); gsbase = rdgsbase(); native_swapgs(); @@ -191,7 +214,8 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase) { lockdep_assert_irqs_disabled(); - if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); wrgsbase(gsbase); native_swapgs(); @@ -505,7 +529,7 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) static void start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, - unsigned int _cs, unsigned int _ss, unsigned int _ds) + u16 _cs, u16 _ss, u16 _ds) { WARN_ON_ONCE(regs != current_pt_regs()); @@ -522,11 +546,36 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, loadsegment(ds, _ds); load_gs_index(0); - regs->ip = new_ip; - regs->sp = new_sp; - regs->cs = _cs; - regs->ss = _ss; - regs->flags = X86_EFLAGS_IF; + regs->ip = new_ip; + regs->sp = new_sp; + regs->csx = _cs; + regs->ssx = _ss; + /* + * Allow single-step trap and NMI when starting a new task, thus + * once the new task enters user space, single-step trap and NMI + * are both enabled immediately. + * + * Entering a new task is logically speaking a return from a + * system call (exec, fork, clone, etc.). As such, if ptrace + * enables single stepping a single step exception should be + * allowed to trigger immediately upon entering user space. + * This is not optional. + * + * NMI should *never* be disabled in user space. As such, this + * is an optional, opportunistic way to catch errors. + * + * Paranoia: High-order 48 bits above the lowest 16 bit SS are + * discarded by the legacy IRET instruction on all Intel, AMD, + * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally, + * even when FRED is not enabled. But we choose the safer side + * to use these bits only when FRED is enabled. + */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) { + regs->fred_ss.swevent = true; + regs->fred_ss.nmi = true; + } + + regs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; } void @@ -562,14 +611,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; - struct fpu *prev_fpu = &prev->fpu; int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(pcpu_hot.hardirq_stack_inuse)); - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_fpu, cpu); + if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_p, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). 
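Related to the start_thread_common() change in this hunk: two FRED-specific bits are now seeded in the saved ss word so single-step and NMI are armed the moment the new task reaches user space. A toy struct in the same spirit; the field names mirror the kernel's fred_ss bits, but the layout here is invented purely for illustration:

#include <stdio.h>

struct toy_fred_ss {
	unsigned short ss;		/* legacy selector, as before */
	unsigned int   swevent:1;	/* allow #DB right after entering user space */
	unsigned int   nmi:1;		/* NMI not blocked for the new task */
};

int main(void)
{
	struct toy_fred_ss s = { .ss = 0x2b };

	s.swevent = 1;
	s.nmi = 1;
	printf("ss=%#x swevent=%u nmi=%u\n", (unsigned int)s.ss, s.swevent, s.nmi);
	return 0;
}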
@@ -623,7 +671,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(pcpu_hot.current_task, next_p); raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(); + switch_fpu_finish(next_p); /* Reload sp0. */ update_task_stack(next_p); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 830425e6d3..f3130f7627 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -796,7 +796,7 @@ struct machine_ops machine_ops __ro_after_init = { .emergency_restart = native_machine_emergency_restart, .restart = native_machine_restart, .halt = native_machine_halt, -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP .crash_shutdown = native_machine_crash_shutdown, #endif }; @@ -826,7 +826,7 @@ void machine_halt(void) machine_ops.halt(); } -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP void machine_crash_shutdown(struct pt_regs *regs) { machine_ops.crash_shutdown(regs); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3998109195..e125e059e2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -471,7 +471,7 @@ static void __init arch_reserve_crashkernel(void) bool high = false; int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), @@ -970,10 +970,8 @@ void __init setup_arch(char **cmdline_p) high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; #endif - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); + /* Find and reserve MPTABLE area */ + x86_init.mpparse.find_mptable(); early_alloc_pgt_buf(); @@ -1091,7 +1089,9 @@ void __init setup_arch(char **cmdline_p) early_platform_quirks(); + /* Some platforms need the APIC registered for NUMA configuration */ early_acpi_boot_init(); + x86_init.mpparse.early_parse_smp_cfg(); x86_flattree_get_config(); @@ -1132,24 +1132,19 @@ void __init setup_arch(char **cmdline_p) early_quirks(); - /* - * Read APIC and some other early information from ACPI tables. - */ - acpi_boot_init(); - x86_dtb_init(); + topology_apply_cmdline_limits_early(); /* - * get boot-time SMP configuration: + * Parse SMP configuration. Try ACPI first and then the platform + * specific parser. */ - get_smp_config(); + acpi_boot_init(); + x86_init.mpparse.parse_smp_cfg(); - /* - * Systems w/o ACPI and mptables might not have it mapped the local - * APIC yet, but prefill_possible_map() might need to access it. - */ + /* Last opportunity to detect and map the local APIC */ init_apic_mappings(); - prefill_possible_map(); + topology_init_possible_cpus(); init_cpu_to_node(); init_gi_nodes(); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index be2fa2fc02..b4f8fa0f72 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -9,12 +9,18 @@ * and is included directly into both code-bases. */ +#include <asm/setup_data.h> + #ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) -#define has_cpuflag(f) boot_cpu_has(f) +#define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) +#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) +#define sev_printk_rtl(fmt, ...) printk_ratelimited(fmt, ##__VA_ARGS__) #else #undef WARN #define WARN(condition, format...) (!!(condition)) +#define sev_printk(fmt, ...) +#define sev_printk_rtl(fmt, ...) 
#endif /* I/O parameters for CPUID-related helpers */ @@ -570,6 +576,7 @@ void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); + u16 opcode = *(unsigned short *)regs->ip; struct cpuid_leaf leaf; int ret; @@ -577,6 +584,10 @@ void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) if (exit_code != SVM_EXIT_CPUID) goto fail; + /* Is it really a CPUID insn? */ + if (opcode != 0xa20f) + goto fail; + leaf.fn = fn; leaf.subfn = subfn; @@ -1167,3 +1178,94 @@ static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) out: return ret; } + +static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + unsigned int opcode = (unsigned int)ctxt->insn.opcode.value; + u8 modrm = ctxt->insn.modrm.value; + + switch (exit_code) { + + case SVM_EXIT_IOIO: + case SVM_EXIT_NPF: + /* handled separately */ + return ES_OK; + + case SVM_EXIT_CPUID: + if (opcode == 0xa20f) + return ES_OK; + break; + + case SVM_EXIT_INVD: + if (opcode == 0x080f) + return ES_OK; + break; + + case SVM_EXIT_MONITOR: + /* MONITOR and MONITORX instructions generate the same error code */ + if (opcode == 0x010f && (modrm == 0xc8 || modrm == 0xfa)) + return ES_OK; + break; + + case SVM_EXIT_MWAIT: + /* MWAIT and MWAITX instructions generate the same error code */ + if (opcode == 0x010f && (modrm == 0xc9 || modrm == 0xfb)) + return ES_OK; + break; + + case SVM_EXIT_MSR: + /* RDMSR */ + if (opcode == 0x320f || + /* WRMSR */ + opcode == 0x300f) + return ES_OK; + break; + + case SVM_EXIT_RDPMC: + if (opcode == 0x330f) + return ES_OK; + break; + + case SVM_EXIT_RDTSC: + if (opcode == 0x310f) + return ES_OK; + break; + + case SVM_EXIT_RDTSCP: + if (opcode == 0x010f && modrm == 0xf9) + return ES_OK; + break; + + case SVM_EXIT_READ_DR7: + if (opcode == 0x210f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_VMMCALL: + if (opcode == 0x010f && modrm == 0xd9) + return ES_OK; + + break; + + case SVM_EXIT_WRITE_DR7: + if (opcode == 0x230f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_WBINVD: + if (opcode == 0x90f) + return ES_OK; + break; + + default: + break; + } + + sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n", + opcode, exit_code, ctxt->regs->ip); + + return ES_UNSUPPORTED; +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index eb2873b5eb..38ad066179 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -61,6 +61,25 @@ #define AP_INIT_CR0_DEFAULT 0x60000010 #define AP_INIT_MXCSR_DEFAULT 0x1f80 +static const char * const sev_status_feat_names[] = { + [MSR_AMD64_SEV_ENABLED_BIT] = "SEV", + [MSR_AMD64_SEV_ES_ENABLED_BIT] = "SEV-ES", + [MSR_AMD64_SEV_SNP_ENABLED_BIT] = "SEV-SNP", + [MSR_AMD64_SNP_VTOM_BIT] = "vTom", + [MSR_AMD64_SNP_REFLECT_VC_BIT] = "ReflectVC", + [MSR_AMD64_SNP_RESTRICTED_INJ_BIT] = "RI", + [MSR_AMD64_SNP_ALT_INJ_BIT] = "AI", + [MSR_AMD64_SNP_DEBUG_SWAP_BIT] = "DebugSwap", + [MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT] = "NoHostIBS", + [MSR_AMD64_SNP_BTB_ISOLATION_BIT] = "BTBIsol", + [MSR_AMD64_SNP_VMPL_SSS_BIT] = "VmplSSS", + [MSR_AMD64_SNP_SECURE_TSC_BIT] = "SecureTSC", + [MSR_AMD64_SNP_VMGEXIT_PARAM_BIT] = "VMGExitParam", + [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt", + [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt", + [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", +}; + /* For early boot hypervisor communication 
in SEV-ES enabled guests */ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); @@ -1740,7 +1759,10 @@ static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, struct ghcb *ghcb, unsigned long exit_code) { - enum es_result result; + enum es_result result = vc_check_opcode_bytes(ctxt, exit_code); + + if (result != ES_OK) + return result; switch (exit_code) { case SVM_EXIT_READ_DR7: @@ -2261,3 +2283,19 @@ static int __init snp_init_platform_device(void) return 0; } device_initcall(snp_init_platform_device); + +void sev_show_status(void) +{ + int i; + + pr_info("Status: "); + for (i = 0; i < MSR_AMD64_SNP_RESV_BIT; i++) { + if (sev_status & BIT_ULL(i)) { + if (!sev_status_feat_names[i]) + continue; + + pr_cont("%s ", sev_status_feat_names[i]); + } + } + pr_cont("\n"); +} diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S index 3355e27c69..1ab65f6c6a 100644 --- a/arch/x86/kernel/sev_verify_cbit.S +++ b/arch/x86/kernel/sev_verify_cbit.S @@ -77,7 +77,7 @@ SYM_FUNC_START(sev_verify_cbit) * The check failed, prevent any forward progress to prevent ROP * attacks, invalidate the stack and go into a hlt loop. */ - xorq %rsp, %rsp + xorl %esp, %esp subq $0x1000, %rsp 2: hlt jmp 2b diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 96a771f9f9..18266cc3d9 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -148,14 +148,16 @@ static int register_stop_handler(void) static void native_stop_other_cpus(int wait) { - unsigned int cpu = smp_processor_id(); + unsigned int old_cpu, this_cpu; unsigned long flags, timeout; if (reboot_force) return; /* Only proceed if this is the first CPU to reach this code */ - if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1) + old_cpu = -1; + this_cpu = smp_processor_id(); + if (!atomic_try_cmpxchg(&stopping_cpu, &old_cpu, this_cpu)) return; /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */ @@ -186,7 +188,7 @@ static void native_stop_other_cpus(int wait) * NMIs. */ cpumask_copy(&cpus_stop_mask, cpu_online_mask); - cpumask_clear_cpu(cpu, &cpus_stop_mask); + cpumask_clear_cpu(this_cpu, &cpus_stop_mask); if (!cpumask_empty(&cpus_stop_mask)) { apic_send_IPI_allbutself(REBOOT_VECTOR); @@ -210,6 +212,8 @@ static void native_stop_other_cpus(int wait) * CPUs to stop. */ if (!smp_no_nmi_ipi && !register_stop_handler()) { + unsigned int cpu; + pr_emerg("Shutting down cpus with NMI\n"); for_each_cpu(cpu, &cpus_stop_mask) @@ -282,7 +286,7 @@ struct smp_ops smp_ops = { .smp_cpus_done = native_smp_cpus_done, .stop_other_cpus = native_stop_other_cpus, -#if defined(CONFIG_KEXEC_CORE) +#if defined(CONFIG_CRASH_DUMP) .crash_stop_other_cpus = kdump_nmi_shootdown_cpus, #endif .smp_send_reschedule = native_smp_send_reschedule, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3f57ce68a3..76bb65045c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -101,10 +101,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); EXPORT_PER_CPU_SYMBOL(cpu_die_map); -/* Per CPU bogomips and other parameters */ -DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); -EXPORT_PER_CPU_SYMBOL(cpu_info); - /* CPUs which are the primary SMT threads */ struct cpumask __cpu_primary_thread_mask __read_mostly; @@ -125,25 +121,6 @@ struct mwait_cpu_dead { */ static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); -/* Logical package management. 
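The new sev_status_feat_names[] table and sev_show_status() above boil down to walking a bitmask and printing a name for every set bit that has one. A user-space rendition of that loop; the bit positions and the "DemoFeature" name below are made up for the example and are not the MSR layout:

#include <stdio.h>

static const char * const feat_names[] = {
	[0] = "SEV", [1] = "SEV-ES", [2] = "SEV-SNP", [5] = "DemoFeature",
};

int main(void)
{
	unsigned long long status = (1ULL << 0) | (1ULL << 2) | (1ULL << 5);
	unsigned int i;

	printf("Status: ");
	for (i = 0; i < sizeof(feat_names) / sizeof(feat_names[0]); i++) {
		if ((status & (1ULL << i)) && feat_names[i])
			printf("%s ", feat_names[i]);
	}
	printf("\n");
	return 0;
}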
*/ -struct logical_maps { - u32 phys_pkg_id; - u32 phys_die_id; - u32 logical_pkg_id; - u32 logical_die_id; -}; - -/* Temporary workaround until the full topology mechanics is in place */ -static DEFINE_PER_CPU_READ_MOSTLY(struct logical_maps, logical_maps) = { - .phys_pkg_id = U32_MAX, - .phys_die_id = U32_MAX, -}; - -unsigned int __max_logical_packages __read_mostly; -EXPORT_SYMBOL(__max_logical_packages); -static unsigned int logical_packages __read_mostly; -static unsigned int logical_die __read_mostly; - /* Maximum number of SMT threads on any online core */ int __read_mostly __max_smt_threads = 1; @@ -336,106 +313,6 @@ static void notrace start_secondary(void *unused) cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -/** - * topology_phys_to_logical_pkg - Map a physical package id to a logical - * @phys_pkg: The physical package id to map - * - * Returns logical package id or -1 if not found - */ -int topology_phys_to_logical_pkg(unsigned int phys_pkg) -{ - int cpu; - - for_each_possible_cpu(cpu) { - if (per_cpu(logical_maps.phys_pkg_id, cpu) == phys_pkg) - return per_cpu(logical_maps.logical_pkg_id, cpu); - } - return -1; -} -EXPORT_SYMBOL(topology_phys_to_logical_pkg); - -/** - * topology_phys_to_logical_die - Map a physical die id to logical - * @die_id: The physical die id to map - * @cur_cpu: The CPU for which the mapping is done - * - * Returns logical die id or -1 if not found - */ -static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) -{ - int cpu, proc_id = cpu_data(cur_cpu).topo.pkg_id; - - for_each_possible_cpu(cpu) { - if (per_cpu(logical_maps.phys_pkg_id, cpu) == proc_id && - per_cpu(logical_maps.phys_die_id, cpu) == die_id) - return per_cpu(logical_maps.logical_die_id, cpu); - } - return -1; -} - -/** - * topology_update_package_map - Update the physical to logical package map - * @pkg: The physical package id as retrieved via CPUID - * @cpu: The cpu for which this is updated - */ -int topology_update_package_map(unsigned int pkg, unsigned int cpu) -{ - int new; - - /* Already available somewhere? */ - new = topology_phys_to_logical_pkg(pkg); - if (new >= 0) - goto found; - - new = logical_packages++; - if (new != pkg) { - pr_info("CPU %u Converting physical %u to logical package %u\n", - cpu, pkg, new); - } -found: - per_cpu(logical_maps.phys_pkg_id, cpu) = pkg; - per_cpu(logical_maps.logical_pkg_id, cpu) = new; - cpu_data(cpu).topo.logical_pkg_id = new; - return 0; -} -/** - * topology_update_die_map - Update the physical to logical die map - * @die: The die id as retrieved via CPUID - * @cpu: The cpu for which this is updated - */ -int topology_update_die_map(unsigned int die, unsigned int cpu) -{ - int new; - - /* Already available somewhere? */ - new = topology_phys_to_logical_die(die, cpu); - if (new >= 0) - goto found; - - new = logical_die++; - if (new != die) { - pr_info("CPU %u Converting physical %u to logical die %u\n", - cpu, die, new); - } -found: - per_cpu(logical_maps.phys_die_id, cpu) = die; - per_cpu(logical_maps.logical_die_id, cpu) = new; - cpu_data(cpu).topo.logical_die_id = new; - return 0; -} - -static void __init smp_store_boot_cpu_info(void) -{ - int id = 0; /* CPU 0 */ - struct cpuinfo_x86 *c = &cpu_data(id); - - *c = boot_cpu_data; - c->cpu_index = id; - topology_update_package_map(c->topo.pkg_id, id); - topology_update_die_map(c->topo.die_id, id); - c->initialized = true; -} - /* * The bootstrap kernel entry code has set these up. 
Save them for * a given CPU @@ -488,6 +365,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) if (c->topo.pkg_id == o->topo.pkg_id && c->topo.die_id == o->topo.die_id && + c->topo.amd_node_id == o->topo.amd_node_id && per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) { if (c->topo.core_id == o->topo.core_id) return topology_sane(c, o, "smt"); @@ -509,10 +387,13 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->topo.pkg_id == o->topo.pkg_id && - c->topo.die_id == o->topo.die_id) - return true; - return false; + if (c->topo.pkg_id != o->topo.pkg_id || c->topo.die_id != o->topo.die_id) + return false; + + if (cpu_feature_enabled(X86_FEATURE_TOPOEXT) && topology_amd_nodes_per_pkg() > 1) + return c->topo.amd_node_id == o->topo.amd_node_id; + + return true; } static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) @@ -670,8 +551,8 @@ static void __init build_sched_topology(void) void set_cpu_sibling_map(int cpu) { - bool has_smt = smp_num_siblings > 1; - bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1; + bool has_smt = __max_threads_per_core > 1; + bool has_mp = has_smt || topology_num_cores_per_package() > 1; struct cpuinfo_x86 *c = &cpu_data(cpu); struct cpuinfo_x86 *o; int i, threads; @@ -1068,9 +949,13 @@ int native_kick_ap(unsigned int cpu, struct task_struct *tidle) pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); - if (apicid == BAD_APICID || !physid_isset(apicid, phys_cpu_present_map) || - !apic_id_valid(apicid)) { - pr_err("%s: bad cpu %d\n", __func__, cpu); + if (apicid == BAD_APICID || !apic_id_valid(apicid)) { + pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid); + return -EINVAL; + } + + if (!test_bit(apicid, phys_cpu_present_map)) { + pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid); return -EINVAL; } @@ -1139,42 +1024,22 @@ static __init void disable_smp(void) pr_info("SMP disabled\n"); disable_ioapic_support(); + topology_reset_possible_cpus_up(); - init_cpu_present(cpumask_of(0)); - init_cpu_possible(cpumask_of(0)); - - if (smp_found_config) - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - else - physid_set_mask_of_physid(0, &phys_cpu_present_map); cpumask_set_cpu(0, topology_sibling_cpumask(0)); cpumask_set_cpu(0, topology_core_cpumask(0)); cpumask_set_cpu(0, topology_die_cpumask(0)); } -static void __init smp_cpu_index_default(void) -{ - int i; - struct cpuinfo_x86 *c; - - for_each_possible_cpu(i) { - c = &cpu_data(i); - /* mark all to hotplug */ - c->cpu_index = nr_cpu_ids; - } -} - void __init smp_prepare_cpus_common(void) { unsigned int i; - smp_cpu_index_default(); - - /* - * Setup boot CPU information - */ - smp_store_boot_cpu_info(); /* Final full version of the data */ - mb(); + /* Mark all except the boot CPU as hotpluggable */ + for_each_possible_cpu(i) { + if (i) + per_cpu(cpu_info.cpu_index, i) = nr_cpu_ids; + } for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); @@ -1187,6 +1052,11 @@ void __init smp_prepare_cpus_common(void) set_cpu_sibling_map(0); } +void __init smp_prepare_boot_cpu(void) +{ + smp_ops.smp_prepare_boot_cpu(); +} + #ifdef CONFIG_X86_64 /* Establish whether parallel bringup can be supported. 
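Going back to the smpboot.c topology hunk above: match_die() now adds one more constraint on AMD parts that expose several nodes per package. A condensed version of the predicate, with plain ints standing in for the cpuinfo_x86 topology fields (sketch, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

struct topo { int pkg_id, die_id, amd_node_id; };

static bool match_die(const struct topo *c, const struct topo *o,
		      bool amd_multi_node)
{
	if (c->pkg_id != o->pkg_id || c->die_id != o->die_id)
		return false;
	if (amd_multi_node)	/* TOPOEXT with >1 node per package */
		return c->amd_node_id == o->amd_node_id;
	return true;
}

int main(void)
{
	struct topo a = { 0, 0, 0 }, b = { 0, 0, 1 };

	printf("%d %d\n", match_die(&a, &b, false), match_die(&a, &b, true));
	return 0;
}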
*/ bool __init arch_cpuhp_init_parallel_bringup(void) @@ -1265,102 +1135,16 @@ void __init native_smp_prepare_boot_cpu(void) native_pv_lock_init(); } -void __init calculate_max_logical_packages(void) -{ - int ncpus; - - /* - * Today neither Intel nor AMD support heterogeneous systems so - * extrapolate the boot cpu's data to all packages. - */ - ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); - __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); - pr_info("Max logical packages: %u\n", __max_logical_packages); -} - void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done\n"); - calculate_max_logical_packages(); build_sched_topology(); nmi_selftest(); impress_friends(); cache_aps_init(); } -static int __initdata setup_possible_cpus = -1; -static int __init _setup_possible_cpus(char *str) -{ - get_option(&str, &setup_possible_cpus); - return 0; -} -early_param("possible_cpus", _setup_possible_cpus); - - -/* - * cpu_possible_mask should be static, it cannot change as cpu's - * are onlined, or offlined. The reason is per-cpu data-structures - * are allocated by some modules at init time, and don't expect to - * do this dynamically on cpu arrival/departure. - * cpu_present_mask on the other hand can change dynamically. - * In case when cpu_hotplug is not compiled, then we resort to current - * behaviour, which is cpu_possible == cpu_present. - * - Ashok Raj - * - * Three ways to find out the number of additional hotplug CPUs: - * - If the BIOS specified disabled CPUs in ACPI/mptables use that. - * - The user can overwrite it with possible_cpus=NUM - * - Otherwise don't reserve additional CPUs. - * We do this because additional CPUs waste a lot of memory. - * -AK - */ -__init void prefill_possible_map(void) -{ - int i, possible; - - i = setup_max_cpus ?: 1; - if (setup_possible_cpus == -1) { - possible = num_processors; -#ifdef CONFIG_HOTPLUG_CPU - if (setup_max_cpus) - possible += disabled_cpus; -#else - if (possible > i) - possible = i; -#endif - } else - possible = setup_possible_cpus; - - total_cpus = max_t(int, possible, num_processors + disabled_cpus); - - /* nr_cpu_ids could be reduced via nr_cpus= */ - if (possible > nr_cpu_ids) { - pr_warn("%d Processors exceeds NR_CPUS limit of %u\n", - possible, nr_cpu_ids); - possible = nr_cpu_ids; - } - -#ifdef CONFIG_HOTPLUG_CPU - if (!setup_max_cpus) -#endif - if (possible > i) { - pr_warn("%d Processors exceeds max_cpus limit of %u\n", - possible, setup_max_cpus); - possible = i; - } - - set_nr_cpu_ids(possible); - - pr_info("Allowing %d CPUs, %d hotplug CPUs\n", - possible, max_t(int, possible - num_processors, 0)); - - reset_cpu_possible_mask(); - - for (i = 0; i < possible; i++) - set_cpu_possible(i, true); -} - /* correctly size the local cpu masks */ void __init setup_cpu_local_masks(void) { diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 77a9316da4..4eefaac64c 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -172,7 +172,7 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) } EXPORT_SYMBOL_GPL(arch_static_call_transform); -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK /* * This is called by apply_returns() to fix up static call trampolines, * specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 8e2b2552b5..3e2952679b 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -6,7 +6,9 @@ #include 
<linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/ptrace.h> + #include <asm/desc.h> +#include <asm/debugreg.h> #include <asm/mmu_context.h> unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index c783aeb37d..cb9fa1d5c6 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -52,13 +52,6 @@ static unsigned long get_align_bits(void) return va_align.bits & get_align_mask(); } -unsigned long align_vdso_addr(unsigned long addr) -{ - unsigned long align_mask = get_align_mask(); - addr = (addr + align_mask) & ~align_mask; - return addr | get_align_bits(); -} - static int __init control_va_addr_alignment(char *str) { /* guard against enabling this on other CPU families */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c3b2f863ac..4fa0b17e50 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -51,6 +51,7 @@ #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> +#include <asm/fred.h> #include <asm/fpu/api.h> #include <asm/cpu.h> #include <asm/cpu_entry_area.h> @@ -773,7 +774,7 @@ DEFINE_IDTENTRY_RAW(exc_int3) */ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { - struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1; + struct pt_regs *regs = (struct pt_regs *)current_top_of_stack() - 1; if (regs != eregs) *regs = *eregs; return regs; @@ -791,7 +792,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r * trust it and switch to the current kernel stack */ if (ip_within_syscall_gap(regs)) { - sp = this_cpu_read(pcpu_hot.top_of_stack); + sp = current_top_of_stack(); goto sync; } @@ -935,8 +936,7 @@ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) return false; } -static __always_inline void exc_debug_kernel(struct pt_regs *regs, - unsigned long dr6) +static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { /* * Disable breakpoints during exception handling; recursive exceptions @@ -948,6 +948,11 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * * Entry text is excluded for HW_BP_X and cpu_entry_area, which * includes the entry stack is excluded for everything. + * + * For FRED, nested #DB should just work fine. But when a watchpoint or + * breakpoint is set in the code path which is executed by #DB handler, + * it results in an endless recursion and stack overflow. Thus we stay + * with the IDT approach, i.e., save DR7 and disable #DB. */ unsigned long dr7 = local_db_save(); irqentry_state_t irq_state = irqentry_nmi_enter(regs); @@ -977,7 +982,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. 
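On the sys_x86_64.c hunk above: the removed align_vdso_addr() was the classic mask-based round-up followed by OR-ing in the randomization bits. For reference, the arithmetic it performed, as a standalone sketch with demo constants:

#include <stdio.h>

static unsigned long align_up(unsigned long addr, unsigned long align_mask,
			      unsigned long align_bits)
{
	addr = (addr + align_mask) & ~align_mask;	/* round up to the mask */
	return addr | align_bits;			/* then set the slide bits */
}

int main(void)
{
	/* e.g. a 64-byte mask (0x3f) with invented randomization bits 0x15 */
	printf("%#lx\n", align_up(0x1234, 0x3f, 0x15));
	return 0;
}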
*/ - if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + (dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; /* @@ -1009,8 +1015,7 @@ out: local_db_restore(dr7); } -static __always_inline void exc_debug_user(struct pt_regs *regs, - unsigned long dr6) +static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { bool icebp; @@ -1094,6 +1099,34 @@ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { exc_debug_user(regs, debug_read_clear_dr6()); } + +#ifdef CONFIG_X86_FRED +/* + * When occurred on different ring level, i.e., from user or kernel + * context, #DB needs to be handled on different stack: User #DB on + * current task stack, while kernel #DB on a dedicated stack. + * + * This is exactly how FRED event delivery invokes an exception + * handler: ring 3 event on level 0 stack, i.e., current task stack; + * ring 0 event on the #DB dedicated stack specified in the + * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception + * entry stub doesn't do stack switch. + */ +DEFINE_FREDENTRY_DEBUG(exc_debug) +{ + /* + * FRED #DB stores DR6 on the stack in the format which + * debug_read_clear_dr6() returns for the IDT entry points. + */ + unsigned long dr6 = fred_event_data(regs); + + if (user_mode(regs)) + exc_debug_user(regs, dr6); + else + exc_debug_kernel(regs, dr6); +} +#endif /* CONFIG_X86_FRED */ + #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) @@ -1369,8 +1402,34 @@ DEFINE_IDTENTRY_SW(iret_error) } #endif +/* Do not enable FRED by default yet. */ +static bool enable_fred __ro_after_init = false; + +#ifdef CONFIG_X86_FRED +static int __init fred_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + return 0; + + if (!strcmp(str, "on")) + enable_fred = true; + else if (!strcmp(str, "off")) + enable_fred = false; + else + pr_warn("invalid FRED option: 'fred=%s'\n", str); + return 0; +} +early_param("fred", fred_setup); +#endif + void __init trap_init(void) { + if (cpu_feature_enabled(X86_FEATURE_FRED) && !enable_fred) + setup_clear_cpu_cap(X86_FEATURE_FRED); + /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); @@ -1379,7 +1438,10 @@ void __init trap_init(void) /* Initialize TSS before setting up traps so ISTs work */ cpu_init_exception_handling(); + /* Setup traps as cpu_init() might #GP */ - idt_setup_traps(); + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + idt_setup_traps(); + cpu_init(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 15f97c0abc..5a69a49acc 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -53,7 +53,7 @@ static int __read_mostly tsc_force_recalibrate; static u32 art_to_tsc_numerator; static u32 art_to_tsc_denominator; static u64 art_to_tsc_offset; -static struct clocksource *art_related_clocksource; +static bool have_art; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ @@ -652,7 +652,7 @@ success: } /** - * native_calibrate_tsc + * native_calibrate_tsc - determine TSC frequency * Determine TSC frequency via CPUID, else return 0. 
*/ unsigned long native_calibrate_tsc(void) @@ -1168,6 +1168,7 @@ static struct clocksource clocksource_tsc_early = { .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, + .id = CSID_X86_TSC_EARLY, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, @@ -1190,6 +1191,7 @@ static struct clocksource clocksource_tsc = { CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_MUST_VERIFY | CLOCK_SOURCE_VERIFY_PERCPU, + .id = CSID_X86_TSC, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, @@ -1309,8 +1311,10 @@ struct system_counterval_t convert_art_to_tsc(u64 art) do_div(tmp, art_to_tsc_denominator); res += tmp + art_to_tsc_offset; - return (struct system_counterval_t) {.cs = art_related_clocksource, - .cycles = res}; + return (struct system_counterval_t) { + .cs_id = have_art ? CSID_X86_TSC : CSID_GENERIC, + .cycles = res, + }; } EXPORT_SYMBOL(convert_art_to_tsc); @@ -1327,12 +1331,10 @@ EXPORT_SYMBOL(convert_art_to_tsc); * that this flag is set before conversion to TSC is attempted. * * Return: - * struct system_counterval_t - system counter value with the pointer to the - * corresponding clocksource - * @cycles: System counter value - * @cs: Clocksource corresponding to system counter value. Used - * by timekeeping code to verify comparability of two cycle - * values. + * struct system_counterval_t - system counter value with the ID of the + * corresponding clocksource: + * cycles: System counter value + * cs_id: The clocksource ID for validating comparability */ struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns) @@ -1347,8 +1349,10 @@ struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns) do_div(tmp, USEC_PER_SEC); res += tmp; - return (struct system_counterval_t) { .cs = art_related_clocksource, - .cycles = res}; + return (struct system_counterval_t) { + .cs_id = have_art ? CSID_X86_TSC : CSID_GENERIC, + .cycles = res, + }; } EXPORT_SYMBOL(convert_art_ns_to_tsc); @@ -1357,7 +1361,7 @@ static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); /** * tsc_refine_calibration_work - Further refine tsc freq calibration - * @work - ignored. + * @work: ignored. * * This functions uses delayed work over a period of a * second to further refine the TSC freq value. 
Since this is @@ -1455,7 +1459,7 @@ out: goto unreg; if (boot_cpu_has(X86_FEATURE_ART)) - art_related_clocksource = &clocksource_tsc; + have_art = true; clocksource_register_khz(&clocksource_tsc, tsc_khz); unreg: clocksource_unregister(&clocksource_tsc_early); @@ -1481,7 +1485,7 @@ static int __init init_tsc_clocksource(void) */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { if (boot_cpu_has(X86_FEATURE_ART)) - art_related_clocksource = &clocksource_tsc; + have_art = true; clocksource_register_khz(&clocksource_tsc, tsc_khz); clocksource_unregister(&clocksource_tsc_early); diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/vmcore_info_32.c index 8a89c109e2..5995a74928 100644 --- a/arch/x86/kernel/crash_core_32.c +++ b/arch/x86/kernel/vmcore_info_32.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <linux/crash_core.h> +#include <linux/vmcore_info.h> #include <linux/pgtable.h> #include <asm/setup.h> diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/vmcore_info_64.c index 7d255f882a..0dec7d8687 100644 --- a/arch/x86/kernel/crash_core_64.c +++ b/arch/x86/kernel/vmcore_info_64.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <linux/crash_core.h> +#include <linux/vmcore_info.h> #include <linux/pgtable.h> #include <asm/setup.h> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a349dbfc6d..56451fd209 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -46,6 +46,7 @@ ENTRY(phys_startup_64) #endif jiffies = jiffies_64; +const_pcpu_hot = pcpu_hot; #if defined(CONFIG_X86_64) /* @@ -132,7 +133,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT SOFTIRQENTRY_TEXT -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE *(.text..__x86.indirect_thunk) *(.text..__x86.return_thunk) #endif @@ -142,7 +143,7 @@ SECTIONS *(.text..__x86.rethunk_untrain) ENTRY_TEXT -#ifdef CONFIG_CPU_SRSO +#ifdef CONFIG_MITIGATION_SRSO /* * See the comment above srso_alias_untrain_ret()'s * definition. @@ -267,7 +268,7 @@ SECTIONS } #endif -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE /* * List of instructions that call/jmp/jcc to retpoline thunks * __x86_indirect_thunk_*(). These instructions can be patched along @@ -504,11 +505,11 @@ INIT_PER_CPU(irq_stack_backing_store); "fixed_percpu_data is not at start of per-cpu area"); #endif -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); #endif -#ifdef CONFIG_CPU_SRSO +#ifdef CONFIG_MITIGATION_SRSO . = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); /* * GNU ld cannot do XOR until 2.41. 
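Note on the assertions above: under the renamed CONFIG_MITIGATION_UNRET_ENTRY and CONFIG_MITIGATION_SRSO options, the linker script still checks that retbleed_return_thunk and srso_safe_ret start on a 64-byte boundary. A minimal userspace sketch of that mask test, using invented addresses rather than real vmlinux symbols:

#include <stdio.h>
#include <stdint.h>

static int cacheline_aligned(uint64_t addr)
{
	/* Same test as the ASSERT() lines: the low six address bits must be clear. */
	return (addr & 0x3f) == 0;
}

int main(void)
{
	printf("%d %d\n",
	       cacheline_aligned(0xffffffff81a00000ULL),	/* aligned     -> 1 */
	       cacheline_aligned(0xffffffff81a00020ULL));	/* not aligned -> 0 */
	return 0;
}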
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index d3fc017705..73511332bb 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -127,25 +127,12 @@ static void __init vsmp_cap_cpus(void)
 #endif
 }
 
-static u32 apicid_phys_pkg_id(u32 initial_apic_id, int index_msb)
-{
-	return read_apic_id() >> index_msb;
-}
-
-static void vsmp_apic_post_init(void)
-{
-	/* need to update phys_pkg_id */
-	apic->phys_pkg_id = apicid_phys_pkg_id;
-}
-
 void __init vsmp_init(void)
 {
 	detect_vsmp_box();
 	if (!is_vsmp_box())
 		return;
 
-	x86_platform.apic_post_init = vsmp_apic_post_init;
-
 	vsmp_cap_cpus();
 
 	set_vsmp_ctl();
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 3f0718b4a7..d5dc5a9263 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -72,8 +72,9 @@ struct x86_init_ops x86_init __initdata = {
 
 	.mpparse = {
 		.setup_ioapic_ids	= x86_init_noop,
-		.find_smp_config	= default_find_smp_config,
-		.get_smp_config		= default_get_smp_config,
+		.find_mptable		= mpparse_find_mptable,
+		.early_parse_smp_cfg	= mpparse_parse_early_smp_config,
+		.parse_smp_cfg		= mpparse_parse_smp_config,
 	},
 
 	.irqs = {
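For reference, the apicid_phys_pkg_id() callback removed from vsmp_64.c above derived the physical package ID by shifting the core/thread topology bits out of the APIC ID; the generic topology code now provides this. A standalone sketch of that computation, where the APIC ID and shift width are example values rather than anything read from hardware:

#include <stdio.h>
#include <stdint.h>

static uint32_t phys_pkg_id(uint32_t apic_id, int index_msb)
{
	/* Package ID = APIC ID with the SMT/core bits shifted away. */
	return apic_id >> index_msb;
}

int main(void)
{
	/* APIC ID 0x23 with 4 bits of core/thread topology -> package 2 */
	printf("package id: %u\n", phys_pkg_id(0x23, 4));
	return 0;
}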
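Earlier in this diff, prefill_possible_map() is dropped from smpboot.c because the common topology code now sizes the possible-CPU mask. The sketch below re-expresses the removed sizing rules as a standalone program; it assumes CONFIG_HOTPLUG_CPU=y and the input values are invented:

#include <stdio.h>

static int size_possible_cpus(int num_processors, int disabled_cpus,
			      int setup_possible_cpus,	/* possible_cpus= or -1 */
			      int setup_max_cpus,	/* 0 when booted with maxcpus=0 */
			      int nr_cpu_ids)		/* may be reduced via nr_cpus= */
{
	int i = setup_max_cpus ? setup_max_cpus : 1;
	int possible;

	if (setup_possible_cpus == -1) {
		possible = num_processors;
		if (setup_max_cpus)		/* CONFIG_HOTPLUG_CPU=y case */
			possible += disabled_cpus;
	} else {
		possible = setup_possible_cpus;
	}

	if (possible > nr_cpu_ids)		/* nr_cpus= wins */
		possible = nr_cpu_ids;
	if (!setup_max_cpus && possible > i)	/* maxcpus=0 caps to a single CPU */
		possible = i;

	return possible;
}

int main(void)
{
	/* 8 enabled + 8 hot-pluggable CPUs reported by firmware, no overrides -> 16 */
	printf("possible CPUs: %d\n", size_possible_cpus(8, 8, -1, 64, 64));
	return 0;
}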
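The align_vdso_addr() helper removed from sys_x86_64.c used the usual round-up-then-tag pattern: align the address up to the mask boundary, then OR in the per-boot randomized bits. A minimal illustration with made-up mask and bit values:

#include <stdio.h>
#include <stdint.h>

static uint64_t align_addr(uint64_t addr, uint64_t align_mask, uint64_t align_bits)
{
	addr = (addr + align_mask) & ~align_mask;	/* round up to the boundary */
	return addr | align_bits;			/* apply randomized bits within the mask */
}

int main(void)
{
	uint64_t mask = 0xffff;		/* hypothetical alignment mask (64 KiB - 1) */
	uint64_t bits = 0x3000;		/* hypothetical randomized bits, inside the mask */

	printf("0x%llx\n",
	       (unsigned long long)align_addr(0x7fff00001234ULL, mask, bits));
	return 0;
}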
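The new "fred=" early parameter added to traps.c only takes effect when the CPU advertises FRED, accepts "on" or "off", and warns on anything else; FRED stays disabled by default. A userspace sketch of that parsing logic, where cpu_has_fred and the stderr warning stand in for the kernel's feature check and pr_warn():

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool enable_fred;		/* default stays off, as in the patch */
static bool cpu_has_fred = true;	/* assumption for this example */

static int fred_setup(const char *str)
{
	if (!str)
		return -1;		/* -EINVAL in the kernel */
	if (!cpu_has_fred)
		return 0;		/* silently ignore on CPUs without FRED */

	if (!strcmp(str, "on"))
		enable_fred = true;
	else if (!strcmp(str, "off"))
		enable_fred = false;
	else
		fprintf(stderr, "invalid FRED option: 'fred=%s'\n", str);
	return 0;
}

int main(void)
{
	fred_setup("on");
	printf("FRED enabled: %s\n", enable_fred ? "yes" : "no");
	return 0;
}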
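In tsc.c, convert_art_to_tsc() and convert_art_ns_to_tsc() now return a clocksource ID (CSID_X86_TSC when ART is usable, CSID_GENERIC otherwise) instead of a clocksource pointer; the scaling itself is unchanged. A rough standalone sketch of the tsc = art * N / D + offset conversion, split into quotient and remainder so the multiplication cannot overflow 64 bits. N, D and the offset are invented here; on hardware with ART they come from CPUID:

#include <stdio.h>
#include <stdint.h>

static uint64_t art_to_tsc(uint64_t art, uint32_t num, uint32_t den, uint64_t offset)
{
	uint64_t quot = art / den;	/* whole multiples of the denominator */
	uint64_t rem  = art % den;	/* leftover, scaled separately */

	return quot * num + (rem * num) / den + offset;
}

int main(void)
{
	/* e.g. a 2:1 ART-to-TSC ratio with a small offset -> 2000042 */
	printf("tsc = %llu\n",
	       (unsigned long long)art_to_tsc(1000000ULL, 2, 1, 42));
	return 0;
}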