diff options
Diffstat (limited to 'arch/x86/kernel/cpu')
26 files changed, 1359 insertions, 1186 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4350f6bfc0..93eabf5440 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -54,6 +54,8 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o obj-$(CONFIG_ACRN_GUEST) += acrn.o +obj-$(CONFIG_DEBUG_FS) += debugfs.o + quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 6e4f23f314..2055fb308f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -382,7 +382,7 @@ static int nearby_node(int apicid) #endif /* - * Fix up cpu_core_id for pre-F17h systems to be in the + * Fix up topo::core_id for pre-F17h systems to be in the * [0 .. cores_per_node - 1] range. Not really needed but * kept so as not to break existing setups. */ @@ -394,7 +394,7 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c) return; cus_per_node = c->x86_max_cores / nodes_per_socket; - c->cpu_core_id %= cus_per_node; + c->topo.core_id %= cus_per_node; } /* @@ -405,8 +405,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c) */ static void amd_get_topology(struct cpuinfo_x86 *c) { - int cpu = smp_processor_id(); - /* get information required for multi-node processors */ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { int err; @@ -414,13 +412,13 @@ static void amd_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - c->cpu_die_id = ecx & 0xff; + c->topo.die_id = ecx & 0xff; if (c->x86 == 0x15) - c->cu_id = ebx & 0xff; + c->topo.cu_id = ebx & 0xff; if (c->x86 >= 0x17) { - c->cpu_core_id = ebx & 0xff; + c->topo.core_id = ebx & 0xff; if (smp_num_siblings > 1) c->x86_max_cores /= smp_num_siblings; @@ -434,15 +432,14 @@ static void amd_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); - cacheinfo_amd_init_llc_id(c, cpu); + cacheinfo_amd_init_llc_id(c); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - c->cpu_die_id = value & 7; - - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + c->topo.die_id = value & 7; + c->topo.llc_id = c->topo.die_id; } else return; @@ -459,15 +456,14 @@ static void amd_get_topology(struct cpuinfo_x86 *c) static void amd_detect_cmp(struct cpuinfo_x86 *c) { unsigned bits; - int cpu = smp_processor_id(); bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ - c->phys_proc_id = c->initial_apicid >> bits; + c->topo.pkg_id = c->topo.initial_apicid >> bits; /* use socket ID also for last level cache */ - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; + c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; } u32 amd_get_nodes_per_socket(void) @@ -481,11 +477,11 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA int cpu = smp_processor_id(); int node; - unsigned apicid = c->apicid; + unsigned apicid = c->topo.apicid; node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE) - node = get_llc_id(cpu); + node = per_cpu_llc_id(cpu); /* * On multi-fabric platform (e.g. Numascale NumaChip) a @@ -515,7 +511,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) * through CPU mapping may alter the outcome, directly * access __apicid_to_node[]. */ - int ht_nodeid = c->initial_apicid; + int ht_nodeid = c->topo.initial_apicid; if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) node = __apicid_to_node[ht_nodeid]; @@ -1014,7 +1010,6 @@ static bool cpu_has_zenbleed_microcode(void) default: return false; - break; } if (boot_cpu_data.microcode < good_rev) @@ -1044,6 +1039,8 @@ static void zenbleed_check(struct cpuinfo_x86 *c) static void init_amd(struct cpuinfo_x86 *c) { + u64 vm_cr; + early_init_amd(c); /* @@ -1060,7 +1057,7 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_FSRS); /* get apicid instead of initial apic id from cpuid */ - c->apicid = read_apic_id(); + c->topo.apicid = read_apic_id(); /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) @@ -1095,6 +1092,14 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); + if (cpu_has(c, X86_FEATURE_SVM)) { + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { + pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); + clear_cpu_cap(c, X86_FEATURE_SVM); + } + } + if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) { /* * Use LFENCE for execution serialization. On families which @@ -1157,6 +1162,9 @@ static void init_amd(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has_amd_erratum(c, amd_erratum_1485)) msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); + + /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ + clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0bc55472f3..bb0ab8466b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -63,7 +63,7 @@ EXPORT_SYMBOL_GPL(x86_pred_cmd); static DEFINE_MUTEX(spec_ctrl_mutex); -void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk; +void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) @@ -717,7 +717,7 @@ void update_gds_msr(void) case GDS_MITIGATION_UCODE_NEEDED: case GDS_MITIGATION_HYPERVISOR: return; - }; + } wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); @@ -1019,7 +1019,6 @@ static void __init retbleed_select_mitigation(void) do_cmd_auto: case RETBLEED_CMD_AUTO: - default: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) @@ -1042,8 +1041,7 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); - if (IS_ENABLED(CONFIG_RETHUNK)) - x86_return_thunk = retbleed_return_thunk; + x86_return_thunk = retbleed_return_thunk; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -1061,7 +1059,8 @@ do_cmd_auto: case RETBLEED_MITIGATION_STUFF: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); - x86_set_skl_return_thunk(); + + x86_return_thunk = call_depth_return_thunk; break; default: @@ -1290,6 +1289,8 @@ spectre_v2_user_select_mitigation(void) spectre_v2_user_ibpb = mode; switch (cmd) { + case SPECTRE_V2_USER_CMD_NONE: + break; case SPECTRE_V2_USER_CMD_FORCE: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: @@ -1301,8 +1302,6 @@ spectre_v2_user_select_mitigation(void) case SPECTRE_V2_USER_CMD_SECCOMP: static_branch_enable(&switch_mm_cond_ibpb); break; - default: - break; } pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", @@ -2160,6 +2159,10 @@ static int l1d_flush_prctl_get(struct task_struct *task) static int ssb_prctl_get(struct task_struct *task) { switch (ssb_mode) { + case SPEC_STORE_BYPASS_NONE: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + return PR_SPEC_ENABLE; + return PR_SPEC_NOT_AFFECTED; case SPEC_STORE_BYPASS_DISABLE: return PR_SPEC_DISABLE; case SPEC_STORE_BYPASS_SECCOMP: @@ -2171,11 +2174,8 @@ static int ssb_prctl_get(struct task_struct *task) if (task_spec_ssb_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - default: - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - return PR_SPEC_ENABLE; - return PR_SPEC_NOT_AFFECTED; } + BUG(); } static int ib_prctl_get(struct task_struct *task) @@ -2410,13 +2410,21 @@ static void __init srso_select_mitigation(void) { bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); - if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) - goto pred_cmd; + if (cpu_mitigations_off()) + return; + + if (!boot_cpu_has_bug(X86_BUG_SRSO)) { + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; + } if (has_microcode) { /* * Zen1/2 with SMT off aren't vulnerable after the right * IBPB microcode has been applied. + * + * Zen1/2 don't have SBPB, no need to try to enable it here. */ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { setup_force_cpu_cap(X86_FEATURE_SRSO_NO); @@ -2437,7 +2445,9 @@ static void __init srso_select_mitigation(void) switch (srso_cmd) { case SRSO_CMD_OFF: - goto pred_cmd; + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; case SRSO_CMD_MICROCODE: if (has_microcode) { @@ -2468,7 +2478,6 @@ static void __init srso_select_mitigation(void) srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; } else { pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); - goto pred_cmd; } break; @@ -2480,7 +2489,6 @@ static void __init srso_select_mitigation(void) } } else { pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); - goto pred_cmd; } break; @@ -2492,21 +2500,12 @@ static void __init srso_select_mitigation(void) } } else { pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); - goto pred_cmd; } break; - - default: - break; } out: pr_info("%s\n", srso_strings[srso_mitigation]); - -pred_cmd: - if ((!boot_cpu_has_bug(X86_BUG_SRSO) || srso_cmd == SRSO_CMD_OFF) && - boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; } #undef pr_fmt diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 8f86eacf69..c131c412db 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -661,7 +661,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -672,13 +672,13 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) if (c->x86 < 0x17) { /* LLC is at the node level. */ - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + c->topo.llc_id = c->topo.die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* * LLC is at the core complex level. * Core complex ID is ApicId[3] for these processors. */ - per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + c->topo.llc_id = c->topo.apicid >> 3; } else { /* * LLC ID is calculated from the number of threads sharing the @@ -694,12 +694,12 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) if (num_sharing_cache) { int bits = get_count_order(num_sharing_cache); - per_cpu(cpu_llc_id, cpu) = c->apicid >> bits; + c->topo.llc_id = c->topo.apicid >> bits; } } } -void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu) +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -712,7 +712,7 @@ void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu) * LLC is at the core complex level. * Core complex ID is ApicId[3] for these processors. */ - per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + c->topo.llc_id = c->topo.apicid >> 3; } void init_amd_cacheinfo(struct cpuinfo_x86 *c) @@ -740,9 +740,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; -#ifdef CONFIG_SMP - unsigned int cpu = c->cpu_index; -#endif if (c->cpuid_level > 3) { static int is_initialized; @@ -776,13 +773,13 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) new_l2 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - l2_id = c->apicid & ~((1 << index_msb) - 1); + l2_id = c->topo.apicid & ~((1 << index_msb) - 1); break; case 3: new_l3 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - l3_id = c->apicid & ~((1 << index_msb) - 1); + l3_id = c->topo.apicid & ~((1 << index_msb) - 1); break; default: break; @@ -856,30 +853,24 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l2) { l2 = new_l2; -#ifdef CONFIG_SMP - per_cpu(cpu_llc_id, cpu) = l2_id; - per_cpu(cpu_l2c_id, cpu) = l2_id; -#endif + c->topo.llc_id = l2_id; + c->topo.l2c_id = l2_id; } if (new_l3) { l3 = new_l3; -#ifdef CONFIG_SMP - per_cpu(cpu_llc_id, cpu) = l3_id; -#endif + c->topo.llc_id = l3_id; } -#ifdef CONFIG_SMP /* - * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in + * If llc_id is not yet set, this means cpuid_level < 4 which in * turns means that the only possibility is SMT (as indicated in * cpuid1). Since cpuid2 doesn't specify shared caches, and we know * that SMT shares all caches, we can unconditionally set cpu_llc_id to - * c->phys_proc_id. + * c->topo.pkg_id. */ - if (per_cpu(cpu_llc_id, cpu) == BAD_APICID) - per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; -#endif + if (c->topo.llc_id == BAD_APICID) + c->topo.llc_id = c->topo.pkg_id; c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); @@ -915,7 +906,7 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, unsigned int apicid, nshared, first, last; nshared = base->eax.split.num_threads_sharing + 1; - apicid = cpu_data(cpu).apicid; + apicid = cpu_data(cpu).topo.apicid; first = apicid - (apicid % nshared); last = first + nshared - 1; @@ -924,14 +915,14 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, if (!this_cpu_ci->info_list) continue; - apicid = cpu_data(i).apicid; + apicid = cpu_data(i).topo.apicid; if ((apicid < first) || (apicid > last)) continue; this_leaf = this_cpu_ci->info_list + index; for_each_online_cpu(sibling) { - apicid = cpu_data(sibling).apicid; + apicid = cpu_data(sibling).topo.apicid; if ((apicid < first) || (apicid > last)) continue; cpumask_set_cpu(sibling, @@ -969,7 +960,7 @@ static void __cache_cpumap_setup(unsigned int cpu, int index, index_msb = get_count_order(num_threads_sharing); for_each_online_cpu(i) - if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) { + if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) { struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); if (i == cpu || !sib_cpu_ci->info_list) @@ -1024,7 +1015,7 @@ static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - id4_regs->id = c->apicid >> index_msb; + id4_regs->id = c->topo.apicid >> index_msb; } int populate_cache_leaves(unsigned int cpu) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4e5ffc8b0e..98f7ea6b93 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -62,6 +62,7 @@ #include <asm/intel-family.h> #include <asm/cpu_device_id.h> #include <asm/uv/uv.h> +#include <asm/ia32.h> #include <asm/set_memory.h> #include <asm/traps.h> #include <asm/sev.h> @@ -74,18 +75,6 @@ u32 elf_hwcap2 __read_mostly; int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); -/* Last level cache ID of each logical CPU */ -DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; - -u16 get_llc_id(unsigned int cpu) -{ - return per_cpu(cpu_llc_id, cpu); -} -EXPORT_SYMBOL_GPL(get_llc_id); - -/* L2 cache ID of each logical CPU */ -DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID; - static struct ppin_info { int feature; int msr_ppin_ctl; @@ -914,7 +903,7 @@ void detect_ht(struct cpuinfo_x86 *c) return; index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); + c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb); smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -922,8 +911,8 @@ void detect_ht(struct cpuinfo_x86 *c) core_bits = get_count_order(c->x86_max_cores); - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & - ((1 << core_bits) - 1); + c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb) & + ((1 << core_bits) - 1); #endif } @@ -1114,18 +1103,34 @@ void get_cpu_cap(struct cpuinfo_x86 *c) void get_cpu_address_sizes(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; + bool vp_bits_from_cpuid = true; - if (c->extended_cpuid_level >= 0x80000008) { + if (!cpu_has(c, X86_FEATURE_CPUID) || + (c->extended_cpuid_level < 0x80000008)) + vp_bits_from_cpuid = false; + + if (vp_bits_from_cpuid) { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + } else { + if (IS_ENABLED(CONFIG_X86_64)) { + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; + } else { + c->x86_clflush_size = 32; + c->x86_virt_bits = 32; + c->x86_phys_bits = 32; + + if (cpu_has(c, X86_FEATURE_PAE) || + cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; + } } -#ifdef CONFIG_X86_32 - else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) - c->x86_phys_bits = 36; -#endif c->x86_cache_bits = c->x86_phys_bits; + c->x86_cache_alignment = c->x86_clflush_size; } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -1579,17 +1584,6 @@ static void __init cpu_parse_early_param(void) */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_64 - c->x86_clflush_size = 64; - c->x86_phys_bits = 36; - c->x86_virt_bits = 48; -#else - c->x86_clflush_size = 32; - c->x86_phys_bits = 32; - c->x86_virt_bits = 32; -#endif - c->x86_cache_alignment = c->x86_clflush_size; - memset(&c->x86_capability, 0, sizeof(c->x86_capability)); c->extended_cpuid_level = 0; @@ -1601,7 +1595,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); - get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); cpu_parse_early_param(); @@ -1617,6 +1610,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_CPUID); } + get_cpu_address_sizes(c); + setup_force_cpu_cap(X86_FEATURE_ALWAYS); cpu_set_bug_bits(c); @@ -1761,15 +1756,15 @@ static void generic_identify(struct cpuinfo_x86 *c) get_cpu_address_sizes(c); if (c->cpuid_level >= 0x00000001) { - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; + c->topo.initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; #ifdef CONFIG_X86_32 # ifdef CONFIG_SMP - c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); # else - c->apicid = c->initial_apicid; + c->topo.apicid = c->topo.initial_apicid; # endif #endif - c->phys_proc_id = c->initial_apicid; + c->topo.pkg_id = c->topo.initial_apicid; } get_model_name(c); /* Default name */ @@ -1799,18 +1794,19 @@ static void generic_identify(struct cpuinfo_x86 *c) static void validate_apic_and_package_id(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - unsigned int apicid, cpu = smp_processor_id(); + unsigned int cpu = smp_processor_id(); + u32 apicid; apicid = apic->cpu_present_to_apicid(cpu); - if (apicid != c->apicid) { + if (apicid != c->topo.apicid) { pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n", - cpu, apicid, c->initial_apicid); + cpu, apicid, c->topo.initial_apicid); } - BUG_ON(topology_update_package_map(c->phys_proc_id, cpu)); - BUG_ON(topology_update_die_map(c->cpu_die_id, cpu)); + BUG_ON(topology_update_package_map(c->topo.pkg_id, cpu)); + BUG_ON(topology_update_die_map(c->topo.die_id, cpu)); #else - c->logical_proc_id = 0; + c->topo.logical_pkg_id = 0; #endif } @@ -1829,7 +1825,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; c->x86_coreid_bits = 0; - c->cu_id = 0xff; + c->topo.cu_id = 0xff; + c->topo.llc_id = BAD_APICID; + c->topo.l2c_id = BAD_APICID; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1855,9 +1853,16 @@ static void identify_cpu(struct cpuinfo_x86 *c) apply_forced_caps(c); #ifdef CONFIG_X86_64 - c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); #endif + + /* + * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and + * Hygon will clear it in ->c_init() below. + */ + set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); + /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are @@ -2074,24 +2079,24 @@ void syscall_init(void) wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); -#ifdef CONFIG_IA32_EMULATION - wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); - /* - * This only works on Intel CPUs. - * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. - * This does not cause SYSENTER to jump to the wrong location, because - * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). - */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, - (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); -#else - wrmsrl_cstar((unsigned long)ignore_sysret); - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); -#endif + if (ia32_enabled()) { + wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); + /* + * This only works on Intel CPUs. + * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. + * This does not cause SYSENTER to jump to the wrong location, because + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). + */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + } else { + wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); + } /* * Flags to clear on syscall; clear as much as possible @@ -2166,8 +2171,6 @@ static inline void setup_getcpu(int cpu) } #ifdef CONFIG_X86_64 -static inline void ucode_cpu_init(int cpu) { } - static inline void tss_setup_ist(struct tss_struct *tss) { /* Set up the per-CPU TSS IST stacks */ @@ -2178,16 +2181,8 @@ static inline void tss_setup_ist(struct tss_struct *tss) /* Only mapped when SEV-ES is active */ tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC); } - #else /* CONFIG_X86_64 */ - -static inline void ucode_cpu_init(int cpu) -{ - show_ucode_info_early(); -} - static inline void tss_setup_ist(struct tss_struct *tss) { } - #endif /* !CONFIG_X86_64 */ static inline void tss_setup_io_bitmap(struct tss_struct *tss) @@ -2243,8 +2238,6 @@ void cpu_init(void) struct task_struct *cur = current; int cpu = raw_smp_processor_id(); - ucode_cpu_init(cpu); - #ifdef CONFIG_NUMA if (this_cpu_read(numa_node) == 0 && early_cpu_to_node(cpu) != NUMA_NO_NODE) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1dcd7d4e38..885281ae79 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -78,6 +78,9 @@ extern int detect_ht_early(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c); +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c); + unsigned int aperfmperf_get_khz(int cpu); void cpu_select_mitigations(void); diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c new file mode 100644 index 0000000000..0c179d684b --- /dev/null +++ b/arch/x86/kernel/cpu/debugfs.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/debugfs.h> + +#include <asm/apic.h> +#include <asm/processor.h> + +static int cpu_debug_show(struct seq_file *m, void *p) +{ + unsigned long cpu = (unsigned long)m->private; + struct cpuinfo_x86 *c = per_cpu_ptr(&cpu_info, cpu); + + seq_printf(m, "online: %d\n", cpu_online(cpu)); + if (!c->initialized) + return 0; + + seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid); + seq_printf(m, "apicid: %x\n", c->topo.apicid); + seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id); + seq_printf(m, "die_id: %u\n", c->topo.die_id); + seq_printf(m, "cu_id: %u\n", c->topo.cu_id); + seq_printf(m, "core_id: %u\n", c->topo.core_id); + seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); + seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); + seq_printf(m, "llc_id: %u\n", c->topo.llc_id); + seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); + seq_printf(m, "max_cores: %u\n", c->x86_max_cores); + seq_printf(m, "max_die_per_pkg: %u\n", __max_die_per_package); + seq_printf(m, "smp_num_siblings: %u\n", smp_num_siblings); + return 0; +} + +static int cpu_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, cpu_debug_show, inode->i_private); +} + +static const struct file_operations dfs_cpu_ops = { + .open = cpu_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int cpu_init_debugfs(void) +{ + struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir); + unsigned long id; + char name[24]; + + dir = debugfs_create_dir("cpus", base); + for_each_possible_cpu(id) { + sprintf(name, "%lu", id); + debugfs_create_file(name, 0444, dir, (void *)id, &dfs_cpu_ops); + } + return 0; +} +late_initcall(cpu_init_debugfs); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index a7b3ef4c4d..f0cd95502f 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -63,8 +63,6 @@ static void hygon_get_topology_early(struct cpuinfo_x86 *c) */ static void hygon_get_topology(struct cpuinfo_x86 *c) { - int cpu = smp_processor_id(); - /* get information required for multi-node processors */ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { int err; @@ -72,9 +70,9 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - c->cpu_die_id = ecx & 0xff; + c->topo.die_id = ecx & 0xff; - c->cpu_core_id = ebx & 0xff; + c->topo.core_id = ebx & 0xff; if (smp_num_siblings > 1) c->x86_max_cores /= smp_num_siblings; @@ -92,16 +90,15 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) * when running on host. */ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3) - c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; + c->topo.pkg_id = c->topo.apicid >> APICID_SOCKET_ID_BIT; - cacheinfo_hygon_init_llc_id(c, cpu); + cacheinfo_hygon_init_llc_id(c); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - c->cpu_die_id = value & 7; - - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; + c->topo.die_id = value & 7; + c->topo.llc_id = c->topo.die_id; } else return; @@ -116,15 +113,14 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) static void hygon_detect_cmp(struct cpuinfo_x86 *c) { unsigned int bits; - int cpu = smp_processor_id(); bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ - c->phys_proc_id = c->initial_apicid >> bits; - /* use socket ID also for last level cache */ - per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; + c->topo.pkg_id = c->topo.initial_apicid >> bits; + /* Use package ID also for last level cache */ + c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; } static void srat_detect_node(struct cpuinfo_x86 *c) @@ -132,11 +128,11 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA int cpu = smp_processor_id(); int node; - unsigned int apicid = c->apicid; + unsigned int apicid = c->topo.apicid; node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE) - node = per_cpu(cpu_llc_id, cpu); + node = c->topo.llc_id; /* * On multi-fabric platform (e.g. Numascale NumaChip) a @@ -165,7 +161,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) * through CPU mapping may alter the outcome, directly * access __apicid_to_node[]. */ - int ht_nodeid = c->initial_apicid; + int ht_nodeid = c->topo.initial_apicid; if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) node = __apicid_to_node[ht_nodeid]; @@ -294,6 +290,8 @@ static void early_init_hygon(struct cpuinfo_x86 *c) static void init_hygon(struct cpuinfo_x86 *c) { + u64 vm_cr; + early_init_hygon(c); /* @@ -305,7 +303,7 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* get apicid instead of initial apic id from cpuid */ - c->apicid = read_apic_id(); + c->topo.apicid = read_apic_id(); /* * XXX someone from Hygon needs to confirm this DTRT @@ -324,6 +322,14 @@ static void init_hygon(struct cpuinfo_x86 *c) init_hygon_cacheinfo(c); + if (cpu_has(c, X86_FEATURE_SVM)) { + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { + pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); + clear_cpu_cap(c, X86_FEATURE_SVM); + } + } + if (cpu_has(c, X86_FEATURE_XMM2)) { /* * Use LFENCE for execution serialization. On families which @@ -348,6 +354,9 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); check_null_seg_clears_base(c); + + /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ + clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); } static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index be4045628f..a927a8fc96 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -314,19 +314,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_PGE); } - if (c->cpuid_level >= 0x00000001) { - u32 eax, ebx, ecx, edx; - - cpuid(0x00000001, &eax, &ebx, &ecx, &edx); - /* - * If HTT (EDX[28]) is set EBX[16:23] contain the number of - * apicids which are reserved per package. Store the resulting - * shift value for the package management code. - */ - if (edx & (1U << 28)) - c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); - } - check_memory_type_self_snoop_errata(c); /* @@ -1016,7 +1003,6 @@ static struct ctl_table sld_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init sld_mitigate_sysctl_init(void) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index c267f43de3..f3517b8a8e 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -713,17 +713,75 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } -bool amd_mce_is_memory_error(struct mce *m) +/* + * DRAM ECC errors are reported in the Northbridge (bank 4) with + * Extended Error Code 8. + */ +static bool legacy_mce_is_memory_error(struct mce *m) +{ + return m->bank == 4 && XEC(m->status, 0x1f) == 8; +} + +/* + * DRAM ECC errors are reported in Unified Memory Controllers with + * Extended Error Code 0. + */ +static bool smca_mce_is_memory_error(struct mce *m) { enum smca_bank_types bank_type; - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; + + if (XEC(m->status, 0x3f)) + return false; bank_type = smca_get_bank_type(m->extcpu, m->bank); + + return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2; +} + +bool amd_mce_is_memory_error(struct mce *m) +{ if (mce_flags.smca) - return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0; + return smca_mce_is_memory_error(m); + else + return legacy_mce_is_memory_error(m); +} + +/* + * AMD systems do not have an explicit indicator that the value in MCA_ADDR is + * a system physical address. Therefore, individual cases need to be detected. + * Future cases and checks will be added as needed. + * + * 1) General case + * a) Assume address is not usable. + * 2) Poison errors + * a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy + * northbridge (bank 4). + * b) Refers to poison consumption in the core. Does not include "no action", + * "action optional", or "deferred" error severities. + * c) Will include a usable address so that immediate action can be taken. + * 3) Northbridge DRAM ECC errors + * a) Reported in legacy bank 4 with extended error code (XEC) 8. + * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore, + * this bit should not be checked. + * + * NOTE: SMCA UMC memory errors fall into case #1. + */ +bool amd_mce_usable_address(struct mce *m) +{ + /* Check special northbridge case 3) first. */ + if (!mce_flags.smca) { + if (legacy_mce_is_memory_error(m)) + return true; + else if (m->bank == 4) + return false; + } - return m->bank == 4 && xec == 0x8; + /* Check poison bit for all other bank types. */ + if (m->status & MCI_STATUS_POISON) + return true; + + /* Assume address is not usable for all others. */ + return false; } static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 8ed3417146..7f7309ff67 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -103,9 +103,9 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) m.socketid = -1; for_each_possible_cpu(cpu) { - if (cpu_data(cpu).initial_apicid == lapic_id) { + if (cpu_data(cpu).topo.initial_apicid == lapic_id) { m.extcpu = cpu; - m.socketid = cpu_data(m.extcpu).phys_proc_id; + m.socketid = cpu_data(m.extcpu).topo.pkg_id; break; } } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 6f35f724cc..df8d25e744 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -44,6 +44,7 @@ #include <linux/sync_core.h> #include <linux/task_work.h> #include <linux/hardirq.h> +#include <linux/kexec.h> #include <asm/intel-family.h> #include <asm/processor.h> @@ -123,8 +124,8 @@ void mce_setup(struct mce *m) m->time = __ktime_get_real_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); - m->socketid = cpu_data(m->extcpu).phys_proc_id; - m->apicid = cpu_data(m->extcpu).initial_apicid; + m->socketid = cpu_data(m->extcpu).topo.pkg_id; + m->apicid = cpu_data(m->extcpu).topo.initial_apicid; m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); m->ppin = cpu_data(m->extcpu).ppin; m->microcode = boot_cpu_data.microcode; @@ -233,6 +234,7 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) struct llist_node *pending; struct mce_evt_llist *l; int apei_err = 0; + struct page *p; /* * Allow instrumentation around external facilities usage. Not that it @@ -286,6 +288,20 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mca_cfg.panic_timeout; + + /* + * Kdump skips the poisoned page in order to avoid + * touching the error bits again. Poison the page even + * if the error is fatal and the machine is about to + * panic. + */ + if (kexec_crash_loaded()) { + if (final && (final->status & MCI_STATUS_ADDRV)) { + p = pfn_to_online_page(final->addr >> PAGE_SHIFT); + if (p) + SetPageHWPoison(p); + } + } panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); @@ -453,32 +469,22 @@ static void mce_irq_work_cb(struct irq_work *entry) mce_schedule_work(); } -/* - * Check if the address reported by the CPU is in a format we can parse. - * It would be possible to add code for most other cases, but all would - * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granularity for now. - */ -int mce_usable_address(struct mce *m) +bool mce_usable_address(struct mce *m) { if (!(m->status & MCI_STATUS_ADDRV)) - return 0; - - /* Checks after this one are Intel/Zhaoxin-specific: */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && - boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) - return 1; - - if (!(m->status & MCI_STATUS_MISCV)) - return 0; + return false; - if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) - return 0; + switch (m->cpuvendor) { + case X86_VENDOR_AMD: + return amd_mce_usable_address(m); - if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) - return 0; + case X86_VENDOR_INTEL: + case X86_VENDOR_ZHAOXIN: + return intel_mce_usable_address(m); - return 1; + default: + return true; + } } EXPORT_SYMBOL_GPL(mce_usable_address); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f5323551c1..52bce533dd 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -536,3 +536,23 @@ bool intel_filter_mce(struct mce *m) return false; } + +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses up to page granularity for now. + */ +bool intel_mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_MISCV)) + return false; + + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) + return false; + + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) + return false; + + return true; +} diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index bcf1b3c66c..e13a26c9c0 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -49,6 +49,7 @@ void intel_init_cmci(void); void intel_init_lmce(void); void intel_clear_lmce(void); bool intel_filter_mce(struct mce *m); +bool intel_mce_usable_address(struct mce *m); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } @@ -58,6 +59,7 @@ static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } static inline void intel_clear_lmce(void) { } static inline bool intel_filter_mce(struct mce *m) { return false; } +static inline bool intel_mce_usable_address(struct mce *m) { return false; } #endif void mce_timer_kick(unsigned long interval); @@ -210,6 +212,7 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); +bool amd_mce_usable_address(struct mce *m); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -237,6 +240,7 @@ static __always_inline void smca_extract_err_addr(struct mce *m) #else static inline bool amd_filter_mce(struct mce *m) { return false; } +static inline bool amd_mce_usable_address(struct mce *m) { return false; } static inline void smca_extract_err_addr(struct mce *m) { } #endif diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index bbd1dc38ea..13b45b9c80 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -37,6 +37,16 @@ #include "internal.h" +struct ucode_patch { + struct list_head plist; + void *data; + unsigned int size; + u32 patch_id; + u16 equiv_cpu; +}; + +static LIST_HEAD(microcode_cache); + #define UCODE_MAGIC 0x00414d44 #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 @@ -94,8 +104,6 @@ struct cont_desc { size_t size; }; -static u32 ucode_new_rev; - /* * Microcode patch container file is prepended to the initrd in cpio * format. See Documentation/arch/x86/microcode.rst @@ -121,24 +129,20 @@ static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) /* * Check whether there is a valid microcode container file at the beginning - * of @buf of size @buf_size. Set @early to use this function in the early path. + * of @buf of size @buf_size. */ -static bool verify_container(const u8 *buf, size_t buf_size, bool early) +static bool verify_container(const u8 *buf, size_t buf_size) { u32 cont_magic; if (buf_size <= CONTAINER_HDR_SZ) { - if (!early) - pr_debug("Truncated microcode container header.\n"); - + pr_debug("Truncated microcode container header.\n"); return false; } cont_magic = *(const u32 *)buf; if (cont_magic != UCODE_MAGIC) { - if (!early) - pr_debug("Invalid magic value (0x%08x).\n", cont_magic); - + pr_debug("Invalid magic value (0x%08x).\n", cont_magic); return false; } @@ -147,23 +151,20 @@ static bool verify_container(const u8 *buf, size_t buf_size, bool early) /* * Check whether there is a valid, non-truncated CPU equivalence table at the - * beginning of @buf of size @buf_size. Set @early to use this function in the - * early path. + * beginning of @buf of size @buf_size. */ -static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early) +static bool verify_equivalence_table(const u8 *buf, size_t buf_size) { const u32 *hdr = (const u32 *)buf; u32 cont_type, equiv_tbl_len; - if (!verify_container(buf, buf_size, early)) + if (!verify_container(buf, buf_size)) return false; cont_type = hdr[1]; if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) { - if (!early) - pr_debug("Wrong microcode container equivalence table type: %u.\n", - cont_type); - + pr_debug("Wrong microcode container equivalence table type: %u.\n", + cont_type); return false; } @@ -172,9 +173,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early) equiv_tbl_len = hdr[2]; if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) || buf_size < equiv_tbl_len) { - if (!early) - pr_debug("Truncated equivalence table.\n"); - + pr_debug("Truncated equivalence table.\n"); return false; } @@ -183,22 +182,19 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early) /* * Check whether there is a valid, non-truncated microcode patch section at the - * beginning of @buf of size @buf_size. Set @early to use this function in the - * early path. + * beginning of @buf of size @buf_size. * * On success, @sh_psize returns the patch size according to the section header, * to the caller. */ static bool -__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize, bool early) +__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) { u32 p_type, p_size; const u32 *hdr; if (buf_size < SECTION_HDR_SIZE) { - if (!early) - pr_debug("Truncated patch section.\n"); - + pr_debug("Truncated patch section.\n"); return false; } @@ -207,17 +203,13 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize, bool early p_size = hdr[1]; if (p_type != UCODE_UCODE_TYPE) { - if (!early) - pr_debug("Invalid type field (0x%x) in container file section header.\n", - p_type); - + pr_debug("Invalid type field (0x%x) in container file section header.\n", + p_type); return false; } if (p_size < sizeof(struct microcode_header_amd)) { - if (!early) - pr_debug("Patch of size %u too short.\n", p_size); - + pr_debug("Patch of size %u too short.\n", p_size); return false; } @@ -269,7 +261,7 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size * 0: success */ static int -verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool early) +verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size) { struct microcode_header_amd *mc_hdr; unsigned int ret; @@ -277,7 +269,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool ea u16 proc_id; u8 patch_fam; - if (!__verify_patch_section(buf, buf_size, &sh_psize, early)) + if (!__verify_patch_section(buf, buf_size, &sh_psize)) return -1; /* @@ -292,16 +284,13 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool ea * size sh_psize, as the section claims. */ if (buf_size < sh_psize) { - if (!early) - pr_debug("Patch of size %u truncated.\n", sh_psize); - + pr_debug("Patch of size %u truncated.\n", sh_psize); return -1; } ret = __verify_patch_size(family, sh_psize, buf_size); if (!ret) { - if (!early) - pr_debug("Per-family patch size mismatch.\n"); + pr_debug("Per-family patch size mismatch.\n"); return -1; } @@ -309,8 +298,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool ea mc_hdr = (struct microcode_header_amd *)(buf + SECTION_HDR_SIZE); if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { - if (!early) - pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", mc_hdr->patch_id); + pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", mc_hdr->patch_id); return -1; } @@ -337,7 +325,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) u16 eq_id; u8 *buf; - if (!verify_equivalence_table(ucode, size, true)) + if (!verify_equivalence_table(ucode, size)) return 0; buf = ucode; @@ -364,7 +352,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) u32 patch_size; int ret; - ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size, true); + ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size); if (ret < 0) { /* * Patch verification failed, skip to the next container, if @@ -452,19 +440,12 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * * Returns true if container found (sets @desc), false otherwise. */ -static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size) +static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size) { struct cont_desc desc = { 0 }; struct microcode_amd *mc; - u32 rev, dummy, *new_rev; bool ret = false; -#ifdef CONFIG_X86_32 - new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); -#else - new_rev = &ucode_new_rev; -#endif - desc.cpuid_1_eax = cpuid_1_eax; scan_containers(ucode, size, &desc); @@ -473,22 +454,15 @@ static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size) if (!mc) return ret; - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* * Allow application of the same revision to pick up SMT-specific * changes even if the revision of the other SMT thread is already * up-to-date. */ - if (rev > mc->hdr.patch_id) + if (old_rev > mc->hdr.patch_id) return ret; - if (!__apply_microcode_amd(mc)) { - *new_rev = mc->hdr.patch_id; - ret = true; - } - - return ret; + return !__apply_microcode_amd(mc); } static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) @@ -501,7 +475,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) if (family >= 0x15) snprintf(fw_name, sizeof(fw_name), - "amd-ucode/microcode_amd_fam%.2xh.bin", family); + "amd-ucode/microcode_amd_fam%02hhxh.bin", family); if (firmware_request_builtin(&fw, fw_name)) { cp->size = fw.size; @@ -512,57 +486,48 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) return false; } -static void find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret) +static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret) { - struct ucode_cpu_info *uci; struct cpio_data cp; - const char *path; - bool use_pa; - - if (IS_ENABLED(CONFIG_X86_32)) { - uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info); - path = (const char *)__pa_nodebug(ucode_path); - use_pa = true; - } else { - uci = ucode_cpu_info; - path = ucode_path; - use_pa = false; - } if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) - cp = find_microcode_in_initrd(path, use_pa); - - /* Needed in load_microcode_amd() */ - uci->cpu_sig.sig = cpuid_1_eax; + cp = find_microcode_in_initrd(ucode_path); *ret = cp; } -static void apply_ucode_from_containers(unsigned int cpuid_1_eax) +void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax) { struct cpio_data cp = { }; + u32 dummy; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy); + + /* Needed in load_microcode_amd() */ + ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax; find_blobs_in_containers(cpuid_1_eax, &cp); if (!(cp.data && cp.size)) return; - early_apply_microcode(cpuid_1_eax, cp.data, cp.size); -} - -void load_ucode_amd_early(unsigned int cpuid_1_eax) -{ - return apply_ucode_from_containers(cpuid_1_eax); + if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size)) + native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy); } static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); -int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) +static int __init save_microcode_in_initrd(void) { + unsigned int cpuid_1_eax = native_cpuid_eax(1); + struct cpuinfo_x86 *c = &boot_cpu_data; struct cont_desc desc = { 0 }; enum ucode_state ret; struct cpio_data cp; - cp = find_microcode_in_initrd(ucode_path, false); + if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) + return 0; + + find_blobs_in_containers(cpuid_1_eax, &cp); if (!(cp.data && cp.size)) return -EINVAL; @@ -578,6 +543,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) return 0; } +early_initcall(save_microcode_in_initrd); /* * a small, trivial cache of per-family ucode patches @@ -631,7 +597,6 @@ static struct ucode_patch *find_patch(unsigned int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; u16 equiv_id; - equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); if (!equiv_id) return NULL; @@ -654,10 +619,8 @@ void reload_ucode_amd(unsigned int cpu) rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); if (rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) { - ucode_new_rev = mc->hdr.patch_id; - pr_info("reload patch_level=0x%08x\n", ucode_new_rev); - } + if (!__apply_microcode_amd(mc)) + pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id); } } @@ -678,8 +641,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) if (p && (p->patch_id == csig->rev)) uci->mc = p->data; - pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); - return 0; } @@ -720,8 +681,6 @@ static enum ucode_state apply_microcode_amd(int cpu) rev = mc_amd->hdr.patch_id; ret = UCODE_UPDATED; - pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); - out: uci->cpu_sig.rev = rev; c->microcode = rev; @@ -733,12 +692,20 @@ out: return ret; } +void load_ucode_amd_ap(unsigned int cpuid_1_eax) +{ + unsigned int cpu = smp_processor_id(); + + ucode_cpu_info[cpu].cpu_sig.sig = cpuid_1_eax; + apply_microcode_amd(cpu); +} + static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size) { u32 equiv_tbl_len; const u32 *hdr; - if (!verify_equivalence_table(buf, buf_size, false)) + if (!verify_equivalence_table(buf, buf_size)) return 0; hdr = (const u32 *)buf; @@ -784,7 +751,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, u16 proc_id; int ret; - ret = verify_patch(family, fw, leftover, patch_size, false); + ret = verify_patch(family, fw, leftover, patch_size); if (ret) return ret; @@ -909,6 +876,9 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) enum ucode_state ret = UCODE_NFOUND; const struct firmware *fw; + if (force_minrev) + return UCODE_NFOUND; + if (c->x86 >= 0x15) snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); @@ -918,7 +888,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) } ret = UCODE_ERROR; - if (!verify_container(fw->data, fw->size, false)) + if (!verify_container(fw->data, fw->size)) goto fw_release; ret = load_microcode_amd(c->x86, fw->data, fw->size); @@ -938,10 +908,11 @@ static void microcode_fini_cpu_amd(int cpu) } static struct microcode_ops microcode_amd_ops = { - .request_microcode_fw = request_microcode_amd, - .collect_cpu_info = collect_cpu_info_amd, - .apply_microcode = apply_microcode_amd, - .microcode_fini_cpu = microcode_fini_cpu_amd, + .request_microcode_fw = request_microcode_amd, + .collect_cpu_info = collect_cpu_info_amd, + .apply_microcode = apply_microcode_amd, + .microcode_fini_cpu = microcode_fini_cpu_amd, + .nmi_safe = true, }; struct microcode_ops * __init init_amd_microcode(void) @@ -952,11 +923,6 @@ struct microcode_ops * __init init_amd_microcode(void) pr_warn("AMD CPU family 0x%x not supported\n", c->x86); return NULL; } - - if (ucode_new_rev) - pr_info_once("microcode updated early to new patch_level=0x%08x\n", - ucode_new_rev); - return µcode_amd_ops; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index a4ebd5e0ae..232026a239 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -23,6 +23,7 @@ #include <linux/miscdevice.h> #include <linux/capability.h> #include <linux/firmware.h> +#include <linux/cpumask.h> #include <linux/kernel.h> #include <linux/delay.h> #include <linux/mutex.h> @@ -31,6 +32,7 @@ #include <linux/fs.h> #include <linux/mm.h> +#include <asm/apic.h> #include <asm/cpu_device_id.h> #include <asm/perf_event.h> #include <asm/processor.h> @@ -39,14 +41,11 @@ #include "internal.h" -#define DRIVER_VERSION "2.2" - static struct microcode_ops *microcode_ops; -static bool dis_ucode_ldr = true; - -bool initrd_gone; +bool dis_ucode_ldr = true; -LIST_HEAD(microcode_cache); +bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV); +module_param(force_minrev, bool, S_IRUSR | S_IWUSR); /* * Synchronization. @@ -76,6 +75,8 @@ static u32 final_levels[] = { 0, /* T-101 terminator */ }; +struct early_load_data early_data; + /* * Check the current patch level on this CPU. * @@ -90,10 +91,7 @@ static bool amd_check_current_patch_level(void) native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); - if (IS_ENABLED(CONFIG_X86_32)) - levels = (u32 *)__pa_nodebug(&final_levels); - else - levels = final_levels; + levels = final_levels; for (i = 0; levels[i]; i++) { if (lvl == levels[i]) @@ -105,17 +103,8 @@ static bool amd_check_current_patch_level(void) static bool __init check_loader_disabled_bsp(void) { static const char *__dis_opt_str = "dis_ucode_ldr"; - -#ifdef CONFIG_X86_32 - const char *cmdline = (const char *)__pa_nodebug(boot_command_line); - const char *option = (const char *)__pa_nodebug(__dis_opt_str); - bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); - -#else /* CONFIG_X86_64 */ const char *cmdline = boot_command_line; const char *option = __dis_opt_str; - bool *res = &dis_ucode_ldr; -#endif /* * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not @@ -123,17 +112,17 @@ static bool __init check_loader_disabled_bsp(void) * that's good enough as they don't land on the BSP path anyway. */ if (native_cpuid_ecx(1) & BIT(31)) - return *res; + return true; if (x86_cpuid_vendor() == X86_VENDOR_AMD) { if (amd_check_current_patch_level()) - return *res; + return true; } if (cmdline_find_option_bool(cmdline, option) <= 0) - *res = false; + dis_ucode_ldr = false; - return *res; + return dis_ucode_ldr; } void __init load_ucode_bsp(void) @@ -166,25 +155,16 @@ void __init load_ucode_bsp(void) return; if (intel) - load_ucode_intel_bsp(); + load_ucode_intel_bsp(&early_data); else - load_ucode_amd_early(cpuid_1_eax); -} - -static bool check_loader_disabled_ap(void) -{ -#ifdef CONFIG_X86_32 - return *((bool *)__pa_nodebug(&dis_ucode_ldr)); -#else - return dis_ucode_ldr; -#endif + load_ucode_amd_bsp(&early_data, cpuid_1_eax); } void load_ucode_ap(void) { unsigned int cpuid_1_eax; - if (check_loader_disabled_ap()) + if (dis_ucode_ldr) return; cpuid_1_eax = native_cpuid_eax(1); @@ -196,103 +176,44 @@ void load_ucode_ap(void) break; case X86_VENDOR_AMD: if (x86_family(cpuid_1_eax) >= 0x10) - load_ucode_amd_early(cpuid_1_eax); + load_ucode_amd_ap(cpuid_1_eax); break; default: break; } } -static int __init save_microcode_in_initrd(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - int ret = -EINVAL; - - if (dis_ucode_ldr) { - ret = 0; - goto out; - } - - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - if (c->x86 >= 6) - ret = save_microcode_in_initrd_intel(); - break; - case X86_VENDOR_AMD: - if (c->x86 >= 0x10) - ret = save_microcode_in_initrd_amd(cpuid_eax(1)); - break; - default: - break; - } - -out: - initrd_gone = true; - - return ret; -} - -struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) +struct cpio_data __init find_microcode_in_initrd(const char *path) { #ifdef CONFIG_BLK_DEV_INITRD unsigned long start = 0; size_t size; #ifdef CONFIG_X86_32 - struct boot_params *params; - - if (use_pa) - params = (struct boot_params *)__pa_nodebug(&boot_params); - else - params = &boot_params; - - size = params->hdr.ramdisk_size; - - /* - * Set start only if we have an initrd image. We cannot use initrd_start - * because it is not set that early yet. - */ + size = boot_params.hdr.ramdisk_size; + /* Early load on BSP has a temporary mapping. */ if (size) - start = params->hdr.ramdisk_image; + start = initrd_start_early; -# else /* CONFIG_X86_64 */ +#else /* CONFIG_X86_64 */ size = (unsigned long)boot_params.ext_ramdisk_size << 32; size |= boot_params.hdr.ramdisk_size; if (size) { start = (unsigned long)boot_params.ext_ramdisk_image << 32; start |= boot_params.hdr.ramdisk_image; - start += PAGE_OFFSET; } -# endif +#endif /* * Fixup the start address: after reserve_initrd() runs, initrd_start * has the virtual address of the beginning of the initrd. It also * possibly relocates the ramdisk. In either case, initrd_start contains * the updated address so use that instead. - * - * initrd_gone is for the hotplug case where we've thrown out initrd - * already. */ - if (!use_pa) { - if (initrd_gone) - return (struct cpio_data){ NULL, 0, "" }; - if (initrd_start) - start = initrd_start; - } else { - /* - * The picture with physical addresses is a bit different: we - * need to get the *physical* address to which the ramdisk was - * relocated, i.e., relocated_ramdisk (not initrd_start) and - * since we're running from physical addresses, we need to access - * relocated_ramdisk through its *physical* address too. - */ - u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk); - if (*rr) - start = *rr; - } + if (initrd_start) + start = initrd_start; return find_cpio_data(path, (void *)start, size, NULL); #else /* !CONFIG_BLK_DEV_INITRD */ @@ -336,117 +257,298 @@ static struct platform_device *microcode_pdev; * requirement can be relaxed in the future. Right now, this is conservative * and good. */ -#define SPINUNIT 100 /* 100 nsec */ +enum sibling_ctrl { + /* Spinwait with timeout */ + SCTRL_WAIT, + /* Invoke the microcode_apply() callback */ + SCTRL_APPLY, + /* Proceed without invoking the microcode_apply() callback */ + SCTRL_DONE, +}; + +struct microcode_ctrl { + enum sibling_ctrl ctrl; + enum ucode_state result; + unsigned int ctrl_cpu; + bool nmi_enabled; +}; -static int check_online_cpus(void) +DEFINE_STATIC_KEY_FALSE(microcode_nmi_handler_enable); +static DEFINE_PER_CPU(struct microcode_ctrl, ucode_ctrl); +static atomic_t late_cpus_in, offline_in_nmi; +static unsigned int loops_per_usec; +static cpumask_t cpu_offline_mask; + +static noinstr bool wait_for_cpus(atomic_t *cnt) { - unsigned int cpu; + unsigned int timeout, loops; - /* - * Make sure all CPUs are online. It's fine for SMT to be disabled if - * all the primary threads are still online. - */ - for_each_present_cpu(cpu) { - if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) { - pr_err("Not all CPUs online, aborting microcode update.\n"); - return -EINVAL; + WARN_ON_ONCE(raw_atomic_dec_return(cnt) < 0); + + for (timeout = 0; timeout < USEC_PER_SEC; timeout++) { + if (!raw_atomic_read(cnt)) + return true; + + for (loops = 0; loops < loops_per_usec; loops++) + cpu_relax(); + + /* If invoked directly, tickle the NMI watchdog */ + if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) { + instrumentation_begin(); + touch_nmi_watchdog(); + instrumentation_end(); } } - - return 0; + /* Prevent the late comers from making progress and let them time out */ + raw_atomic_inc(cnt); + return false; } -static atomic_t late_cpus_in; -static atomic_t late_cpus_out; - -static int __wait_for_cpus(atomic_t *t, long long timeout) +static noinstr bool wait_for_ctrl(void) { - int all_cpus = num_online_cpus(); + unsigned int timeout, loops; - atomic_inc(t); - - while (atomic_read(t) < all_cpus) { - if (timeout < SPINUNIT) { - pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n", - all_cpus - atomic_read(t)); - return 1; - } + for (timeout = 0; timeout < USEC_PER_SEC; timeout++) { + if (raw_cpu_read(ucode_ctrl.ctrl) != SCTRL_WAIT) + return true; - ndelay(SPINUNIT); - timeout -= SPINUNIT; + for (loops = 0; loops < loops_per_usec; loops++) + cpu_relax(); - touch_nmi_watchdog(); + /* If invoked directly, tickle the NMI watchdog */ + if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) { + instrumentation_begin(); + touch_nmi_watchdog(); + instrumentation_end(); + } } - return 0; + return false; } /* - * Returns: - * < 0 - on error - * 0 - success (no update done or microcode was updated) + * Protected against instrumentation up to the point where the primary + * thread completed the update. See microcode_nmi_handler() for details. */ -static int __reload_late(void *info) +static noinstr bool load_secondary_wait(unsigned int ctrl_cpu) { - int cpu = smp_processor_id(); - enum ucode_state err; - int ret = 0; + /* Initial rendezvous to ensure that all CPUs have arrived */ + if (!wait_for_cpus(&late_cpus_in)) { + raw_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT); + return false; + } /* - * Wait for all CPUs to arrive. A load will not be attempted unless all - * CPUs show up. - * */ - if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC)) - return -1; + * Wait for primary threads to complete. If one of them hangs due + * to the update, there is no way out. This is non-recoverable + * because the CPU might hold locks or resources and confuse the + * scheduler, watchdogs etc. There is no way to safely evacuate the + * machine. + */ + if (wait_for_ctrl()) + return true; + + instrumentation_begin(); + panic("Microcode load: Primary CPU %d timed out\n", ctrl_cpu); + instrumentation_end(); +} + +/* + * Protected against instrumentation up to the point where the primary + * thread completed the update. See microcode_nmi_handler() for details. + */ +static noinstr void load_secondary(unsigned int cpu) +{ + unsigned int ctrl_cpu = raw_cpu_read(ucode_ctrl.ctrl_cpu); + enum ucode_state ret; + + if (!load_secondary_wait(ctrl_cpu)) { + instrumentation_begin(); + pr_err_once("load: %d CPUs timed out\n", + atomic_read(&late_cpus_in) - 1); + instrumentation_end(); + return; + } + /* Primary thread completed. Allow to invoke instrumentable code */ + instrumentation_begin(); /* - * On an SMT system, it suffices to load the microcode on one sibling of - * the core because the microcode engine is shared between the threads. - * Synchronization still needs to take place so that no concurrent - * loading attempts happen on multiple threads of an SMT core. See - * below. + * If the primary succeeded then invoke the apply() callback, + * otherwise copy the state from the primary thread. */ - if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu) - err = microcode_ops->apply_microcode(cpu); + if (this_cpu_read(ucode_ctrl.ctrl) == SCTRL_APPLY) + ret = microcode_ops->apply_microcode(cpu); else - goto wait_for_siblings; + ret = per_cpu(ucode_ctrl.result, ctrl_cpu); - if (err >= UCODE_NFOUND) { - if (err == UCODE_ERROR) { - pr_warn("Error reloading microcode on CPU %d\n", cpu); - ret = -1; - } + this_cpu_write(ucode_ctrl.result, ret); + this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE); + instrumentation_end(); +} + +static void __load_primary(unsigned int cpu) +{ + struct cpumask *secondaries = topology_sibling_cpumask(cpu); + enum sibling_ctrl ctrl; + enum ucode_state ret; + unsigned int sibling; + + /* Initial rendezvous to ensure that all CPUs have arrived */ + if (!wait_for_cpus(&late_cpus_in)) { + this_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT); + pr_err_once("load: %d CPUs timed out\n", atomic_read(&late_cpus_in) - 1); + return; } -wait_for_siblings: - if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC)) - panic("Timeout during microcode update!\n"); + ret = microcode_ops->apply_microcode(cpu); + this_cpu_write(ucode_ctrl.result, ret); + this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE); /* - * At least one thread has completed update on each core. - * For others, simply call the update to make sure the - * per-cpu cpuinfo can be updated with right microcode - * revision. + * If the update was successful, let the siblings run the apply() + * callback. If not, tell them it's done. This also covers the + * case where the CPU has uniform loading at package or system + * scope implemented but does not advertise it. */ - if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu) - err = microcode_ops->apply_microcode(cpu); + if (ret == UCODE_UPDATED || ret == UCODE_OK) + ctrl = SCTRL_APPLY; + else + ctrl = SCTRL_DONE; + + for_each_cpu(sibling, secondaries) { + if (sibling != cpu) + per_cpu(ucode_ctrl.ctrl, sibling) = ctrl; + } +} + +static bool kick_offline_cpus(unsigned int nr_offl) +{ + unsigned int cpu, timeout; + + for_each_cpu(cpu, &cpu_offline_mask) { + /* Enable the rendezvous handler and send NMI */ + per_cpu(ucode_ctrl.nmi_enabled, cpu) = true; + apic_send_nmi_to_offline_cpu(cpu); + } + + /* Wait for them to arrive */ + for (timeout = 0; timeout < (USEC_PER_SEC / 2); timeout++) { + if (atomic_read(&offline_in_nmi) == nr_offl) + return true; + udelay(1); + } + /* Let the others time out */ + return false; +} + +static void release_offline_cpus(void) +{ + unsigned int cpu; + + for_each_cpu(cpu, &cpu_offline_mask) + per_cpu(ucode_ctrl.ctrl, cpu) = SCTRL_DONE; +} + +static void load_primary(unsigned int cpu) +{ + unsigned int nr_offl = cpumask_weight(&cpu_offline_mask); + bool proceed = true; + + /* Kick soft-offlined SMT siblings if required */ + if (!cpu && nr_offl) + proceed = kick_offline_cpus(nr_offl); - return ret; + /* If the soft-offlined CPUs did not respond, abort */ + if (proceed) + __load_primary(cpu); + + /* Unconditionally release soft-offlined SMT siblings if required */ + if (!cpu && nr_offl) + release_offline_cpus(); } /* - * Reload microcode late on all CPUs. Wait for a sec until they - * all gather together. + * Minimal stub rendezvous handler for soft-offlined CPUs which participate + * in the NMI rendezvous to protect against a concurrent NMI on affected + * CPUs. */ -static int microcode_reload_late(void) +void noinstr microcode_offline_nmi_handler(void) { - int old = boot_cpu_data.microcode, ret; + if (!raw_cpu_read(ucode_ctrl.nmi_enabled)) + return; + raw_cpu_write(ucode_ctrl.nmi_enabled, false); + raw_cpu_write(ucode_ctrl.result, UCODE_OFFLINE); + raw_atomic_inc(&offline_in_nmi); + wait_for_ctrl(); +} + +static noinstr bool microcode_update_handler(void) +{ + unsigned int cpu = raw_smp_processor_id(); + + if (raw_cpu_read(ucode_ctrl.ctrl_cpu) == cpu) { + instrumentation_begin(); + load_primary(cpu); + instrumentation_end(); + } else { + load_secondary(cpu); + } + + instrumentation_begin(); + touch_nmi_watchdog(); + instrumentation_end(); + + return true; +} + +/* + * Protection against instrumentation is required for CPUs which are not + * safe against an NMI which is delivered to the secondary SMT sibling + * while the primary thread updates the microcode. Instrumentation can end + * up in #INT3, #DB and #PF. The IRET from those exceptions reenables NMI + * which is the opposite of what the NMI rendezvous is trying to achieve. + * + * The primary thread is safe versus instrumentation as the actual + * microcode update handles this correctly. It's only the sibling code + * path which must be NMI safe until the primary thread completed the + * update. + */ +bool noinstr microcode_nmi_handler(void) +{ + if (!raw_cpu_read(ucode_ctrl.nmi_enabled)) + return false; + + raw_cpu_write(ucode_ctrl.nmi_enabled, false); + return microcode_update_handler(); +} + +static int load_cpus_stopped(void *unused) +{ + if (microcode_ops->use_nmi) { + /* Enable the NMI handler and raise NMI */ + this_cpu_write(ucode_ctrl.nmi_enabled, true); + apic->send_IPI(smp_processor_id(), NMI_VECTOR); + } else { + /* Just invoke the handler directly */ + microcode_update_handler(); + } + return 0; +} + +static int load_late_stop_cpus(bool is_safe) +{ + unsigned int cpu, updated = 0, failed = 0, timedout = 0, siblings = 0; + unsigned int nr_offl, offline = 0; + int old_rev = boot_cpu_data.microcode; struct cpuinfo_x86 prev_info; - pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n"); - pr_err("You should switch to early loading, if possible.\n"); + if (!is_safe) { + pr_err("Late microcode loading without minimal revision check.\n"); + pr_err("You should switch to early loading, if possible.\n"); + } - atomic_set(&late_cpus_in, 0); - atomic_set(&late_cpus_out, 0); + atomic_set(&late_cpus_in, num_online_cpus()); + atomic_set(&offline_in_nmi, 0); + loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000); /* * Take a snapshot before the microcode update in order to compare and @@ -454,52 +556,162 @@ static int microcode_reload_late(void) */ store_cpu_caps(&prev_info); - ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); - if (!ret) { - pr_info("Reload succeeded, microcode revision: 0x%x -> 0x%x\n", - old, boot_cpu_data.microcode); - microcode_check(&prev_info); - } else { - pr_info("Reload failed, current microcode revision: 0x%x\n", - boot_cpu_data.microcode); + if (microcode_ops->use_nmi) + static_branch_enable_cpuslocked(µcode_nmi_handler_enable); + + stop_machine_cpuslocked(load_cpus_stopped, NULL, cpu_online_mask); + + if (microcode_ops->use_nmi) + static_branch_disable_cpuslocked(µcode_nmi_handler_enable); + + /* Analyze the results */ + for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) { + switch (per_cpu(ucode_ctrl.result, cpu)) { + case UCODE_UPDATED: updated++; break; + case UCODE_TIMEOUT: timedout++; break; + case UCODE_OK: siblings++; break; + case UCODE_OFFLINE: offline++; break; + default: failed++; break; + } + } + + if (microcode_ops->finalize_late_load) + microcode_ops->finalize_late_load(!updated); + + if (!updated) { + /* Nothing changed. */ + if (!failed && !timedout) + return 0; + + nr_offl = cpumask_weight(&cpu_offline_mask); + if (offline < nr_offl) { + pr_warn("%u offline siblings did not respond.\n", + nr_offl - atomic_read(&offline_in_nmi)); + return -EIO; + } + pr_err("update failed: %u CPUs failed %u CPUs timed out\n", + failed, timedout); + return -EIO; + } + + if (!is_safe || failed || timedout) + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + + pr_info("load: updated on %u primary CPUs with %u siblings\n", updated, siblings); + if (failed || timedout) { + pr_err("load incomplete. %u CPUs timed out or failed\n", + num_online_cpus() - (updated + siblings)); + } + pr_info("revision: 0x%x -> 0x%x\n", old_rev, boot_cpu_data.microcode); + microcode_check(&prev_info); + + return updated + siblings == num_online_cpus() ? 0 : -EIO; +} + +/* + * This function does two things: + * + * 1) Ensure that all required CPUs which are present and have been booted + * once are online. + * + * To pass this check, all primary threads must be online. + * + * If the microcode load is not safe against NMI then all SMT threads + * must be online as well because they still react to NMIs when they are + * soft-offlined and parked in one of the play_dead() variants. So if a + * NMI hits while the primary thread updates the microcode the resulting + * behaviour is undefined. The default play_dead() implementation on + * modern CPUs uses MWAIT, which is also not guaranteed to be safe + * against a microcode update which affects MWAIT. + * + * As soft-offlined CPUs still react on NMIs, the SMT sibling + * restriction can be lifted when the vendor driver signals to use NMI + * for rendezvous and the APIC provides a mechanism to send an NMI to a + * soft-offlined CPU. The soft-offlined CPUs are then able to + * participate in the rendezvous in a trivial stub handler. + * + * 2) Initialize the per CPU control structure and create a cpumask + * which contains "offline"; secondary threads, so they can be handled + * correctly by a control CPU. + */ +static bool setup_cpus(void) +{ + struct microcode_ctrl ctrl = { .ctrl = SCTRL_WAIT, .result = -1, }; + bool allow_smt_offline; + unsigned int cpu; + + allow_smt_offline = microcode_ops->nmi_safe || + (microcode_ops->use_nmi && apic->nmi_to_offline_cpu); + + cpumask_clear(&cpu_offline_mask); + + for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) { + /* + * Offline CPUs sit in one of the play_dead() functions + * with interrupts disabled, but they still react on NMIs + * and execute arbitrary code. Also MWAIT being updated + * while the offline CPU sits there is not necessarily safe + * on all CPU variants. + * + * Mark them in the offline_cpus mask which will be handled + * by CPU0 later in the update process. + * + * Ensure that the primary thread is online so that it is + * guaranteed that all cores are updated. + */ + if (!cpu_online(cpu)) { + if (topology_is_primary_thread(cpu) || !allow_smt_offline) { + pr_err("CPU %u not online, loading aborted\n", cpu); + return false; + } + cpumask_set_cpu(cpu, &cpu_offline_mask); + per_cpu(ucode_ctrl, cpu) = ctrl; + continue; + } + + /* + * Initialize the per CPU state. This is core scope for now, + * but prepared to take package or system scope into account. + */ + ctrl.ctrl_cpu = cpumask_first(topology_sibling_cpumask(cpu)); + per_cpu(ucode_ctrl, cpu) = ctrl; } + return true; +} - return ret; +static int load_late_locked(void) +{ + if (!setup_cpus()) + return -EBUSY; + + switch (microcode_ops->request_microcode_fw(0, µcode_pdev->dev)) { + case UCODE_NEW: + return load_late_stop_cpus(false); + case UCODE_NEW_SAFE: + return load_late_stop_cpus(true); + case UCODE_NFOUND: + return -ENOENT; + default: + return -EBADFD; + } } static ssize_t reload_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { - enum ucode_state tmp_ret = UCODE_OK; - int bsp = boot_cpu_data.cpu_index; unsigned long val; - ssize_t ret = 0; + ssize_t ret; ret = kstrtoul(buf, 0, &val); if (ret || val != 1) return -EINVAL; cpus_read_lock(); - - ret = check_online_cpus(); - if (ret) - goto put; - - tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev); - if (tmp_ret != UCODE_NEW) - goto put; - - ret = microcode_reload_late(); -put: + ret = load_late_locked(); cpus_read_unlock(); - if (ret == 0) - ret = size; - - add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); - - return ret; + return ret ? : size; } static DEVICE_ATTR_WO(reload); @@ -541,17 +753,6 @@ static void microcode_fini_cpu(int cpu) microcode_ops->microcode_fini_cpu(cpu); } -static enum ucode_state microcode_init_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - memset(uci, 0, sizeof(*uci)); - - microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig); - - return microcode_ops->apply_microcode(cpu); -} - /** * microcode_bsp_resume - Update boot CPU microcode during resume. */ @@ -570,19 +771,18 @@ static struct syscore_ops mc_syscore_ops = { .resume = microcode_bsp_resume, }; -static int mc_cpu_starting(unsigned int cpu) -{ - enum ucode_state err = microcode_ops->apply_microcode(cpu); - - pr_debug("%s: CPU%d, err: %d\n", __func__, cpu, err); - - return err == UCODE_ERROR; -} - static int mc_cpu_online(unsigned int cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct device *dev = get_cpu_device(cpu); + memset(uci, 0, sizeof(*uci)); + + microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig); + cpu_data(cpu).microcode = uci->cpu_sig.rev; + if (!cpu) + boot_cpu_data.microcode = uci->cpu_sig.rev; + if (sysfs_create_group(&dev->kobj, &mc_attr_group)) pr_err("Failed to create group for CPU%d\n", cpu); return 0; @@ -590,33 +790,13 @@ static int mc_cpu_online(unsigned int cpu) static int mc_cpu_down_prep(unsigned int cpu) { - struct device *dev; - - dev = get_cpu_device(cpu); + struct device *dev = get_cpu_device(cpu); microcode_fini_cpu(cpu); - - /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&dev->kobj, &mc_attr_group); - pr_debug("%s: CPU%d\n", __func__, cpu); - return 0; } -static void setup_online_cpu(struct work_struct *work) -{ - int cpu = smp_processor_id(); - enum ucode_state err; - - err = microcode_init_cpu(cpu); - if (err == UCODE_ERROR) { - pr_err("Error applying microcode on CPU%d\n", cpu); - return; - } - - mc_cpu_online(cpu); -} - static struct attribute *cpu_root_microcode_attrs[] = { #ifdef CONFIG_MICROCODE_LATE_LOADING &dev_attr_reload.attr, @@ -648,6 +828,11 @@ static int __init microcode_init(void) if (!microcode_ops) return -ENODEV; + pr_info_once("Current revision: 0x%08x\n", (early_data.new_rev ?: early_data.old_rev)); + + if (early_data.new_rev) + pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev); + microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); if (IS_ERR(microcode_pdev)) return PTR_ERR(microcode_pdev); @@ -662,16 +847,9 @@ static int __init microcode_init(void) } } - /* Do per-CPU setup */ - schedule_on_each_cpu(setup_online_cpu); - register_syscore_ops(&mc_syscore_ops); - cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting", - mc_cpu_starting, NULL); - cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", - mc_cpu_online, mc_cpu_down_prep); - - pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION); + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", + mc_cpu_online, mc_cpu_down_prep); return 0; @@ -680,5 +858,4 @@ static int __init microcode_init(void) return error; } -fs_initcall(save_microcode_in_initrd); late_initcall(microcode_init); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 94dd6af9c9..334972c097 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -14,7 +14,6 @@ #include <linux/earlycpio.h> #include <linux/firmware.h> #include <linux/uaccess.h> -#include <linux/vmalloc.h> #include <linux/initrd.h> #include <linux/kernel.h> #include <linux/slab.h> @@ -32,11 +31,14 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; +#define UCODE_BSP_LOADED ((struct microcode_intel *)0x1UL) + /* Current microcode patch used in early patching on the APs. */ -static struct microcode_intel *intel_ucode_patch; +static struct microcode_intel *ucode_patch_va __read_mostly; +static struct microcode_intel *ucode_patch_late __read_mostly; /* last level cache size per core */ -static int llc_size_per_core; +static unsigned int llc_size_per_core __ro_after_init; /* microcode format is extended from prescott processors */ struct extended_signature { @@ -66,60 +68,52 @@ static inline unsigned int exttable_size(struct extended_sigtable *et) return et->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE; } -int intel_cpu_collect_info(struct ucode_cpu_info *uci) +void intel_collect_cpu_info(struct cpu_signature *sig) { - unsigned int val[2]; - unsigned int family, model; - struct cpu_signature csig = { 0 }; - unsigned int eax, ebx, ecx, edx; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; + sig->sig = cpuid_eax(1); + sig->pf = 0; + sig->rev = intel_get_microcode_revision(); - family = x86_family(eax); - model = x86_model(eax); + if (x86_model(sig->sig) >= 5 || x86_family(sig->sig) > 6) { + unsigned int val[2]; - if (model >= 5 || family > 6) { /* get processor flags from MSR 0x17 */ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); + sig->pf = 1 << ((val[1] >> 18) & 7); } +} +EXPORT_SYMBOL_GPL(intel_collect_cpu_info); - csig.rev = intel_get_microcode_revision(); - - uci->cpu_sig = csig; +static inline bool cpu_signatures_match(struct cpu_signature *s1, unsigned int sig2, + unsigned int pf2) +{ + if (s1->sig != sig2) + return false; - return 0; + /* Processor flags are either both 0 or they intersect. */ + return ((!s1->pf && !pf2) || (s1->pf & pf2)); } -EXPORT_SYMBOL_GPL(intel_cpu_collect_info); -/* - * Returns 1 if update has been found, 0 otherwise. - */ -int intel_find_matching_signature(void *mc, unsigned int csig, int cpf) +bool intel_find_matching_signature(void *mc, struct cpu_signature *sig) { struct microcode_header_intel *mc_hdr = mc; - struct extended_sigtable *ext_hdr; struct extended_signature *ext_sig; + struct extended_sigtable *ext_hdr; int i; - if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) - return 1; + if (cpu_signatures_match(sig, mc_hdr->sig, mc_hdr->pf)) + return true; /* Look for ext. headers: */ if (get_totalsize(mc_hdr) <= intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE) - return 0; + return false; ext_hdr = mc + intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE; ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; for (i = 0; i < ext_hdr->count; i++) { - if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) - return 1; + if (cpu_signatures_match(sig, ext_sig->sig, ext_sig->pf)) + return true; ext_sig++; } return 0; @@ -240,264 +234,91 @@ int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type) } EXPORT_SYMBOL_GPL(intel_microcode_sanity_check); -/* - * Returns 1 if update has been found, 0 otherwise. - */ -static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) -{ - struct microcode_header_intel *mc_hdr = mc; - - if (mc_hdr->rev <= new_rev) - return 0; - - return intel_find_matching_signature(mc, csig, cpf); -} - -static struct ucode_patch *memdup_patch(void *data, unsigned int size) -{ - struct ucode_patch *p; - - p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL); - if (!p) - return NULL; - - p->data = kmemdup(data, size, GFP_KERNEL); - if (!p->data) { - kfree(p); - return NULL; - } - - return p; -} - -static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size) +static void update_ucode_pointer(struct microcode_intel *mc) { - struct microcode_header_intel *mc_hdr, *mc_saved_hdr; - struct ucode_patch *iter, *tmp, *p = NULL; - bool prev_found = false; - unsigned int sig, pf; - - mc_hdr = (struct microcode_header_intel *)data; - - list_for_each_entry_safe(iter, tmp, µcode_cache, plist) { - mc_saved_hdr = (struct microcode_header_intel *)iter->data; - sig = mc_saved_hdr->sig; - pf = mc_saved_hdr->pf; - - if (intel_find_matching_signature(data, sig, pf)) { - prev_found = true; - - if (mc_hdr->rev <= mc_saved_hdr->rev) - continue; - - p = memdup_patch(data, size); - if (!p) - pr_err("Error allocating buffer %p\n", data); - else { - list_replace(&iter->plist, &p->plist); - kfree(iter->data); - kfree(iter); - } - } - } + kvfree(ucode_patch_va); /* - * There weren't any previous patches found in the list cache; save the - * newly found. + * Save the virtual address for early loading and for eventual free + * on late loading. */ - if (!prev_found) { - p = memdup_patch(data, size); - if (!p) - pr_err("Error allocating buffer for %p\n", data); - else - list_add_tail(&p->plist, µcode_cache); - } - - if (!p) - return; + ucode_patch_va = mc; +} - if (!intel_find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf)) - return; +static void save_microcode_patch(struct microcode_intel *patch) +{ + unsigned int size = get_totalsize(&patch->hdr); + struct microcode_intel *mc; - /* - * Save for early loading. On 32-bit, that needs to be a physical - * address as the APs are running from physical addresses, before - * paging has been enabled. - */ - if (IS_ENABLED(CONFIG_X86_32)) - intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); + mc = kvmemdup(patch, size, GFP_KERNEL); + if (mc) + update_ucode_pointer(mc); else - intel_ucode_patch = p->data; + pr_err("Unable to allocate microcode memory size: %u\n", size); } -/* - * Get microcode matching with BSP's model. Only CPUs with the same model as - * BSP can stay in the platform. - */ -static struct microcode_intel * -scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save) +/* Scan blob for microcode matching the boot CPUs family, model, stepping */ +static __init struct microcode_intel *scan_microcode(void *data, size_t size, + struct ucode_cpu_info *uci, + bool save) { struct microcode_header_intel *mc_header; struct microcode_intel *patch = NULL; + u32 cur_rev = uci->cpu_sig.rev; unsigned int mc_size; - while (size) { - if (size < sizeof(struct microcode_header_intel)) - break; - + for (; size >= sizeof(struct microcode_header_intel); size -= mc_size, data += mc_size) { mc_header = (struct microcode_header_intel *)data; mc_size = get_totalsize(mc_header); - if (!mc_size || - mc_size > size || + if (!mc_size || mc_size > size || intel_microcode_sanity_check(data, false, MC_HEADER_TYPE_MICROCODE) < 0) break; - size -= mc_size; - - if (!intel_find_matching_signature(data, uci->cpu_sig.sig, - uci->cpu_sig.pf)) { - data += mc_size; + if (!intel_find_matching_signature(data, &uci->cpu_sig)) continue; - } + /* + * For saving the early microcode, find the matching revision which + * was loaded on the BSP. + * + * On the BSP during early boot, find a newer revision than + * actually loaded in the CPU. + */ if (save) { - save_microcode_patch(uci, data, mc_size); - goto next; - } - - - if (!patch) { - if (!has_newer_microcode(data, - uci->cpu_sig.sig, - uci->cpu_sig.pf, - uci->cpu_sig.rev)) - goto next; - - } else { - struct microcode_header_intel *phdr = &patch->hdr; - - if (!has_newer_microcode(data, - phdr->sig, - phdr->pf, - phdr->rev)) - goto next; + if (cur_rev != mc_header->rev) + continue; + } else if (cur_rev >= mc_header->rev) { + continue; } - /* We have a newer patch, save it. */ patch = data; - -next: - data += mc_size; - } - - if (size) - return NULL; - - return patch; -} - -static bool load_builtin_intel_microcode(struct cpio_data *cp) -{ - unsigned int eax = 1, ebx, ecx = 0, edx; - struct firmware fw; - char name[30]; - - if (IS_ENABLED(CONFIG_X86_32)) - return false; - - native_cpuid(&eax, &ebx, &ecx, &edx); - - sprintf(name, "intel-ucode/%02x-%02x-%02x", - x86_family(eax), x86_model(eax), x86_stepping(eax)); - - if (firmware_request_builtin(&fw, name)) { - cp->size = fw.size; - cp->data = (void *)fw.data; - return true; + cur_rev = mc_header->rev; } - return false; + return size ? NULL : patch; } -static void print_ucode_info(int old_rev, int new_rev, unsigned int date) +static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, + struct microcode_intel *mc, + u32 *cur_rev) { - pr_info_once("updated early: 0x%x -> 0x%x, date = %04x-%02x-%02x\n", - old_rev, - new_rev, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); -} - -#ifdef CONFIG_X86_32 - -static int delay_ucode_info; -static int current_mc_date; -static int early_old_rev; - -/* - * Print early updated ucode info after printk works. This is delayed info dump. - */ -void show_ucode_info_early(void) -{ - struct ucode_cpu_info uci; - - if (delay_ucode_info) { - intel_cpu_collect_info(&uci); - print_ucode_info(early_old_rev, uci.cpu_sig.rev, current_mc_date); - delay_ucode_info = 0; - } -} - -/* - * At this point, we can not call printk() yet. Delay printing microcode info in - * show_ucode_info_early() until printk() works. - */ -static void print_ucode(int old_rev, int new_rev, int date) -{ - int *delay_ucode_info_p; - int *current_mc_date_p; - int *early_old_rev_p; - - delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); - current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); - early_old_rev_p = (int *)__pa_nodebug(&early_old_rev); - - *delay_ucode_info_p = 1; - *current_mc_date_p = date; - *early_old_rev_p = old_rev; -} -#else - -static inline void print_ucode(int old_rev, int new_rev, int date) -{ - print_ucode_info(old_rev, new_rev, date); -} -#endif - -static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) -{ - struct microcode_intel *mc; - u32 rev, old_rev; + u32 rev; - mc = uci->mc; if (!mc) - return 0; + return UCODE_NFOUND; /* * Save us the MSR write below - which is a particular expensive * operation - when the other hyperthread has updated the microcode * already. */ - rev = intel_get_microcode_revision(); - if (rev >= mc->hdr.rev) { - uci->cpu_sig.rev = rev; + *cur_rev = intel_get_microcode_revision(); + if (*cur_rev >= mc->hdr.rev) { + uci->cpu_sig.rev = *cur_rev; return UCODE_OK; } - old_rev = rev; - /* * Writeback and invalidate caches before updating microcode to avoid * internal issues depending on what the microcode is updating. @@ -509,247 +330,179 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) rev = intel_get_microcode_revision(); if (rev != mc->hdr.rev) - return -1; + return UCODE_ERROR; uci->cpu_sig.rev = rev; + return UCODE_UPDATED; +} - if (early) - print_ucode(old_rev, uci->cpu_sig.rev, mc->hdr.date); - else - print_ucode_info(old_rev, uci->cpu_sig.rev, mc->hdr.date); +static enum ucode_state apply_microcode_early(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc = uci->mc; + u32 cur_rev; - return 0; + return __apply_microcode(uci, mc, &cur_rev); } -int __init save_microcode_in_initrd_intel(void) +static __init bool load_builtin_intel_microcode(struct cpio_data *cp) { - struct ucode_cpu_info uci; - struct cpio_data cp; - - /* - * initrd is going away, clear patch ptr. We will scan the microcode one - * last time before jettisoning and save a patch, if found. Then we will - * update that pointer too, with a stable patch address to use when - * resuming the cores. - */ - intel_ucode_patch = NULL; + unsigned int eax = 1, ebx, ecx = 0, edx; + struct firmware fw; + char name[30]; - if (!load_builtin_intel_microcode(&cp)) - cp = find_microcode_in_initrd(ucode_path, false); + if (IS_ENABLED(CONFIG_X86_32)) + return false; - if (!(cp.data && cp.size)) - return 0; + native_cpuid(&eax, &ebx, &ecx, &edx); - intel_cpu_collect_info(&uci); + sprintf(name, "intel-ucode/%02x-%02x-%02x", + x86_family(eax), x86_model(eax), x86_stepping(eax)); - scan_microcode(cp.data, cp.size, &uci, true); - return 0; + if (firmware_request_builtin(&fw, name)) { + cp->size = fw.size; + cp->data = (void *)fw.data; + return true; + } + return false; } -/* - * @res_patch, output: a pointer to the patch we found. - */ -static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) +static __init struct microcode_intel *get_microcode_blob(struct ucode_cpu_info *uci, bool save) { - static const char *path; struct cpio_data cp; - bool use_pa; - - if (IS_ENABLED(CONFIG_X86_32)) { - path = (const char *)__pa_nodebug(ucode_path); - use_pa = true; - } else { - path = ucode_path; - use_pa = false; - } - /* try built-in microcode first */ + intel_collect_cpu_info(&uci->cpu_sig); + if (!load_builtin_intel_microcode(&cp)) - cp = find_microcode_in_initrd(path, use_pa); + cp = find_microcode_in_initrd(ucode_path); if (!(cp.data && cp.size)) return NULL; - intel_cpu_collect_info(uci); - - return scan_microcode(cp.data, cp.size, uci, false); + return scan_microcode(cp.data, cp.size, uci, save); } -void __init load_ucode_intel_bsp(void) +/* + * Invoked from an early init call to save the microcode blob which was + * selected during early boot when mm was not usable. The microcode must be + * saved because initrd is going away. It's an early init call so the APs + * just can use the pointer and do not have to scan initrd/builtin firmware + * again. + */ +static int __init save_builtin_microcode(void) { - struct microcode_intel *patch; struct ucode_cpu_info uci; - patch = __load_ucode_intel(&uci); - if (!patch) - return; + if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED) + return 0; - uci.mc = patch; + if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; - apply_microcode_early(&uci, true); + uci.mc = get_microcode_blob(&uci, true); + if (uci.mc) + save_microcode_patch(uci.mc); + return 0; } +early_initcall(save_builtin_microcode); -void load_ucode_intel_ap(void) +/* Load microcode on BSP from initrd or builtin blobs */ +void __init load_ucode_intel_bsp(struct early_load_data *ed) { - struct microcode_intel *patch, **iup; struct ucode_cpu_info uci; - if (IS_ENABLED(CONFIG_X86_32)) - iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch); - else - iup = &intel_ucode_patch; - - if (!*iup) { - patch = __load_ucode_intel(&uci); - if (!patch) - return; + uci.mc = get_microcode_blob(&uci, false); + ed->old_rev = uci.cpu_sig.rev; - *iup = patch; + if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) { + ucode_patch_va = UCODE_BSP_LOADED; + ed->new_rev = uci.cpu_sig.rev; } - - uci.mc = *iup; - - apply_microcode_early(&uci, true); } -static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) +void load_ucode_intel_ap(void) { - struct microcode_header_intel *phdr; - struct ucode_patch *iter, *tmp; - - list_for_each_entry_safe(iter, tmp, µcode_cache, plist) { - - phdr = (struct microcode_header_intel *)iter->data; - - if (phdr->rev <= uci->cpu_sig.rev) - continue; - - if (!intel_find_matching_signature(phdr, - uci->cpu_sig.sig, - uci->cpu_sig.pf)) - continue; + struct ucode_cpu_info uci; - return iter->data; - } - return NULL; + uci.mc = ucode_patch_va; + if (uci.mc) + apply_microcode_early(&uci); } +/* Reload microcode on resume */ void reload_ucode_intel(void) { - struct microcode_intel *p; - struct ucode_cpu_info uci; + struct ucode_cpu_info uci = { .mc = ucode_patch_va, }; - intel_cpu_collect_info(&uci); - - p = find_patch(&uci); - if (!p) - return; - - uci.mc = p; - - apply_microcode_early(&uci, false); + if (uci.mc) + apply_microcode_early(&uci); } static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { - struct cpuinfo_x86 *c = &cpu_data(cpu_num); - unsigned int val[2]; - - memset(csig, 0, sizeof(*csig)); - - csig->sig = cpuid_eax(0x00000001); - - if ((c->x86_model >= 5) || (c->x86 > 6)) { - /* get processor flags from MSR 0x17 */ - rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig->pf = 1 << ((val[1] >> 18) & 7); - } - - csig->rev = c->microcode; - + intel_collect_cpu_info(csig); return 0; } -static enum ucode_state apply_microcode_intel(int cpu) +static enum ucode_state apply_microcode_late(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct cpuinfo_x86 *c = &cpu_data(cpu); - bool bsp = c->cpu_index == boot_cpu_data.cpu_index; - struct microcode_intel *mc; + struct microcode_intel *mc = ucode_patch_late; enum ucode_state ret; - static int prev_rev; - u32 rev; + u32 cur_rev; - /* We should bind the task to the CPU */ - if (WARN_ON(raw_smp_processor_id() != cpu)) + if (WARN_ON_ONCE(smp_processor_id() != cpu)) return UCODE_ERROR; - /* Look for a newer patch in our cache: */ - mc = find_patch(uci); - if (!mc) { - mc = uci->mc; - if (!mc) - return UCODE_NFOUND; - } + ret = __apply_microcode(uci, mc, &cur_rev); + if (ret != UCODE_UPDATED && ret != UCODE_OK) + return ret; - /* - * Save us the MSR write below - which is a particular expensive - * operation - when the other hyperthread has updated the microcode - * already. - */ - rev = intel_get_microcode_revision(); - if (rev >= mc->hdr.rev) { - ret = UCODE_OK; - goto out; + if (!cpu && uci->cpu_sig.rev != cur_rev) { + pr_info("Updated to revision 0x%x, date = %04x-%02x-%02x\n", + uci->cpu_sig.rev, mc->hdr.date & 0xffff, mc->hdr.date >> 24, + (mc->hdr.date >> 16) & 0xff); } - /* - * Writeback and invalidate caches before updating microcode to avoid - * internal issues depending on what the microcode is updating. - */ - native_wbinvd(); + cpu_data(cpu).microcode = uci->cpu_sig.rev; + if (!cpu) + boot_cpu_data.microcode = uci->cpu_sig.rev; - /* write microcode via MSR 0x79 */ - wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + return ret; +} - rev = intel_get_microcode_revision(); +static bool ucode_validate_minrev(struct microcode_header_intel *mc_header) +{ + int cur_rev = boot_cpu_data.microcode; - if (rev != mc->hdr.rev) { - pr_err("CPU%d update to revision 0x%x failed\n", - cpu, mc->hdr.rev); - return UCODE_ERROR; + /* + * When late-loading, ensure the header declares a minimum revision + * required to perform a late-load. The previously reserved field + * is 0 in older microcode blobs. + */ + if (!mc_header->min_req_ver) { + pr_info("Unsafe microcode update: Microcode header does not specify a required min version\n"); + return false; } - if (bsp && rev != prev_rev) { - pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", - rev, - mc->hdr.date & 0xffff, - mc->hdr.date >> 24, - (mc->hdr.date >> 16) & 0xff); - prev_rev = rev; + /* + * Check whether the current revision is either greater or equal to + * to the minimum revision specified in the header. + */ + if (cur_rev < mc_header->min_req_ver) { + pr_info("Unsafe microcode update: Current revision 0x%x too old\n", cur_rev); + pr_info("Current should be at 0x%x or higher. Use early loading instead\n", mc_header->min_req_ver); + return false; } - - ret = UCODE_UPDATED; - -out: - uci->cpu_sig.rev = rev; - c->microcode = rev; - - /* Update boot_cpu_data's revision too, if we're on the BSP: */ - if (bsp) - boot_cpu_data.microcode = rev; - - return ret; + return true; } -static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) +static enum ucode_state parse_microcode_blobs(int cpu, struct iov_iter *iter) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - unsigned int curr_mc_size = 0, new_mc_size = 0; - enum ucode_state ret = UCODE_OK; - int new_rev = uci->cpu_sig.rev; + bool is_safe, new_is_safe = false; + int cur_rev = uci->cpu_sig.rev; + unsigned int curr_mc_size = 0; u8 *new_mc = NULL, *mc = NULL; - unsigned int csig, cpf; while (iov_iter_count(iter)) { struct microcode_header_intel mc_header; @@ -758,68 +511,66 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) { pr_err("error! Truncated or inaccessible header in microcode data file\n"); - break; + goto fail; } mc_size = get_totalsize(&mc_header); if (mc_size < sizeof(mc_header)) { pr_err("error! Bad data in microcode data file (totalsize too small)\n"); - break; + goto fail; } data_size = mc_size - sizeof(mc_header); if (data_size > iov_iter_count(iter)) { pr_err("error! Bad data in microcode data file (truncated file?)\n"); - break; + goto fail; } /* For performance reasons, reuse mc area when possible */ if (!mc || mc_size > curr_mc_size) { - vfree(mc); - mc = vmalloc(mc_size); + kvfree(mc); + mc = kvmalloc(mc_size, GFP_KERNEL); if (!mc) - break; + goto fail; curr_mc_size = mc_size; } memcpy(mc, &mc_header, sizeof(mc_header)); data = mc + sizeof(mc_header); if (!copy_from_iter_full(data, data_size, iter) || - intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0) { - break; - } + intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0) + goto fail; - csig = uci->cpu_sig.sig; - cpf = uci->cpu_sig.pf; - if (has_newer_microcode(mc, csig, cpf, new_rev)) { - vfree(new_mc); - new_rev = mc_header.rev; - new_mc = mc; - new_mc_size = mc_size; - mc = NULL; /* trigger new vmalloc */ - ret = UCODE_NEW; - } - } + if (cur_rev >= mc_header.rev) + continue; - vfree(mc); + if (!intel_find_matching_signature(mc, &uci->cpu_sig)) + continue; - if (iov_iter_count(iter)) { - vfree(new_mc); - return UCODE_ERROR; + is_safe = ucode_validate_minrev(&mc_header); + if (force_minrev && !is_safe) + continue; + + kvfree(new_mc); + cur_rev = mc_header.rev; + new_mc = mc; + new_is_safe = is_safe; + mc = NULL; } + if (iov_iter_count(iter)) + goto fail; + + kvfree(mc); if (!new_mc) return UCODE_NFOUND; - vfree(uci->mc); - uci->mc = (struct microcode_intel *)new_mc; - - /* Save for CPU hotplug */ - save_microcode_patch(uci, new_mc, new_mc_size); + ucode_patch_late = (struct microcode_intel *)new_mc; + return new_is_safe ? UCODE_NEW_SAFE : UCODE_NEW; - pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); - - return ret; +fail: + kvfree(mc); + kvfree(new_mc); + return UCODE_ERROR; } static bool is_blacklisted(unsigned int cpu) @@ -868,26 +619,36 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) kvec.iov_base = (void *)firmware->data; kvec.iov_len = firmware->size; iov_iter_kvec(&iter, ITER_SOURCE, &kvec, 1, firmware->size); - ret = generic_load_microcode(cpu, &iter); + ret = parse_microcode_blobs(cpu, &iter); release_firmware(firmware); return ret; } +static void finalize_late_load(int result) +{ + if (!result) + update_ucode_pointer(ucode_patch_late); + else + kvfree(ucode_patch_late); + ucode_patch_late = NULL; +} + static struct microcode_ops microcode_intel_ops = { - .request_microcode_fw = request_microcode_fw, - .collect_cpu_info = collect_cpu_info, - .apply_microcode = apply_microcode_intel, + .request_microcode_fw = request_microcode_fw, + .collect_cpu_info = collect_cpu_info, + .apply_microcode = apply_microcode_late, + .finalize_late_load = finalize_late_load, + .use_nmi = IS_ENABLED(CONFIG_X86_64), }; -static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) +static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c) { u64 llc_size = c->x86_cache_size * 1024ULL; do_div(llc_size, c->x86_max_cores); - - return (int)llc_size; + llc_size_per_core = (unsigned int)llc_size; } struct microcode_ops * __init init_intel_microcode(void) @@ -900,7 +661,7 @@ struct microcode_ops * __init init_intel_microcode(void) return NULL; } - llc_size_per_core = calc_llc_size_per_core(c); + calc_llc_size_per_core(c); return µcode_intel_ops; } diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index bf883aa712..21776c529f 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -8,43 +8,43 @@ #include <asm/cpu.h> #include <asm/microcode.h> -struct ucode_patch { - struct list_head plist; - void *data; /* Intel uses only this one */ - unsigned int size; - u32 patch_id; - u16 equiv_cpu; -}; - -extern struct list_head microcode_cache; - struct device; enum ucode_state { UCODE_OK = 0, UCODE_NEW, + UCODE_NEW_SAFE, UCODE_UPDATED, UCODE_NFOUND, UCODE_ERROR, + UCODE_TIMEOUT, + UCODE_OFFLINE, }; struct microcode_ops { enum ucode_state (*request_microcode_fw)(int cpu, struct device *dev); - void (*microcode_fini_cpu)(int cpu); /* - * The generic 'microcode_core' part guarantees that - * the callbacks below run on a target cpu when they - * are being called. + * The generic 'microcode_core' part guarantees that the callbacks + * below run on a target CPU when they are being called. * See also the "Synchronization" section in microcode_core.c. */ - enum ucode_state (*apply_microcode)(int cpu); - int (*collect_cpu_info)(int cpu, struct cpu_signature *csig); + enum ucode_state (*apply_microcode)(int cpu); + int (*collect_cpu_info)(int cpu, struct cpu_signature *csig); + void (*finalize_late_load)(int result); + unsigned int nmi_safe : 1, + use_nmi : 1; +}; + +struct early_load_data { + u32 old_rev; + u32 new_rev; }; +extern struct early_load_data early_data; extern struct ucode_cpu_info ucode_cpu_info[]; -struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa); +struct cpio_data find_microcode_in_initrd(const char *path); #define MAX_UCODE_COUNT 128 @@ -94,20 +94,19 @@ static inline unsigned int x86_cpuid_family(void) return x86_family(eax); } -extern bool initrd_gone; +extern bool dis_ucode_ldr; +extern bool force_minrev; #ifdef CONFIG_CPU_SUP_AMD -void load_ucode_amd_bsp(unsigned int family); +void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family); void load_ucode_amd_ap(unsigned int family); -void load_ucode_amd_early(unsigned int cpuid_1_eax); int save_microcode_in_initrd_amd(unsigned int family); void reload_ucode_amd(unsigned int cpu); struct microcode_ops *init_amd_microcode(void); void exit_amd_microcode(void); #else /* CONFIG_CPU_SUP_AMD */ -static inline void load_ucode_amd_bsp(unsigned int family) { } +static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { } static inline void load_ucode_amd_ap(unsigned int family) { } -static inline void load_ucode_amd_early(unsigned int family) { } static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } static inline void reload_ucode_amd(unsigned int cpu) { } static inline struct microcode_ops *init_amd_microcode(void) { return NULL; } @@ -115,15 +114,13 @@ static inline void exit_amd_microcode(void) { } #endif /* !CONFIG_CPU_SUP_AMD */ #ifdef CONFIG_CPU_SUP_INTEL -void load_ucode_intel_bsp(void); +void load_ucode_intel_bsp(struct early_load_data *ed); void load_ucode_intel_ap(void); -int save_microcode_in_initrd_intel(void); void reload_ucode_intel(void); struct microcode_ops *init_intel_microcode(void); #else /* CONFIG_CPU_SUP_INTEL */ -static inline void load_ucode_intel_bsp(void) { } +static inline void load_ucode_intel_bsp(struct early_load_data *ed) { } static inline void load_ucode_intel_ap(void) { } -static inline int save_microcode_in_initrd_intel(void) { return -EINVAL; } static inline void reload_ucode_intel(void) { } static inline struct microcode_ops *init_intel_microcode(void) { return NULL; } #endif /* !CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e6bba12c75..01fa06dd06 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -262,11 +262,14 @@ static uint32_t __init ms_hyperv_platform(void) static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) { static atomic_t nmi_cpu = ATOMIC_INIT(-1); + unsigned int old_cpu, this_cpu; if (!unknown_nmi_panic) return NMI_DONE; - if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1) + old_cpu = -1; + this_cpu = raw_smp_processor_id(); + if (!atomic_try_cmpxchg(&nmi_cpu, &old_cpu, this_cpu)) return NMI_HANDLED; return NMI_DONE; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 31c0e68f62..e65fae6366 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -20,13 +20,13 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, unsigned int cpu) { #ifdef CONFIG_SMP - seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); + seq_printf(m, "physical id\t: %d\n", c->topo.pkg_id); seq_printf(m, "siblings\t: %d\n", cpumask_weight(topology_core_cpumask(cpu))); - seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); + seq_printf(m, "core id\t\t: %d\n", c->topo.core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); - seq_printf(m, "apicid\t\t: %d\n", c->apicid); - seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); + seq_printf(m, "apicid\t\t: %d\n", c->topo.apicid); + seq_printf(m, "initial apicid\t: %d\n", c->topo.initial_apicid); #endif } diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 030d3b4097..19e0681f04 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -152,6 +152,7 @@ static inline void cache_alloc_hsw_probe(void) r->cache.cbm_len = 20; r->cache.shareable_bits = 0xc0000; r->cache.min_cbm_bits = 2; + r->cache.arch_has_sparse_bitmasks = false; r->alloc_capable = true; rdt_alloc_capable = true; @@ -267,15 +268,18 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_1_eax eax; + union cpuid_0x10_x_ecx ecx; union cpuid_0x10_x_edx edx; - u32 ebx, ecx; + u32 ebx; - cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); + cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + r->cache.arch_has_sparse_bitmasks = ecx.split.noncont; r->alloc_capable = true; } @@ -872,7 +876,6 @@ static __init void rdt_init_res_defs_intel(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { - r->cache.arch_has_sparse_bitmaps = false; r->cache.arch_has_per_cpu_cfg = false; r->cache.min_cbm_bits = 1; } else if (r->rid == RDT_RESOURCE_MBA) { @@ -892,7 +895,7 @@ static __init void rdt_init_res_defs_amd(void) if (r->rid == RDT_RESOURCE_L3 || r->rid == RDT_RESOURCE_L2) { - r->cache.arch_has_sparse_bitmaps = true; + r->cache.arch_has_sparse_bitmasks = true; r->cache.arch_has_per_cpu_cfg = true; r->cache.min_cbm_bits = 0; } else if (r->rid == RDT_RESOURCE_MBA) { diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index b44c487727..beccb0e87b 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -87,10 +87,12 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, /* * Check whether a cache bit mask is valid. - * For Intel the SDM says: - * Please note that all (and only) contiguous '1' combinations - * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). - * Additionally Haswell requires at least two bits set. + * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: + * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 + * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 + * + * Haswell does not support a non-contiguous 1s value and additionally + * requires at least two bits set. * AMD allows non-contiguous bitmasks. */ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) @@ -113,8 +115,8 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - /* Are non-contiguous bitmaps allowed? */ - if (!r->cache.arch_has_sparse_bitmaps && + /* Are non-contiguous bitmasks allowed? */ + if (!r->cache.arch_has_sparse_bitmasks && (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); return false; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 85ceaf9a31..a4f1aa15f0 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -59,6 +59,7 @@ struct rdt_fs_context { bool enable_cdpl2; bool enable_cdpl3; bool enable_mba_mbps; + bool enable_debug; }; static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) @@ -243,18 +244,17 @@ struct rdtgroup { */ #define RFTYPE_INFO BIT(0) #define RFTYPE_BASE BIT(1) -#define RF_CTRLSHIFT 4 -#define RF_MONSHIFT 5 -#define RF_TOPSHIFT 6 -#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) -#define RFTYPE_MON BIT(RF_MONSHIFT) -#define RFTYPE_TOP BIT(RF_TOPSHIFT) +#define RFTYPE_CTRL BIT(4) +#define RFTYPE_MON BIT(5) +#define RFTYPE_TOP BIT(6) #define RFTYPE_RES_CACHE BIT(8) #define RFTYPE_RES_MB BIT(9) -#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) -#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) -#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) -#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) +#define RFTYPE_DEBUG BIT(10) +#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) +#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) +#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) /* List of all resource groups */ extern struct list_head rdt_all_groups; @@ -270,7 +270,7 @@ void __exit rdtgroup_exit(void); * @mode: Access mode * @kf_ops: File operations * @flags: File specific RFTYPE_FLAGS_* flags - * @fflags: File specific RF_* or RFTYPE_* flags + * @fflags: File specific RFTYPE_* flags * @seq_show: Show content of the file * @write: Write to the file */ @@ -492,6 +492,15 @@ union cpuid_0x10_3_eax { unsigned int full; }; +/* CPUID.(EAX=10H, ECX=ResID).ECX */ +union cpuid_0x10_x_ecx { + struct { + unsigned int reserved:3; + unsigned int noncont:1; + } split; + unsigned int full; +}; + /* CPUID.(EAX=10H, ECX=ResID).EDX */ union cpuid_0x10_x_edx { struct { diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 725344048f..69a1de9238 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -54,8 +54,13 @@ static struct kernfs_node *kn_mondata; static struct seq_buf last_cmd_status; static char last_cmd_status_buf[512]; +static int rdtgroup_setup_root(struct rdt_fs_context *ctx); +static void rdtgroup_destroy_root(void); + struct dentry *debugfs_resctrl; +static bool resctrl_debug; + void rdt_last_cmd_clear(void) { lockdep_assert_held(&rdtgroup_mutex); @@ -696,11 +701,10 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdtgroup *rdtgrp; + char *pid_str; int ret = 0; pid_t pid; - if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) - return -EINVAL; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { rdtgroup_kn_unlock(of->kn); @@ -715,7 +719,27 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, goto unlock; } - ret = rdtgroup_move_task(pid, rdtgrp, of); + while (buf && buf[0] != '\0' && buf[0] != '\n') { + pid_str = strim(strsep(&buf, ",")); + + if (kstrtoint(pid_str, 0, &pid)) { + rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); + ret = -EINVAL; + break; + } + + if (pid < 0) { + rdt_last_cmd_printf("Invalid pid %d\n", pid); + ret = -EINVAL; + break; + } + + ret = rdtgroup_move_task(pid, rdtgrp, of); + if (ret) { + rdt_last_cmd_printf("Error while processing task %d\n", pid); + break; + } + } unlock: rdtgroup_kn_unlock(of->kn); @@ -755,6 +779,38 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } +static int rdtgroup_closid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->closid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static int rdtgroup_rmid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->mon.rmid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + #ifdef CONFIG_PROC_CPU_RESCTRL /* @@ -895,7 +951,7 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of, return 0; } -/** +/* * rdt_bit_usage_show - Display current usage of resources * * A domain is a shared resource that can now be allocated differently. Here @@ -1117,12 +1173,24 @@ static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) } } +static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); + + return 0; +} + /** * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other * @r: Resource to which domain instance @d belongs. * @d: The domain instance for which @closid is being tested. * @cbm: Capacity bitmask being tested. * @closid: Intended closid for @cbm. + * @type: CDP type of @r. * @exclusive: Only check if overlaps with exclusive resource groups * * Checks if provided @cbm intended to be used for @closid on domain @@ -1209,6 +1277,7 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, /** * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive + * @rdtgrp: Resource group identified through its closid. * * An exclusive resource group implies that there should be no sharing of * its allocated resources. At the time this group is considered to be @@ -1251,9 +1320,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) return true; } -/** +/* * rdtgroup_mode_write - Modify the resource group's mode - * */ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) @@ -1357,12 +1425,11 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, return size; } -/** +/* * rdtgroup_size_show - Display size in bytes of allocated regions * * The "size" file mirrors the layout of the "schemata" file, printing the * size in bytes of each region instead of the capacity bitmask. - * */ static int rdtgroup_size_show(struct kernfs_open_file *of, struct seq_file *s, void *v) @@ -1686,77 +1753,77 @@ static struct rftype res_common_files[] = { .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_last_cmd_status_show, - .fflags = RF_TOP_INFO, + .fflags = RFTYPE_TOP_INFO, }, { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_closids_show, - .fflags = RF_CTRL_INFO, + .fflags = RFTYPE_CTRL_INFO, }, { .name = "mon_features", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_mon_features_show, - .fflags = RF_MON_INFO, + .fflags = RFTYPE_MON_INFO, }, { .name = "num_rmids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_rmids_show, - .fflags = RF_MON_INFO, + .fflags = RFTYPE_MON_INFO, }, { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_default_ctrl_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_cbm_bits_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "shareable_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_shareable_bits_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "bit_usage", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bit_usage_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_bw_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "bandwidth_gran", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bw_gran_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "delay_linear", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_delay_linear_show, - .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, /* * Platform specific which (if any) capabilities are provided by @@ -1775,7 +1842,7 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .write = max_threshold_occ_write, .seq_show = max_threshold_occ_show, - .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, }, { .name = "mbm_total_bytes_config", @@ -1817,12 +1884,19 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_BASE, }, { + .name = "mon_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_rmid_show, + .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, + }, + { .name = "schemata", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, .write = rdtgroup_schemata_write, .seq_show = rdtgroup_schemata_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, }, { .name = "mode", @@ -1830,14 +1904,28 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .write = rdtgroup_mode_write, .seq_show = rdtgroup_mode_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, }, { .name = "size", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdtgroup_size_show, - .fflags = RF_CTRL_BASE, + .fflags = RFTYPE_CTRL_BASE, + }, + { + .name = "sparse_masks", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_has_sparse_bitmasks_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "ctrl_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_closid_show, + .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, }, }; @@ -1852,6 +1940,9 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) lockdep_assert_held(&rdtgroup_mutex); + if (resctrl_debug) + fflags |= RFTYPE_DEBUG; + for (rft = rfts; rft < rfts + len; rft++) { if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { ret = rdtgroup_add_file(kn, rft); @@ -1894,7 +1985,7 @@ void __init thread_throttle_mode_init(void) if (!rft) return; - rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; + rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB; } void __init mbm_config_rftype_init(const char *config) @@ -1903,7 +1994,7 @@ void __init mbm_config_rftype_init(const char *config) rft = rdtgroup_get_rftype_by_name(config); if (rft) - rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE; + rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE; } /** @@ -2038,21 +2129,21 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) if (IS_ERR(kn_info)) return PTR_ERR(kn_info); - ret = rdtgroup_add_files(kn_info, RF_TOP_INFO); + ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); if (ret) goto out_destroy; /* loop over enabled controls, these are all alloc_capable */ list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - fflags = r->fflags | RF_CTRL_INFO; + fflags = r->fflags | RFTYPE_CTRL_INFO; ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); if (ret) goto out_destroy; } for_each_mon_capable_rdt_resource(r) { - fflags = r->fflags | RF_MON_INFO; + fflags = r->fflags | RFTYPE_MON_INFO; sprintf(name, "%s_MON", r->name); ret = rdtgroup_mkdir_info_resdir(r, name, fflags); if (ret) @@ -2271,14 +2362,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) return 0; } -static void cdp_disable_all(void) -{ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); -} - /* * We don't allow rdtgroup directories to be created anywhere * except the root directory. Thus when looking for the rdtgroup @@ -2358,19 +2441,47 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, struct kernfs_node **mon_data_kn); +static void rdt_disable_ctx(void) +{ + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); + set_mba_sc(false); + + resctrl_debug = false; +} + static int rdt_enable_ctx(struct rdt_fs_context *ctx) { int ret = 0; - if (ctx->enable_cdpl2) + if (ctx->enable_cdpl2) { ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); + if (ret) + goto out_done; + } - if (!ret && ctx->enable_cdpl3) + if (ctx->enable_cdpl3) { ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); + if (ret) + goto out_cdpl2; + } - if (!ret && ctx->enable_mba_mbps) + if (ctx->enable_mba_mbps) { ret = set_mba_sc(true); + if (ret) + goto out_cdpl3; + } + + if (ctx->enable_debug) + resctrl_debug = true; + return 0; + +out_cdpl3: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); +out_cdpl2: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); +out_done: return ret; } @@ -2463,6 +2574,7 @@ static void schemata_list_destroy(void) static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); + unsigned long flags = RFTYPE_CTRL_BASE; struct rdt_domain *dom; struct rdt_resource *r; int ret; @@ -2477,18 +2589,31 @@ static int rdt_get_tree(struct fs_context *fc) goto out; } + ret = rdtgroup_setup_root(ctx); + if (ret) + goto out; + ret = rdt_enable_ctx(ctx); - if (ret < 0) - goto out_cdp; + if (ret) + goto out_root; ret = schemata_list_create(); if (ret) { schemata_list_destroy(); - goto out_mba; + goto out_ctx; } closid_init(); + if (rdt_mon_capable) + flags |= RFTYPE_MON; + + ret = rdtgroup_add_files(rdtgroup_default.kn, flags); + if (ret) + goto out_schemata_free; + + kernfs_activate(rdtgroup_default.kn); + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); if (ret < 0) goto out_schemata_free; @@ -2543,11 +2668,10 @@ out_info: kernfs_remove(kn_info); out_schemata_free: schemata_list_destroy(); -out_mba: - if (ctx->enable_mba_mbps) - set_mba_sc(false); -out_cdp: - cdp_disable_all(); +out_ctx: + rdt_disable_ctx(); +out_root: + rdtgroup_destroy_root(); out: rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); @@ -2559,6 +2683,7 @@ enum rdt_param { Opt_cdp, Opt_cdpl2, Opt_mba_mbps, + Opt_debug, nr__rdt_params }; @@ -2566,6 +2691,7 @@ static const struct fs_parameter_spec rdt_fs_parameters[] = { fsparam_flag("cdp", Opt_cdp), fsparam_flag("cdpl2", Opt_cdpl2), fsparam_flag("mba_MBps", Opt_mba_mbps), + fsparam_flag("debug", Opt_debug), {} }; @@ -2591,6 +2717,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; ctx->enable_mba_mbps = true; return 0; + case Opt_debug: + ctx->enable_debug = true; + return 0; } return -EINVAL; @@ -2618,7 +2747,6 @@ static int rdt_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; - ctx->kfc.root = rdt_root; ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; fc->fs_private = &ctx->kfc; fc->ops = &rdt_fs_context_ops; @@ -2779,16 +2907,16 @@ static void rdt_kill_sb(struct super_block *sb) cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - set_mba_sc(false); + rdt_disable_ctx(); /*Put everything back to default values. */ for_each_alloc_capable_rdt_resource(r) reset_all_ctrls(r); - cdp_disable_all(); rmdir_all_sub(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; schemata_list_destroy(); + rdtgroup_destroy_root(); static_branch_disable_cpuslocked(&rdt_alloc_enable_key); static_branch_disable_cpuslocked(&rdt_mon_enable_key); static_branch_disable_cpuslocked(&rdt_enable_key); @@ -3170,8 +3298,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, enum rdt_group_type rtype, struct rdtgroup **r) { struct rdtgroup *prdtgrp, *rdtgrp; + unsigned long files = 0; struct kernfs_node *kn; - uint files = 0; int ret; prdtgrp = rdtgroup_kn_lock_live(parent_kn); @@ -3223,7 +3351,14 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; } - files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + if (rtype == RDTCTRL_GROUP) { + files = RFTYPE_BASE | RFTYPE_CTRL; + if (rdt_mon_capable) + files |= RFTYPE_MON; + } else { + files = RFTYPE_BASE | RFTYPE_MON; + } + ret = rdtgroup_add_files(kn, files); if (ret) { rdt_last_cmd_puts("kernfs fill error\n"); @@ -3656,6 +3791,9 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl)) seq_puts(seq, ",mba_MBps"); + if (resctrl_debug) + seq_puts(seq, ",debug"); + return 0; } @@ -3666,10 +3804,8 @@ static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { .show_options = rdtgroup_show_options, }; -static int __init rdtgroup_setup_root(void) +static int rdtgroup_setup_root(struct rdt_fs_context *ctx) { - int ret; - rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, @@ -3677,6 +3813,20 @@ static int __init rdtgroup_setup_root(void) if (IS_ERR(rdt_root)) return PTR_ERR(rdt_root); + ctx->kfc.root = rdt_root; + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); + + return 0; +} + +static void rdtgroup_destroy_root(void) +{ + kernfs_destroy_root(rdt_root); + rdtgroup_default.kn = NULL; +} + +static void __init rdtgroup_setup_default(void) +{ mutex_lock(&rdtgroup_mutex); rdtgroup_default.closid = 0; @@ -3686,19 +3836,7 @@ static int __init rdtgroup_setup_root(void) list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE); - if (ret) { - kernfs_destroy_root(rdt_root); - goto out; - } - - rdtgroup_default.kn = kernfs_root_to_node(rdt_root); - kernfs_activate(rdtgroup_default.kn); - -out: mutex_unlock(&rdtgroup_mutex); - - return ret; } static void domain_destroy_mon_state(struct rdt_domain *d) @@ -3820,13 +3958,11 @@ int __init rdtgroup_init(void) seq_buf_init(&last_cmd_status, last_cmd_status_buf, sizeof(last_cmd_status_buf)); - ret = rdtgroup_setup_root(); - if (ret) - return ret; + rdtgroup_setup_default(); ret = sysfs_create_mount_point(fs_kobj, "resctrl"); if (ret) - goto cleanup_root; + return ret; ret = register_filesystem(&rdt_fs_type); if (ret) @@ -3859,8 +3995,6 @@ int __init rdtgroup_init(void) cleanup_mountpoint: sysfs_remove_mount_point(fs_kobj, "resctrl"); -cleanup_root: - kernfs_destroy_root(rdt_root); return ret; } @@ -3870,5 +4004,4 @@ void __exit rdtgroup_exit(void) debugfs_remove_recursive(debugfs_resctrl); unregister_filesystem(&rdt_fs_type); sysfs_remove_mount_point(fs_kobj, "resctrl"); - kernfs_destroy_root(rdt_root); } diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 0270925fe0..dc13670356 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -78,7 +78,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c) /* * initial apic id, which also represents 32-bit extended x2apic id. */ - c->initial_apicid = edx; + c->topo.initial_apicid = edx; smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); #endif return 0; @@ -108,7 +108,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) * Populate HT related information from sub-leaf level 0. */ cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - c->initial_apicid = edx; + c->topo.initial_apicid = edx; core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); @@ -146,20 +146,19 @@ int detect_extended_topology(struct cpuinfo_x86 *c) die_select_mask = (~(-1 << die_plus_mask_width)) >> core_plus_mask_width; - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, + c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, ht_mask_width) & core_select_mask; if (die_level_present) { - c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid, + c->topo.die_id = apic->phys_pkg_id(c->topo.initial_apicid, core_plus_mask_width) & die_select_mask; } - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, - pkg_mask_width); + c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, pkg_mask_width); /* * Reinit the apicid, now that we have extended initial_apicid. */ - c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); c->x86_max_cores = (core_level_siblings / smp_num_siblings); __max_die_per_package = (die_level_siblings / core_level_siblings); diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 05fa4ef634..415564a652 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -65,20 +65,6 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); } - - if (c->cpuid_level >= 0x00000001) { - u32 eax, ebx, ecx, edx; - - cpuid(0x00000001, &eax, &ebx, &ecx, &edx); - /* - * If HTT (EDX[28]) is set EBX[16:23] contain the number of - * apicids which are reserved per package. Store the resulting - * shift value for the package management code. - */ - if (edx & (1U << 28)) - c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); - } - } static void init_zhaoxin(struct cpuinfo_x86 *c) |