diff options
Diffstat (limited to 'arch/x86/kernel/cpu/mce/threshold.c')
-rw-r--r-- | arch/x86/kernel/cpu/mce/threshold.c | 115 |
1 files changed, 115 insertions, 0 deletions
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index ef4e7bb5fd..89e31e1e5c 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -29,3 +29,118 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); apic_eoi(); } + +DEFINE_PER_CPU(struct mca_storm_desc, storm_desc); + +void mce_inherit_storm(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + /* + * Previous CPU owning this bank had put it into storm mode, + * but the precise history of that storm is unknown. Assume + * the worst (all recent polls of the bank found a valid error + * logged). This will avoid the new owner prematurely declaring + * the storm has ended. + */ + storm->banks[bank].history = ~0ull; + storm->banks[bank].timestamp = jiffies; +} + +bool mce_get_storm_mode(void) +{ + return __this_cpu_read(storm_desc.poll_mode); +} + +void mce_set_storm_mode(bool storm) +{ + __this_cpu_write(storm_desc.poll_mode, storm); +} + +static void mce_handle_storm(unsigned int bank, bool on) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_handle_storm(bank, on); + break; + } +} + +void cmci_storm_begin(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + __set_bit(bank, this_cpu_ptr(mce_poll_banks)); + storm->banks[bank].in_storm_mode = true; + + /* + * If this is the first bank on this CPU to enter storm mode + * start polling. + */ + if (++storm->stormy_bank_count == 1) + mce_timer_kick(true); +} + +void cmci_storm_end(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + storm->banks[bank].history = 0; + storm->banks[bank].in_storm_mode = false; + + /* If no banks left in storm mode, stop polling. */ + if (!this_cpu_dec_return(storm_desc.stormy_bank_count)) + mce_timer_kick(false); +} + +void mce_track_storm(struct mce *mce) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + unsigned long now = jiffies, delta; + unsigned int shift = 1; + u64 history = 0; + + /* No tracking needed for banks that do not support CMCI */ + if (storm->banks[mce->bank].poll_only) + return; + + /* + * When a bank is in storm mode it is polled once per second and + * the history mask will record about the last minute of poll results. + * If it is not in storm mode, then the bank is only checked when + * there is a CMCI interrupt. Check how long it has been since + * this bank was last checked, and adjust the amount of "shift" + * to apply to history. + */ + if (!storm->banks[mce->bank].in_storm_mode) { + delta = now - storm->banks[mce->bank].timestamp; + shift = (delta + HZ) / HZ; + } + + /* If it has been a long time since the last poll, clear history. */ + if (shift < NUM_HISTORY_BITS) + history = storm->banks[mce->bank].history << shift; + + storm->banks[mce->bank].timestamp = now; + + /* History keeps track of corrected errors. VAL=1 && UC=0 */ + if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce)) + history |= 1; + + storm->banks[mce->bank].history = history; + + if (storm->banks[mce->bank].in_storm_mode) { + if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0)) + return; + printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank); + mce_handle_storm(mce->bank, false); + cmci_storm_end(mce->bank); + } else { + if (hweight64(history) < STORM_BEGIN_THRESHOLD) + return; + printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank); + mce_handle_storm(mce->bank, true); + cmci_storm_begin(mce->bank); + } +} |