summary refs log tree commit diff stats
path: root/arch/x86/kernel/cpu/mce/threshold.c
blob: 89e31e1e5c9c1ef71258f2815027fcc03041bdc2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
// SPDX-License-Identifier: GPL-2.0
/*
 * Common corrected MCE threshold handler code:
 */
#include <linux/interrupt.h>
#include <linux/kernel.h>

#include <asm/irq_vectors.h>
#include <asm/traps.h>
#include <asm/apic.h>
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>

#include "internal.h"

/*
 * Initial target of the mce_threshold_vector hook: there is nothing to
 * handle here, so just log that the threshold vector fired.
 */
static void default_threshold_interrupt(void)
{
	const unsigned int vector = THRESHOLD_APIC_VECTOR;

	pr_err("Unexpected threshold interrupt at vector %x\n", vector);
}

/*
 * Handler invoked from the sysvec_threshold entry code. Starts out as
 * default_threshold_interrupt(); it is not static, so code outside this
 * file can install a different handler (presumably vendor-specific
 * threshold code - confirm against the users of this pointer).
 */
void (*mce_threshold_vector)(void) = default_threshold_interrupt;

/*
 * System-vector entry point sysvec_threshold: bumps the
 * irq_threshold_count statistic and dispatches to whatever handler
 * mce_threshold_vector currently points at. The work is bracketed by
 * the threshold_apic entry/exit tracepoints (tagged with
 * THRESHOLD_APIC_VECTOR) and apic_eoi() is issued last.
 */
DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
{
	trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
	inc_irq_stat(irq_threshold_count);
	/* Indirect call; points at default_threshold_interrupt() unless replaced. */
	mce_threshold_vector();
	trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
	apic_eoi();
}

/*
 * Per-CPU CMCI storm tracking state. As used in this file it holds, per
 * bank, a history bitmask, a timestamp and the in_storm_mode/poll_only
 * flags, plus the CPU-wide stormy_bank_count and poll_mode fields.
 * struct mca_storm_desc itself is declared outside this file.
 */
DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);

/*
 * Set up this CPU's tracking state for @bank when it takes over a bank
 * that its previous owner had already put into storm mode.
 */
void mce_inherit_storm(unsigned int bank)
{
	struct mca_storm_desc *desc = this_cpu_ptr(&storm_desc);

	/*
	 * The new owner has no record of how the storm developed on the
	 * previous CPU. Be pessimistic and pretend every recent poll of
	 * the bank logged a valid error, so that the storm is not declared
	 * over too early.
	 */
	desc->banks[bank].history = ~0ULL;
	desc->banks[bank].timestamp = jiffies;
}

/*
 * Return the poll_mode flag from this CPU's storm_desc.
 *
 * NOTE(review): uses the __this_cpu_read() variant, so callers are
 * presumably expected to run with preemption disabled - confirm.
 */
bool mce_get_storm_mode(void)
{
	return __this_cpu_read(storm_desc.poll_mode);
}

/*
 * Store @storm into the poll_mode flag of this CPU's storm_desc.
 *
 * NOTE(review): uses the __this_cpu_write() variant, so callers are
 * presumably expected to run with preemption disabled - confirm.
 */
void mce_set_storm_mode(bool storm)
{
	__this_cpu_write(storm_desc.poll_mode, storm);
}

/*
 * Pass a storm on/off transition for @bank to the vendor-specific code.
 * Only Intel has a handler; for any other vendor this does nothing.
 */
static void mce_handle_storm(unsigned int bank, bool on)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	mce_intel_handle_storm(bank, on);
}

/*
 * Put @bank into storm mode on this CPU: add it to this CPU's
 * mce_poll_banks mask and flag it in the per-CPU storm descriptor.
 */
void cmci_storm_begin(unsigned int bank)
{
	struct mca_storm_desc *desc = this_cpu_ptr(&storm_desc);

	__set_bit(bank, this_cpu_ptr(mce_poll_banks));
	desc->banks[bank].in_storm_mode = true;
	desc->stormy_bank_count++;

	/* Only the first stormy bank on this CPU needs to start polling. */
	if (desc->stormy_bank_count == 1)
		mce_timer_kick(true);
}

/*
 * Take @bank out of storm mode on this CPU: drop it from this CPU's
 * mce_poll_banks mask and reset its history.
 */
void cmci_storm_end(unsigned int bank)
{
	struct mca_storm_desc *desc = this_cpu_ptr(&storm_desc);

	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
	desc->banks[bank].history = 0;
	desc->banks[bank].in_storm_mode = false;

	/* Polling stops once the last stormy bank on this CPU is gone. */
	if (this_cpu_dec_return(storm_desc.stormy_bank_count) == 0)
		mce_timer_kick(false);
}

void mce_track_storm(struct mce *mce)
{
	struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
	unsigned long now = jiffies, delta;
	unsigned int shift = 1;
	u64 history = 0;

	/* No tracking needed for banks that do not support CMCI */
	if (storm->banks[mce->bank].poll_only)
		return;

	/*
	 * When a bank is in storm mode it is polled once per second and
	 * the history mask will record about the last minute of poll results.
	 * If it is not in storm mode, then the bank is only checked when
	 * there is a CMCI interrupt. Check how long it has been since
	 * this bank was last checked, and adjust the amount of "shift"
	 * to apply to history.
	 */
	if (!storm->banks[mce->bank].in_storm_mode) {
		delta = now - storm->banks[mce->bank].timestamp;
		shift = (delta + HZ) / HZ;
	}

	/* If it has been a long time since the last poll, clear history. */
	if (shift < NUM_HISTORY_BITS)
		history = storm->banks[mce->bank].history << shift;

	storm->banks[mce->bank].timestamp = now;

	/* History keeps track of corrected errors. VAL=1 && UC=0 */
	if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
		history |= 1;

	storm->banks[mce->bank].history = history;

	if (storm->banks[mce->bank].in_storm_mode) {
		if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
			return;
		printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank);
		mce_handle_storm(mce->bank, false);
		cmci_storm_end(mce->bank);
	} else {
		if (hweight64(history) < STORM_BEGIN_THRESHOLD)
			return;
		printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank);
		mce_handle_storm(mce->bank, true);
		cmci_storm_begin(mce->bank);
	}
}