diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 10:05:51 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 10:05:51 +0000 |
commit | 5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 (patch) | |
tree | a94efe259b9009378be6d90eb30d2b019d95c194 /drivers/edac/mce_amd.c | |
parent | Initial commit. (diff) | |
download | linux-upstream/5.10.209.tar.xz linux-upstream/5.10.209.zip |
Adding upstream version 5.10.209.upstream/5.10.209upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/edac/mce_amd.c')
-rw-r--r-- | drivers/edac/mce_amd.c | 1262 |
1 files changed, 1262 insertions, 0 deletions
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c new file mode 100644 index 000000000..b6d4ae84a --- /dev/null +++ b/drivers/edac/mce_amd.c @@ -0,0 +1,1262 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/slab.h> + +#include <asm/cpu.h> + +#include "mce_amd.h" + +static struct amd_decoder_ops fam_ops; + +static u8 xec_mask = 0xf; + +static void (*decode_dram_ecc)(int node_id, struct mce *m); + +void amd_register_ecc_decoder(void (*f)(int, struct mce *)) +{ + decode_dram_ecc = f; +} +EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); + +void amd_unregister_ecc_decoder(void (*f)(int, struct mce *)) +{ + if (decode_dram_ecc) { + WARN_ON(decode_dram_ecc != f); + + decode_dram_ecc = NULL; + } +} +EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); + +/* + * string representation for the different MCA reported error types, see F3x48 + * or MSR0000_0411. + */ + +/* transaction type */ +static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; + +/* cache level */ +static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; + +/* memory transaction type */ +static const char * const rrrr_msgs[] = { + "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP" +}; + +/* participating processor */ +const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; +EXPORT_SYMBOL_GPL(pp_msgs); + +/* request timeout */ +static const char * const to_msgs[] = { "no timeout", "timed out" }; + +/* memory or i/o */ +static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; + +/* internal error type */ +static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; + +static const char * const f15h_mc1_mce_desc[] = { + "UC during a demand linefill from L2", + "Parity error during data load from IC", + "Parity error for IC valid bit", + "Main tag parity error", + "Parity error in prediction queue", + "PFB data/address parity error", + "Parity error in the branch status reg", + "PFB promotion address error", + "Tag error during probe/victimization", + "Parity error for IC probe tag valid bit", + "PFB non-cacheable bit parity error", + "PFB valid bit parity error", /* xec = 0xd */ + "Microcode Patch Buffer", /* xec = 010 */ + "uop queue", + "insn buffer", + "predecode buffer", + "fetch address FIFO", + "dispatch uop queue" +}; + +static const char * const f15h_mc2_mce_desc[] = { + "Fill ECC error on data fills", /* xec = 0x4 */ + "Fill parity error on insn fills", + "Prefetcher request FIFO parity error", + "PRQ address parity error", + "PRQ data parity error", + "WCC Tag ECC error", + "WCC Data ECC error", + "WCB Data parity error", + "VB Data ECC or parity error", + "L2 Tag ECC error", /* xec = 0x10 */ + "Hard L2 Tag ECC error", + "Multiple hits on L2 tag", + "XAB parity error", + "PRB address parity error" +}; + +static const char * const mc4_mce_desc[] = { + "DRAM ECC error detected on the NB", + "CRC error detected on HT link", + "Link-defined sync error packets detected on HT link", + "HT Master abort", + "HT Target abort", + "Invalid GART PTE entry during GART table walk", + "Unsupported atomic RMW received from an IO link", + "Watchdog timeout due to lack of progress", + "DRAM ECC error detected on the NB", + "SVM DMA Exclusion Vector error", + "HT data error detected on link", + "Protocol error (link, L3, probe filter)", + "NB internal arrays parity error", + "DRAM addr/ctl signals parity error", + "IO link transmission error", + "L3 data cache ECC error", /* xec = 0x1c */ + "L3 cache tag error", + "L3 LRU parity bits error", + "ECC Error in the Probe Filter directory" +}; + +static const char * const mc5_mce_desc[] = { + "CPU Watchdog timer expire", + "Wakeup array dest tag", + "AG payload array", + "EX payload array", + "IDRF array", + "Retire dispatch queue", + "Mapper checkpoint array", + "Physical register file EX0 port", + "Physical register file EX1 port", + "Physical register file AG0 port", + "Physical register file AG1 port", + "Flag register file", + "DE error occurred", + "Retire status queue" +}; + +static const char * const mc6_mce_desc[] = { + "Hardware Assertion", + "Free List", + "Physical Register File", + "Retire Queue", + "Scheduler table", + "Status Register File", +}; + +/* Scalable MCA error strings */ +static const char * const smca_ls_mce_desc[] = { + "Load queue parity error", + "Store queue parity error", + "Miss address buffer payload parity error", + "Level 1 TLB parity error", + "DC Tag error type 5", + "DC Tag error type 6", + "DC Tag error type 1", + "Internal error type 1", + "Internal error type 2", + "System Read Data Error Thread 0", + "System Read Data Error Thread 1", + "DC Tag error type 2", + "DC Data error type 1 and poison consumption", + "DC Data error type 2", + "DC Data error type 3", + "DC Tag error type 4", + "Level 2 TLB parity error", + "PDC parity error", + "DC Tag error type 3", + "DC Tag error type 5", + "L2 Fill Data error", +}; + +static const char * const smca_ls2_mce_desc[] = { + "An ECC error was detected on a data cache read by a probe or victimization", + "An ECC error or L2 poison was detected on a data cache read by a load", + "An ECC error was detected on a data cache read-modify-write by a store", + "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization", + "An ECC error or poison bit mismatch was detected on a tag read by a load", + "An ECC error or poison bit mismatch was detected on a tag read by a store", + "An ECC error was detected on an EMEM read by a load", + "An ECC error was detected on an EMEM read-modify-write by a store", + "A parity error was detected in an L1 TLB entry by any access", + "A parity error was detected in an L2 TLB entry by any access", + "A parity error was detected in a PWC entry by any access", + "A parity error was detected in an STQ entry by any access", + "A parity error was detected in an LDQ entry by any access", + "A parity error was detected in a MAB entry by any access", + "A parity error was detected in an SCB entry state field by any access", + "A parity error was detected in an SCB entry address field by any access", + "A parity error was detected in an SCB entry data field by any access", + "A parity error was detected in a WCB entry by any access", + "A poisoned line was detected in an SCB entry by any access", + "A SystemReadDataError error was reported on read data returned from L2 for a load", + "A SystemReadDataError error was reported on read data returned from L2 for an SCB store", + "A SystemReadDataError error was reported on read data returned from L2 for a WCB store", + "A hardware assertion error was reported", + "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access", +}; + +static const char * const smca_if_mce_desc[] = { + "Op Cache Microtag Probe Port Parity Error", + "IC Microtag or Full Tag Multi-hit Error", + "IC Full Tag Parity Error", + "IC Data Array Parity Error", + "Decoupling Queue PhysAddr Parity Error", + "L0 ITLB Parity Error", + "L1 ITLB Parity Error", + "L2 ITLB Parity Error", + "BPQ Thread 0 Snoop Parity Error", + "BPQ Thread 1 Snoop Parity Error", + "L1 BTB Multi-Match Error", + "L2 BTB Multi-Match Error", + "L2 Cache Response Poison Error", + "System Read Data Error", + "Hardware Assertion Error", + "L1-TLB Multi-Hit", + "L2-TLB Multi-Hit", + "BSR Parity Error", + "CT MCE", +}; + +static const char * const smca_l2_mce_desc[] = { + "L2M Tag Multiple-Way-Hit error", + "L2M Tag or State Array ECC Error", + "L2M Data Array ECC Error", + "Hardware Assert Error", +}; + +static const char * const smca_de_mce_desc[] = { + "Micro-op cache tag parity error", + "Micro-op cache data parity error", + "Instruction buffer parity error", + "Micro-op queue parity error", + "Instruction dispatch queue parity error", + "Fetch address FIFO parity error", + "Patch RAM data parity error", + "Patch RAM sequencer parity error", + "Micro-op buffer parity error", + "Hardware Assertion MCA Error", +}; + +static const char * const smca_ex_mce_desc[] = { + "Watchdog Timeout error", + "Physical register file parity error", + "Flag register file parity error", + "Immediate displacement register file parity error", + "Address generator payload parity error", + "EX payload parity error", + "Checkpoint queue parity error", + "Retire dispatch queue parity error", + "Retire status queue parity error", + "Scheduling queue parity error", + "Branch buffer queue parity error", + "Hardware Assertion error", + "Spec Map parity error", + "Retire Map parity error", +}; + +static const char * const smca_fp_mce_desc[] = { + "Physical register file (PRF) parity error", + "Freelist (FL) parity error", + "Schedule queue parity error", + "NSQ parity error", + "Retire queue (RQ) parity error", + "Status register file (SRF) parity error", + "Hardware assertion", +}; + +static const char * const smca_l3_mce_desc[] = { + "Shadow Tag Macro ECC Error", + "Shadow Tag Macro Multi-way-hit Error", + "L3M Tag ECC Error", + "L3M Tag Multi-way-hit Error", + "L3M Data ECC Error", + "SDP Parity Error or SystemReadDataError from XI", + "L3 Victim Queue Parity Error", + "L3 Hardware Assertion", +}; + +static const char * const smca_cs_mce_desc[] = { + "Illegal Request", + "Address Violation", + "Security Violation", + "Illegal Response", + "Unexpected Response", + "Request or Probe Parity Error", + "Read Response Parity Error", + "Atomic Request Parity Error", + "Probe Filter ECC Error", +}; + +static const char * const smca_cs2_mce_desc[] = { + "Illegal Request", + "Address Violation", + "Security Violation", + "Illegal Response", + "Unexpected Response", + "Request or Probe Parity Error", + "Read Response Parity Error", + "Atomic Request Parity Error", + "SDP read response had no match in the CS queue", + "Probe Filter Protocol Error", + "Probe Filter ECC Error", + "SDP read response had an unexpected RETRY error", + "Counter overflow error", + "Counter underflow error", +}; + +static const char * const smca_pie_mce_desc[] = { + "Hardware Assert", + "Register security violation", + "Link Error", + "Poison data consumption", + "A deferred error was detected in the DF" +}; + +static const char * const smca_umc_mce_desc[] = { + "DRAM ECC error", + "Data poison error", + "SDP parity error", + "Advanced peripheral bus error", + "Address/Command parity error", + "Write data CRC error", + "DCQ SRAM ECC error", + "AES SRAM ECC error", +}; + +static const char * const smca_pb_mce_desc[] = { + "An ECC error in the Parameter Block RAM array", +}; + +static const char * const smca_psp_mce_desc[] = { + "An ECC or parity error in a PSP RAM instance", +}; + +static const char * const smca_psp2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Instruction Cache Bank 0 ECC or parity error", + "Instruction Cache Bank 1 ECC or parity error", + "Instruction Tag Ram 0 parity error", + "Instruction Tag Ram 1 parity error", + "Data Cache Bank 0 ECC or parity error", + "Data Cache Bank 1 ECC or parity error", + "Data Cache Bank 2 ECC or parity error", + "Data Cache Bank 3 ECC or parity error", + "Data Tag Bank 0 parity error", + "Data Tag Bank 1 parity error", + "Data Tag Bank 2 parity error", + "Data Tag Bank 3 parity error", + "Dirty Data Ram parity error", + "TLB Bank 0 parity error", + "TLB Bank 1 parity error", + "System Hub Read Buffer ECC or parity error", +}; + +static const char * const smca_smu_mce_desc[] = { + "An ECC or parity error in an SMU RAM instance", +}; + +static const char * const smca_smu2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", + "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", + "PHY RAM ECC error", +}; + +static const char * const smca_mp5_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", + "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", +}; + +static const char * const smca_nbio_mce_desc[] = { + "ECC or Parity error", + "PCIE error", + "SDP ErrEvent error", + "SDP Egress Poison Error", + "IOHC Internal Poison Error", +}; + +static const char * const smca_pcie_mce_desc[] = { + "CCIX PER Message logging", + "CCIX Read Response with Status: Non-Data Error", + "CCIX Write Response with Status: Non-Data Error", + "CCIX Read Response with Status: Data Error", + "CCIX Non-okay write response with data error", +}; + +struct smca_mce_desc { + const char * const *descs; + unsigned int num_descs; +}; + +static struct smca_mce_desc smca_mce_descs[] = { + [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) }, + [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) }, + [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) }, + [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) }, + [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) }, + [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) }, + [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, + [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, + [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, + [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, + [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, + [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, + [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, + [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, + [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) }, + [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, + [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) }, + [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, + [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) }, + [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) }, +}; + +static bool f12h_mc0_mce(u16 ec, u8 xec) +{ + bool ret = false; + + if (MEM_ERROR(ec)) { + u8 ll = LL(ec); + ret = true; + + if (ll == LL_L2) + pr_cont("during L1 linefill from L2.\n"); + else if (ll == LL_L1) + pr_cont("Data/Tag %s error.\n", R4_MSG(ec)); + else + ret = false; + } + return ret; +} + +static bool f10h_mc0_mce(u16 ec, u8 xec) +{ + if (R4(ec) == R4_GEN && LL(ec) == LL_L1) { + pr_cont("during data scrub.\n"); + return true; + } + return f12h_mc0_mce(ec, xec); +} + +static bool k8_mc0_mce(u16 ec, u8 xec) +{ + if (BUS_ERROR(ec)) { + pr_cont("during system linefill.\n"); + return true; + } + + return f10h_mc0_mce(ec, xec); +} + +static bool cat_mc0_mce(u16 ec, u8 xec) +{ + u8 r4 = R4(ec); + bool ret = true; + + if (MEM_ERROR(ec)) { + + if (TT(ec) != TT_DATA || LL(ec) != LL_L1) + return false; + + switch (r4) { + case R4_DRD: + case R4_DWR: + pr_cont("Data/Tag parity error due to %s.\n", + (r4 == R4_DRD ? "load/hw prf" : "store")); + break; + case R4_EVICT: + pr_cont("Copyback parity error on a tag miss.\n"); + break; + case R4_SNOOP: + pr_cont("Tag parity error during snoop.\n"); + break; + default: + ret = false; + } + } else if (BUS_ERROR(ec)) { + + if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG) + return false; + + pr_cont("System read data error on a "); + + switch (r4) { + case R4_RD: + pr_cont("TLB reload.\n"); + break; + case R4_DWR: + pr_cont("store.\n"); + break; + case R4_DRD: + pr_cont("load.\n"); + break; + default: + ret = false; + } + } else { + ret = false; + } + + return ret; +} + +static bool f15h_mc0_mce(u16 ec, u8 xec) +{ + bool ret = true; + + if (MEM_ERROR(ec)) { + + switch (xec) { + case 0x0: + pr_cont("Data Array access error.\n"); + break; + + case 0x1: + pr_cont("UC error during a linefill from L2/NB.\n"); + break; + + case 0x2: + case 0x11: + pr_cont("STQ access error.\n"); + break; + + case 0x3: + pr_cont("SCB access error.\n"); + break; + + case 0x10: + pr_cont("Tag error.\n"); + break; + + case 0x12: + pr_cont("LDQ access error.\n"); + break; + + default: + ret = false; + } + } else if (BUS_ERROR(ec)) { + + if (!xec) + pr_cont("System Read Data Error.\n"); + else + pr_cont(" Internal error condition type %d.\n", xec); + } else if (INT_ERROR(ec)) { + if (xec <= 0x1f) + pr_cont("Hardware Assert.\n"); + else + ret = false; + + } else + ret = false; + + return ret; +} + +static void decode_mc0_mce(struct mce *m) +{ + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); + + pr_emerg(HW_ERR "MC0 Error: "); + + /* TLB error signatures are the same across families */ + if (TLB_ERROR(ec)) { + if (TT(ec) == TT_DATA) { + pr_cont("%s TLB %s.\n", LL_MSG(ec), + ((xec == 2) ? "locked miss" + : (xec ? "multimatch" : "parity"))); + return; + } + } else if (fam_ops.mc0_mce(ec, xec)) + ; + else + pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n"); +} + +static bool k8_mc1_mce(u16 ec, u8 xec) +{ + u8 ll = LL(ec); + bool ret = true; + + if (!MEM_ERROR(ec)) + return false; + + if (ll == 0x2) + pr_cont("during a linefill from L2.\n"); + else if (ll == 0x1) { + switch (R4(ec)) { + case R4_IRD: + pr_cont("Parity error during data load.\n"); + break; + + case R4_EVICT: + pr_cont("Copyback Parity/Victim error.\n"); + break; + + case R4_SNOOP: + pr_cont("Tag Snoop error.\n"); + break; + + default: + ret = false; + break; + } + } else + ret = false; + + return ret; +} + +static bool cat_mc1_mce(u16 ec, u8 xec) +{ + u8 r4 = R4(ec); + bool ret = true; + + if (!MEM_ERROR(ec)) + return false; + + if (TT(ec) != TT_INSTR) + return false; + + if (r4 == R4_IRD) + pr_cont("Data/tag array parity error for a tag hit.\n"); + else if (r4 == R4_SNOOP) + pr_cont("Tag error during snoop/victimization.\n"); + else if (xec == 0x0) + pr_cont("Tag parity error from victim castout.\n"); + else if (xec == 0x2) + pr_cont("Microcode patch RAM parity error.\n"); + else + ret = false; + + return ret; +} + +static bool f15h_mc1_mce(u16 ec, u8 xec) +{ + bool ret = true; + + if (!MEM_ERROR(ec)) + return false; + + switch (xec) { + case 0x0 ... 0xa: + pr_cont("%s.\n", f15h_mc1_mce_desc[xec]); + break; + + case 0xd: + pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]); + break; + + case 0x10: + pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]); + break; + + case 0x11 ... 0x15: + pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]); + break; + + default: + ret = false; + } + return ret; +} + +static void decode_mc1_mce(struct mce *m) +{ + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); + + pr_emerg(HW_ERR "MC1 Error: "); + + if (TLB_ERROR(ec)) + pr_cont("%s TLB %s.\n", LL_MSG(ec), + (xec ? "multimatch" : "parity error")); + else if (BUS_ERROR(ec)) { + bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58))); + + pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read")); + } else if (INT_ERROR(ec)) { + if (xec <= 0x3f) + pr_cont("Hardware Assert.\n"); + else + goto wrong_mc1_mce; + } else if (fam_ops.mc1_mce(ec, xec)) + ; + else + goto wrong_mc1_mce; + + return; + +wrong_mc1_mce: + pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n"); +} + +static bool k8_mc2_mce(u16 ec, u8 xec) +{ + bool ret = true; + + if (xec == 0x1) + pr_cont(" in the write data buffers.\n"); + else if (xec == 0x3) + pr_cont(" in the victim data buffers.\n"); + else if (xec == 0x2 && MEM_ERROR(ec)) + pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec)); + else if (xec == 0x0) { + if (TLB_ERROR(ec)) + pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n", + TT_MSG(ec)); + else if (BUS_ERROR(ec)) + pr_cont(": %s/ECC error in data read from NB: %s.\n", + R4_MSG(ec), PP_MSG(ec)); + else if (MEM_ERROR(ec)) { + u8 r4 = R4(ec); + + if (r4 >= 0x7) + pr_cont(": %s error during data copyback.\n", + R4_MSG(ec)); + else if (r4 <= 0x1) + pr_cont(": %s parity/ECC error during data " + "access from L2.\n", R4_MSG(ec)); + else + ret = false; + } else + ret = false; + } else + ret = false; + + return ret; +} + +static bool f15h_mc2_mce(u16 ec, u8 xec) +{ + bool ret = true; + + if (TLB_ERROR(ec)) { + if (xec == 0x0) + pr_cont("Data parity TLB read error.\n"); + else if (xec == 0x1) + pr_cont("Poison data provided for TLB fill.\n"); + else + ret = false; + } else if (BUS_ERROR(ec)) { + if (xec > 2) + ret = false; + + pr_cont("Error during attempted NB data read.\n"); + } else if (MEM_ERROR(ec)) { + switch (xec) { + case 0x4 ... 0xc: + pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]); + break; + + case 0x10 ... 0x14: + pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]); + break; + + default: + ret = false; + } + } else if (INT_ERROR(ec)) { + if (xec <= 0x3f) + pr_cont("Hardware Assert.\n"); + else + ret = false; + } + + return ret; +} + +static bool f16h_mc2_mce(u16 ec, u8 xec) +{ + u8 r4 = R4(ec); + + if (!MEM_ERROR(ec)) + return false; + + switch (xec) { + case 0x04 ... 0x05: + pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O'); + break; + + case 0x09 ... 0x0b: + case 0x0d ... 0x0f: + pr_cont("ECC error in L2 tag (%s).\n", + ((r4 == R4_GEN) ? "BankReq" : + ((r4 == R4_SNOOP) ? "Prb" : "Fill"))); + break; + + case 0x10 ... 0x19: + case 0x1b: + pr_cont("ECC error in L2 data array (%s).\n", + (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" : + ((r4 == R4_GEN) ? "Attr" : + ((r4 == R4_EVICT) ? "Vict" : "Fill")))); + break; + + case 0x1c ... 0x1d: + case 0x1f: + pr_cont("Parity error in L2 attribute bits (%s).\n", + ((r4 == R4_RD) ? "Hit" : + ((r4 == R4_GEN) ? "Attr" : "Fill"))); + break; + + default: + return false; + } + + return true; +} + +static void decode_mc2_mce(struct mce *m) +{ + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); + + pr_emerg(HW_ERR "MC2 Error: "); + + if (!fam_ops.mc2_mce(ec, xec)) + pr_cont(HW_ERR "Corrupted MC2 MCE info?\n"); +} + +static void decode_mc3_mce(struct mce *m) +{ + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); + + if (boot_cpu_data.x86 >= 0x14) { + pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family," + " please report on LKML.\n"); + return; + } + + pr_emerg(HW_ERR "MC3 Error"); + + if (xec == 0x0) { + u8 r4 = R4(ec); + + if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) + goto wrong_mc3_mce; + + pr_cont(" during %s.\n", R4_MSG(ec)); + } else + goto wrong_mc3_mce; + + return; + + wrong_mc3_mce: + pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n"); +} + +static void decode_mc4_mce(struct mce *m) +{ + unsigned int fam = x86_family(m->cpuid); + int node_id = amd_get_nb_id(m->extcpu); + u16 ec = EC(m->status); + u8 xec = XEC(m->status, 0x1f); + u8 offset = 0; + + pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id); + + switch (xec) { + case 0x0 ... 0xe: + + /* special handling for DRAM ECCs */ + if (xec == 0x0 || xec == 0x8) { + /* no ECCs on F11h */ + if (fam == 0x11) + goto wrong_mc4_mce; + + pr_cont("%s.\n", mc4_mce_desc[xec]); + + if (decode_dram_ecc) + decode_dram_ecc(node_id, m); + return; + } + break; + + case 0xf: + if (TLB_ERROR(ec)) + pr_cont("GART Table Walk data error.\n"); + else if (BUS_ERROR(ec)) + pr_cont("DMA Exclusion Vector Table Walk error.\n"); + else + goto wrong_mc4_mce; + return; + + case 0x19: + if (fam == 0x15 || fam == 0x16) + pr_cont("Compute Unit Data Error.\n"); + else + goto wrong_mc4_mce; + return; + + case 0x1c ... 0x1f: + offset = 13; + break; + + default: + goto wrong_mc4_mce; + } + + pr_cont("%s.\n", mc4_mce_desc[xec - offset]); + return; + + wrong_mc4_mce: + pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n"); +} + +static void decode_mc5_mce(struct mce *m) +{ + unsigned int fam = x86_family(m->cpuid); + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); + + if (fam == 0xf || fam == 0x11) + goto wrong_mc5_mce; + + pr_emerg(HW_ERR "MC5 Error: "); + + if (INT_ERROR(ec)) { + if (xec <= 0x1f) { + pr_cont("Hardware Assert.\n"); + return; + } else + goto wrong_mc5_mce; + } + + if (xec == 0x0 || xec == 0xc) + pr_cont("%s.\n", mc5_mce_desc[xec]); + else if (xec <= 0xd) + pr_cont("%s parity error.\n", mc5_mce_desc[xec]); + else + goto wrong_mc5_mce; + + return; + + wrong_mc5_mce: + pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n"); +} + +static void decode_mc6_mce(struct mce *m) +{ + u8 xec = XEC(m->status, xec_mask); + + pr_emerg(HW_ERR "MC6 Error: "); + + if (xec > 0x5) + goto wrong_mc6_mce; + + pr_cont("%s parity error.\n", mc6_mce_desc[xec]); + return; + + wrong_mc6_mce: + pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); +} + +/* Decode errors according to Scalable MCA specification */ +static void decode_smca_error(struct mce *m) +{ + struct smca_hwid *hwid; + enum smca_bank_types bank_type; + const char *ip_name; + u8 xec = XEC(m->status, xec_mask); + + if (m->bank >= ARRAY_SIZE(smca_banks)) + return; + + hwid = smca_banks[m->bank].hwid; + if (!hwid) + return; + + bank_type = hwid->bank_type; + + if (bank_type == SMCA_RESERVED) { + pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank); + return; + } + + ip_name = smca_get_long_name(bank_type); + + pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec); + + /* Only print the decode of valid error codes */ + if (xec < smca_mce_descs[bank_type].num_descs) + pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]); + + if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc) + decode_dram_ecc(topology_die_id(m->extcpu), m); +} + +static inline void amd_decode_err_code(u16 ec) +{ + if (INT_ERROR(ec)) { + pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec)); + return; + } + + pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec)); + + if (BUS_ERROR(ec)) + pr_cont(", mem/io: %s", II_MSG(ec)); + else + pr_cont(", tx: %s", TT_MSG(ec)); + + if (MEM_ERROR(ec) || BUS_ERROR(ec)) { + pr_cont(", mem-tx: %s", R4_MSG(ec)); + + if (BUS_ERROR(ec)) + pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec)); + } + + pr_cont("\n"); +} + +static const char *decode_error_status(struct mce *m) +{ + if (m->status & MCI_STATUS_UC) { + if (m->status & MCI_STATUS_PCC) + return "System Fatal error."; + if (m->mcgstatus & MCG_STATUS_RIPV) + return "Uncorrected, software restartable error."; + return "Uncorrected, software containable error."; + } + + if (m->status & MCI_STATUS_DEFERRED) + return "Deferred error, no action required."; + + return "Corrected error, no action required."; +} + +static int +amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) +{ + struct mce *m = (struct mce *)data; + unsigned int fam = x86_family(m->cpuid); + int ecc; + + if (m->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; + + pr_emerg(HW_ERR "%s\n", decode_error_status(m)); + + pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", + m->extcpu, + fam, x86_model(m->cpuid), x86_stepping(m->cpuid), + m->bank, + ((m->status & MCI_STATUS_OVER) ? "Over" : "-"), + ((m->status & MCI_STATUS_UC) ? "UE" : + (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"), + ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), + ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"), + ((m->status & MCI_STATUS_PCC) ? "PCC" : "-")); + + if (boot_cpu_has(X86_FEATURE_SMCA)) { + u32 low, high; + u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + + if (!rdmsr_safe(addr, &low, &high) && + (low & MCI_CONFIG_MCAX)) + pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); + + pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); + } + + /* do the two bits[14:13] together */ + ecc = (m->status >> 45) & 0x3; + if (ecc) + pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); + + if (fam >= 0x15) { + pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-")); + + /* F15h, bank4, bit 43 is part of McaStatSubCache. */ + if (fam != 0x15 || m->bank != 4) + pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-")); + } + + if (fam >= 0x17) + pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-")); + + pr_cont("]: 0x%016llx\n", m->status); + + if (m->status & MCI_STATUS_ADDRV) + pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr); + + if (m->ppin) + pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin); + + if (boot_cpu_has(X86_FEATURE_SMCA)) { + pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid); + + if (m->status & MCI_STATUS_SYNDV) + pr_cont(", Syndrome: 0x%016llx", m->synd); + + pr_cont("\n"); + + decode_smca_error(m); + goto err_code; + } + + if (m->tsc) + pr_emerg(HW_ERR "TSC: %llu\n", m->tsc); + + /* Doesn't matter which member to test. */ + if (!fam_ops.mc0_mce) + goto err_code; + + switch (m->bank) { + case 0: + decode_mc0_mce(m); + break; + + case 1: + decode_mc1_mce(m); + break; + + case 2: + decode_mc2_mce(m); + break; + + case 3: + decode_mc3_mce(m); + break; + + case 4: + decode_mc4_mce(m); + break; + + case 5: + decode_mc5_mce(m); + break; + + case 6: + decode_mc6_mce(m); + break; + + default: + break; + } + + err_code: + amd_decode_err_code(m->status & 0xffff); + + m->kflags |= MCE_HANDLED_EDAC; + return NOTIFY_OK; +} + +static struct notifier_block amd_mce_dec_nb = { + .notifier_call = amd_decode_mce, + .priority = MCE_PRIO_EDAC, +}; + +static int __init mce_amd_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor != X86_VENDOR_AMD && + c->x86_vendor != X86_VENDOR_HYGON) + return -ENODEV; + + if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return -ENODEV; + + if (boot_cpu_has(X86_FEATURE_SMCA)) { + xec_mask = 0x3f; + goto out; + } + + switch (c->x86) { + case 0xf: + fam_ops.mc0_mce = k8_mc0_mce; + fam_ops.mc1_mce = k8_mc1_mce; + fam_ops.mc2_mce = k8_mc2_mce; + break; + + case 0x10: + fam_ops.mc0_mce = f10h_mc0_mce; + fam_ops.mc1_mce = k8_mc1_mce; + fam_ops.mc2_mce = k8_mc2_mce; + break; + + case 0x11: + fam_ops.mc0_mce = k8_mc0_mce; + fam_ops.mc1_mce = k8_mc1_mce; + fam_ops.mc2_mce = k8_mc2_mce; + break; + + case 0x12: + fam_ops.mc0_mce = f12h_mc0_mce; + fam_ops.mc1_mce = k8_mc1_mce; + fam_ops.mc2_mce = k8_mc2_mce; + break; + + case 0x14: + fam_ops.mc0_mce = cat_mc0_mce; + fam_ops.mc1_mce = cat_mc1_mce; + fam_ops.mc2_mce = k8_mc2_mce; + break; + + case 0x15: + xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f; + + fam_ops.mc0_mce = f15h_mc0_mce; + fam_ops.mc1_mce = f15h_mc1_mce; + fam_ops.mc2_mce = f15h_mc2_mce; + break; + + case 0x16: + xec_mask = 0x1f; + fam_ops.mc0_mce = cat_mc0_mce; + fam_ops.mc1_mce = cat_mc1_mce; + fam_ops.mc2_mce = f16h_mc2_mce; + break; + + case 0x17: + case 0x18: + pr_warn_once("Decoding supported only on Scalable MCA processors.\n"); + return -EINVAL; + + default: + printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); + return -EINVAL; + } + +out: + pr_info("MCE: In-kernel MCE decoding enabled.\n"); + + mce_register_decode_chain(&amd_mce_dec_nb); + + return 0; +} +early_initcall(mce_amd_init); + +#ifdef MODULE +static void __exit mce_amd_exit(void) +{ + mce_unregister_decode_chain(&amd_mce_dec_nb); +} + +MODULE_DESCRIPTION("AMD MCE decoder"); +MODULE_ALIAS("edac-mce-amd"); +MODULE_LICENSE("GPL"); +module_exit(mce_amd_exit); +#endif |