diff options
Diffstat (limited to 'drivers/ras/amd/atl/umc.c')
-rw-r--r-- | drivers/ras/amd/atl/umc.c | 409 |
1 files changed, 409 insertions, 0 deletions
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c new file mode 100644 index 0000000000..a1b4accf7b --- /dev/null +++ b/drivers/ras/amd/atl/umc.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Address Translation Library + * + * umc.c : Unified Memory Controller (UMC) topology helpers + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam <Yazen.Ghannam@amd.com> + */ + +#include "internal.h" + +/* + * MI300 has a fixed, model-specific mapping between a UMC instance and + * its related Data Fabric Coherent Station instance. + * + * The MCA_IPID_UMC[InstanceId] field holds a unique identifier for the + * UMC instance within a Node. Use this to find the appropriate Coherent + * Station ID. + * + * Redundant bits were removed from the map below. + */ +static const u16 umc_coh_st_map[32] = { + 0x393, 0x293, 0x193, 0x093, + 0x392, 0x292, 0x192, 0x092, + 0x391, 0x291, 0x191, 0x091, + 0x390, 0x290, 0x190, 0x090, + 0x793, 0x693, 0x593, 0x493, + 0x792, 0x692, 0x592, 0x492, + 0x791, 0x691, 0x591, 0x491, + 0x790, 0x690, 0x590, 0x490, +}; + +#define UMC_ID_MI300 GENMASK(23, 12) +static u8 get_coh_st_inst_id_mi300(struct atl_err *err) +{ + u16 umc_id = FIELD_GET(UMC_ID_MI300, err->ipid); + u8 i; + + for (i = 0; i < ARRAY_SIZE(umc_coh_st_map); i++) { + if (umc_id == umc_coh_st_map[i]) + break; + } + + WARN_ON_ONCE(i >= ARRAY_SIZE(umc_coh_st_map)); + + return i; +} + +/* XOR the bits in @val. */ +static u16 bitwise_xor_bits(u16 val) +{ + u16 tmp = 0; + u8 i; + + for (i = 0; i < 16; i++) + tmp ^= (val >> i) & 0x1; + + return tmp; +} + +struct xor_bits { + bool xor_enable; + u16 col_xor; + u32 row_xor; +}; + +#define NUM_BANK_BITS 4 +#define NUM_COL_BITS 5 +#define NUM_SID_BITS 2 + +static struct { + /* UMC::CH::AddrHashBank */ + struct xor_bits bank[NUM_BANK_BITS]; + + /* UMC::CH::AddrHashPC */ + struct xor_bits pc; + + /* UMC::CH::AddrHashPC2 */ + u8 bank_xor; +} addr_hash; + +static struct { + u8 bank[NUM_BANK_BITS]; + u8 col[NUM_COL_BITS]; + u8 sid[NUM_SID_BITS]; + u8 num_row_lo; + u8 num_row_hi; + u8 row_lo; + u8 row_hi; + u8 pc; +} bit_shifts; + +#define MI300_UMC_CH_BASE 0x90000 +#define MI300_ADDR_CFG (MI300_UMC_CH_BASE + 0x30) +#define MI300_ADDR_SEL (MI300_UMC_CH_BASE + 0x40) +#define MI300_COL_SEL_LO (MI300_UMC_CH_BASE + 0x50) +#define MI300_ADDR_SEL_2 (MI300_UMC_CH_BASE + 0xA4) +#define MI300_ADDR_HASH_BANK0 (MI300_UMC_CH_BASE + 0xC8) +#define MI300_ADDR_HASH_PC (MI300_UMC_CH_BASE + 0xE0) +#define MI300_ADDR_HASH_PC2 (MI300_UMC_CH_BASE + 0xE4) + +#define ADDR_HASH_XOR_EN BIT(0) +#define ADDR_HASH_COL_XOR GENMASK(13, 1) +#define ADDR_HASH_ROW_XOR GENMASK(31, 14) +#define ADDR_HASH_BANK_XOR GENMASK(5, 0) + +#define ADDR_CFG_NUM_ROW_LO GENMASK(11, 8) +#define ADDR_CFG_NUM_ROW_HI GENMASK(15, 12) + +#define ADDR_SEL_BANK0 GENMASK(3, 0) +#define ADDR_SEL_BANK1 GENMASK(7, 4) +#define ADDR_SEL_BANK2 GENMASK(11, 8) +#define ADDR_SEL_BANK3 GENMASK(15, 12) +#define ADDR_SEL_BANK4 GENMASK(20, 16) +#define ADDR_SEL_ROW_LO GENMASK(27, 24) +#define ADDR_SEL_ROW_HI GENMASK(31, 28) + +#define COL_SEL_LO_COL0 GENMASK(3, 0) +#define COL_SEL_LO_COL1 GENMASK(7, 4) +#define COL_SEL_LO_COL2 GENMASK(11, 8) +#define COL_SEL_LO_COL3 GENMASK(15, 12) +#define COL_SEL_LO_COL4 GENMASK(19, 16) + +#define ADDR_SEL_2_BANK5 GENMASK(4, 0) +#define ADDR_SEL_2_CHAN GENMASK(15, 12) + +/* + * Read UMC::CH::AddrHash{Bank,PC,PC2} registers to get XOR bits used + * for hashing. + * + * Also, read UMC::CH::Addr{Cfg,Sel,Sel2} and UMC::CH:ColSelLo registers to + * get the values needed to reconstruct the normalized address. Apply additional + * offsets to the raw register values, as needed. + * + * Do this during module init, since the values will not change during run time. + * + * These registers are instantiated for each UMC across each AMD Node. + * However, they should be identically programmed due to the fixed hardware + * design of MI300 systems. So read the values from Node 0 UMC 0 and keep a + * single global structure for simplicity. + */ +int get_umc_info_mi300(void) +{ + u32 temp; + int ret; + u8 i; + + for (i = 0; i < NUM_BANK_BITS; i++) { + ret = amd_smn_read(0, MI300_ADDR_HASH_BANK0 + (i * 4), &temp); + if (ret) + return ret; + + addr_hash.bank[i].xor_enable = FIELD_GET(ADDR_HASH_XOR_EN, temp); + addr_hash.bank[i].col_xor = FIELD_GET(ADDR_HASH_COL_XOR, temp); + addr_hash.bank[i].row_xor = FIELD_GET(ADDR_HASH_ROW_XOR, temp); + } + + ret = amd_smn_read(0, MI300_ADDR_HASH_PC, &temp); + if (ret) + return ret; + + addr_hash.pc.xor_enable = FIELD_GET(ADDR_HASH_XOR_EN, temp); + addr_hash.pc.col_xor = FIELD_GET(ADDR_HASH_COL_XOR, temp); + addr_hash.pc.row_xor = FIELD_GET(ADDR_HASH_ROW_XOR, temp); + + ret = amd_smn_read(0, MI300_ADDR_HASH_PC2, &temp); + if (ret) + return ret; + + addr_hash.bank_xor = FIELD_GET(ADDR_HASH_BANK_XOR, temp); + + ret = amd_smn_read(0, MI300_ADDR_CFG, &temp); + if (ret) + return ret; + + bit_shifts.num_row_hi = FIELD_GET(ADDR_CFG_NUM_ROW_HI, temp); + bit_shifts.num_row_lo = 10 + FIELD_GET(ADDR_CFG_NUM_ROW_LO, temp); + + ret = amd_smn_read(0, MI300_ADDR_SEL, &temp); + if (ret) + return ret; + + bit_shifts.bank[0] = 5 + FIELD_GET(ADDR_SEL_BANK0, temp); + bit_shifts.bank[1] = 5 + FIELD_GET(ADDR_SEL_BANK1, temp); + bit_shifts.bank[2] = 5 + FIELD_GET(ADDR_SEL_BANK2, temp); + bit_shifts.bank[3] = 5 + FIELD_GET(ADDR_SEL_BANK3, temp); + /* Use BankBit4 for the SID0 position. */ + bit_shifts.sid[0] = 5 + FIELD_GET(ADDR_SEL_BANK4, temp); + bit_shifts.row_lo = 12 + FIELD_GET(ADDR_SEL_ROW_LO, temp); + bit_shifts.row_hi = 24 + FIELD_GET(ADDR_SEL_ROW_HI, temp); + + ret = amd_smn_read(0, MI300_COL_SEL_LO, &temp); + if (ret) + return ret; + + bit_shifts.col[0] = 2 + FIELD_GET(COL_SEL_LO_COL0, temp); + bit_shifts.col[1] = 2 + FIELD_GET(COL_SEL_LO_COL1, temp); + bit_shifts.col[2] = 2 + FIELD_GET(COL_SEL_LO_COL2, temp); + bit_shifts.col[3] = 2 + FIELD_GET(COL_SEL_LO_COL3, temp); + bit_shifts.col[4] = 2 + FIELD_GET(COL_SEL_LO_COL4, temp); + + ret = amd_smn_read(0, MI300_ADDR_SEL_2, &temp); + if (ret) + return ret; + + /* Use BankBit5 for the SID1 position. */ + bit_shifts.sid[1] = 5 + FIELD_GET(ADDR_SEL_2_BANK5, temp); + bit_shifts.pc = 5 + FIELD_GET(ADDR_SEL_2_CHAN, temp); + + return 0; +} + +/* + * MI300 systems report a DRAM address in MCA_ADDR for DRAM ECC errors. This must + * be converted to the intermediate normalized address (NA) before translating to a + * system physical address. + * + * The DRAM address includes bank, row, and column. Also included are bits for + * pseudochannel (PC) and stack ID (SID). + * + * Abbreviations: (S)tack ID, (P)seudochannel, (R)ow, (B)ank, (C)olumn, (Z)ero + * + * The MCA address format is as follows: + * MCA_ADDR[27:0] = {S[1:0], P[0], R[14:0], B[3:0], C[4:0], Z[0]} + * + * Additionally, the PC and Bank bits may be hashed. This must be accounted for before + * reconstructing the normalized address. + */ +#define MI300_UMC_MCA_COL GENMASK(5, 1) +#define MI300_UMC_MCA_BANK GENMASK(9, 6) +#define MI300_UMC_MCA_ROW GENMASK(24, 10) +#define MI300_UMC_MCA_PC BIT(25) +#define MI300_UMC_MCA_SID GENMASK(27, 26) + +static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr) +{ + u16 i, col, row, bank, pc, sid; + u32 temp; + + col = FIELD_GET(MI300_UMC_MCA_COL, addr); + bank = FIELD_GET(MI300_UMC_MCA_BANK, addr); + row = FIELD_GET(MI300_UMC_MCA_ROW, addr); + pc = FIELD_GET(MI300_UMC_MCA_PC, addr); + sid = FIELD_GET(MI300_UMC_MCA_SID, addr); + + /* Calculate hash for each Bank bit. */ + for (i = 0; i < NUM_BANK_BITS; i++) { + if (!addr_hash.bank[i].xor_enable) + continue; + + temp = bitwise_xor_bits(col & addr_hash.bank[i].col_xor); + temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor); + bank ^= temp << i; + } + + /* Calculate hash for PC bit. */ + if (addr_hash.pc.xor_enable) { + temp = bitwise_xor_bits(col & addr_hash.pc.col_xor); + temp ^= bitwise_xor_bits(row & addr_hash.pc.row_xor); + /* Bits SID[1:0] act as Bank[5:4] for PC hash, so apply them here. */ + temp ^= bitwise_xor_bits((bank | sid << NUM_BANK_BITS) & addr_hash.bank_xor); + pc ^= temp; + } + + /* Reconstruct the normalized address starting with NA[4:0] = 0 */ + addr = 0; + + /* Column bits */ + for (i = 0; i < NUM_COL_BITS; i++) { + temp = (col >> i) & 0x1; + addr |= temp << bit_shifts.col[i]; + } + + /* Bank bits */ + for (i = 0; i < NUM_BANK_BITS; i++) { + temp = (bank >> i) & 0x1; + addr |= temp << bit_shifts.bank[i]; + } + + /* Row lo bits */ + for (i = 0; i < bit_shifts.num_row_lo; i++) { + temp = (row >> i) & 0x1; + addr |= temp << (i + bit_shifts.row_lo); + } + + /* Row hi bits */ + for (i = 0; i < bit_shifts.num_row_hi; i++) { + temp = (row >> (i + bit_shifts.num_row_lo)) & 0x1; + addr |= temp << (i + bit_shifts.row_hi); + } + + /* PC bit */ + addr |= pc << bit_shifts.pc; + + /* SID bits */ + for (i = 0; i < NUM_SID_BITS; i++) { + temp = (sid >> i) & 0x1; + addr |= temp << bit_shifts.sid[i]; + } + + pr_debug("Addr=0x%016lx", addr); + pr_debug("Bank=%u Row=%u Column=%u PC=%u SID=%u", bank, row, col, pc, sid); + + return addr; +} + +/* + * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire + * all memory within that DRAM row. This applies to the memory with a DRAM + * bank. + * + * To find the memory addresses, loop through permutations of the DRAM column + * bits and find the System Physical address of each. The column bits are used + * to calculate the intermediate Normalized address, so all permutations should + * be checked. + * + * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats. + */ +#define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL)) +static void retire_row_mi300(struct atl_err *a_err) +{ + unsigned long addr; + struct page *p; + u8 col; + + for (col = 0; col < MI300_NUM_COL; col++) { + a_err->addr &= ~MI300_UMC_MCA_COL; + a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col); + + addr = amd_convert_umc_mca_addr_to_sys_addr(a_err); + if (IS_ERR_VALUE(addr)) + continue; + + addr = PHYS_PFN(addr); + + /* + * Skip invalid or already poisoned pages to avoid unnecessary + * error messages from memory_failure(). + */ + p = pfn_to_online_page(addr); + if (!p) + continue; + + if (PageHWPoison(p)) + continue; + + memory_failure(addr, 0); + } +} + +void amd_retire_dram_row(struct atl_err *a_err) +{ + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + return retire_row_mi300(a_err); +} +EXPORT_SYMBOL_GPL(amd_retire_dram_row); + +static unsigned long get_addr(unsigned long addr) +{ + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + return convert_dram_to_norm_addr_mi300(addr); + + return addr; +} + +#define MCA_IPID_INST_ID_HI GENMASK_ULL(47, 44) +static u8 get_die_id(struct atl_err *err) +{ + /* + * AMD Node ID is provided in MCA_IPID[InstanceIdHi], and this + * needs to be divided by 4 to get the internal Die ID. + */ + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) { + u8 node_id = FIELD_GET(MCA_IPID_INST_ID_HI, err->ipid); + + return node_id >> 2; + } + + /* + * For CPUs, this is the AMD Node ID modulo the number + * of AMD Nodes per socket. + */ + return topology_amd_node_id(err->cpu) % topology_amd_nodes_per_pkg(); +} + +#define UMC_CHANNEL_NUM GENMASK(31, 20) +static u8 get_coh_st_inst_id(struct atl_err *err) +{ + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + return get_coh_st_inst_id_mi300(err); + + return FIELD_GET(UMC_CHANNEL_NUM, err->ipid); +} + +unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err) +{ + u8 socket_id = topology_physical_package_id(err->cpu); + u8 coh_st_inst_id = get_coh_st_inst_id(err); + unsigned long addr = get_addr(err->addr); + u8 die_id = get_die_id(err); + + pr_debug("socket_id=0x%x die_id=0x%x coh_st_inst_id=0x%x addr=0x%016lx", + socket_id, die_id, coh_st_inst_id, addr); + + return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr); +} |