1582 lines
52 KiB
C
1582 lines
52 KiB
C
// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
|
||
/*
|
||
* Deal with Hypervisor Maintenance Interrupts
|
||
*
|
||
* Copyright 2013-2019 IBM Corp.
|
||
*/
|
||
|
||
#define pr_fmt(fmt) "HMI: " fmt
|
||
|
||
#include <skiboot.h>
|
||
#include <opal.h>
|
||
#include <opal-msg.h>
|
||
#include <processor.h>
|
||
#include <chiptod.h>
|
||
#include <xscom.h>
|
||
#include <xscom-p8-regs.h>
|
||
#include <xscom-p9-regs.h>
|
||
#include <xscom-p10-regs.h>
|
||
#include <pci.h>
|
||
#include <cpu.h>
|
||
#include <chip.h>
|
||
#include <pau-regs.h>
|
||
#include <npu-regs.h>
|
||
#include <npu2-regs.h>
|
||
#include <pau.h>
|
||
#include <npu2.h>
|
||
#include <npu.h>
|
||
#include <capp.h>
|
||
#include <nvram.h>
|
||
#include <cpu.h>
|
||
|
||
/*
|
||
* P9 HMER register layout:
|
||
* +===+==========+============================+========+===================+
|
||
* |Bit|Name |Description |PowerKVM|Action |
|
||
* | | | |HMI | |
|
||
* | | | |enabled | |
|
||
* | | | |for this| |
|
||
* | | | |bit ? | |
|
||
* +===+==========+============================+========+===================+
|
||
* |0 |malfunctio|A processor core in the |Yes |Raise attn from |
|
||
 * |   |n_alert   |system has checkstopped     |        |sapphire resulting |
|
||
* | | |(failed recovery) and has | |xstop |
|
||
* | | |requested a CP Sparing | | |
|
||
* | | |to occur. This is | | |
|
||
* | | |broadcasted to every | | |
|
||
* | | |processor in the system | | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |1 |Reserved |reserved |n/a | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |2 |proc_recv_|Processor recovery occurred |Yes |Log message and |
|
||
* | |done |error-bit in fir not masked | |continue working. |
|
||
* | | |(see bit 11) | | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |3 |proc_recv_|Processor went through |Yes |Log message and |
|
||
* | |error_mask|recovery for an error which | |continue working. |
|
||
* | |ed |is actually masked for | | |
|
||
* | | |reporting | | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |4 | |Timer facility experienced |Yes |Raise attn from |
|
||
* | |tfac_error|an error. | |sapphire resulting |
|
||
* | | |TB, DEC, HDEC, PURR or SPURR| |xstop |
|
||
* | | |may be corrupted (details in| | |
|
||
* | | |TFMR) | | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |5 | |TFMR SPR itself is |Yes |Raise attn from |
|
||
* | |tfmr_parit|corrupted. | |sapphire resulting |
|
||
* | |y_error |Entire timing facility may | |xstop |
|
||
* | | |be compromised. | | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
 * |6  |ha_overflo|UPS (Uninterrupted Power    |No      |N/A                |
|
||
* | |w_warning |System) Overflow indication | | |
|
||
* | | |indicating that the UPS | | |
|
||
* | | |DirtyAddrTable has | | |
|
||
* | | |reached a limit where it | | |
|
||
* | | |requires PHYP unload support| | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |7 |reserved |reserved |n/a |n/a |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |8 |xscom_fail|An XSCOM operation caused by|No |We handle it by |
|
||
* | | |a cache inhibited load/store| |manually reading |
|
||
* | | |from this thread failed. A | |HMER register. |
|
||
* | | |trap register is | | |
|
||
* | | |available. | | |
|
||
* | | | | | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |9 |xscom_done|An XSCOM operation caused by|No |We handle it by |
|
||
* | | |a cache inhibited load/store| |manually reading |
|
||
* | | |from this thread completed. | |HMER register. |
|
||
* | | |If hypervisor | | |
|
||
* | | |intends to use this bit, it | | |
|
||
* | | |is responsible for clearing | | |
|
||
* | | |it before performing the | | |
|
||
* | | |xscom operation. | | |
|
||
* | | |NOTE: this bit should always| | |
|
||
* | | |be masked in HMEER | | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |10 |reserved |reserved |n/a |n/a |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |11 |proc_recv_|Processor recovery occurred |y |Log message and |
|
||
* | |again |again before bit2 or bit3 | |continue working. |
|
||
* | | |was cleared | | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |12-|reserved |was temperature sensor |n/a |n/a |
|
||
* |15 | |passed the critical point on| | |
|
||
* | | |the way up | | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |16 | |SCOM has set a reserved FIR |No |n/a |
|
||
* | |scom_fir_h|bit to cause recovery | | |
|
||
* | |m | | | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |17 |trig_fir_h|Debug trigger has set a |No |n/a |
|
||
* | |mi |reserved FIR bit to cause | | |
|
||
* | | |recovery | | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |18 |reserved |reserved |n/a |n/a |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |19 |reserved |reserved |n/a |n/a |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |20 |hyp_resour|A hypervisor resource error |y |Raise attn from |
|
||
* | |ce_err |occurred: data parity error | |sapphire resulting |
|
||
* | | |on, SPRC0:3; SPR_Modereg or | |xstop. |
|
||
* | | |HMEER. | | |
|
||
* | | |Note: this bit will cause an| | |
|
||
* | | |check_stop when (HV=1, PR=0 | | |
|
||
* | | |and EE=0) | | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |21-| |if bit 8 is active, the |No |We handle it by |
|
||
* |23 |xscom_stat|reason will be detailed in | |Manually reading |
|
||
* | |us |these bits. see chapter 11.1| |HMER register. |
|
||
* | | |This bits are information | | |
|
||
* | | |only and always masked | | |
|
||
* | | |(mask = '0') | | |
|
||
* | | |If hypervisor intends to use| | |
|
||
* | | |this bit, it is responsible | | |
|
||
* | | |for clearing it before | | |
|
||
* | | |performing the xscom | | |
|
||
* | | |operation. | | |
|
||
* |---+----------+----------------------------+--------+-------------------|
|
||
* |24-|Not |Not implemented |n/a |n/a |
|
||
* |63 |implemente| | | |
|
||
* | |d | | | |
|
||
* +-- +----------+----------------------------+--------+-------------------+
|
||
*
|
||
* Above HMER bits can be enabled/disabled by modifying
|
||
* SPR_HMEER_HMI_ENABLE_MASK #define in include/processor.h
|
||
* If you modify support for any of the bits listed above, please make sure
|
||
 * you change the above table to reflect that.
|
||
*
|
||
* NOTE: Per Dave Larson, never enable 8,9,21-23
|
||
*/
|
||
|
||
/*
|
||
* P10 HMER register layout:
|
||
* Bit Name Description
|
||
* 0 malfunction_alert A processor core in the system has checkstopped
|
||
* (failed recovery). This is broadcasted to every
|
||
* processor in the system
|
||
*
|
||
* 1 reserved reserved
|
||
*
|
||
* 2 proc_rcvy_done Processor recovery occurred error-bit in fir not
|
||
* masked (see bit 11)
|
||
*
|
||
* 3 reserved reserved
|
||
*
|
||
* 4 tfac_error Timer facility experienced an error. TB, DEC,
|
||
* HDEC, PURR or SPURR may be corrupted (details in
|
||
* TFMR)
|
||
*
|
||
* 5 tfx_error Error occurred on transfer from tfac shadow to
|
||
* core
|
||
*
|
||
* 6 spurr_scale_limit Nominal frequency exceeded 399 percent
|
||
*
|
||
* 7 reserved reserved
|
||
*
|
||
* 8 xscom_fail An XSCOM operation caused by a cache inhibited
|
||
* load/store from this thread failed. A trap
|
||
* register is available.
|
||
*
|
||
* 9 xscom_done An XSCOM operation caused by a cache inhibited
|
||
* load/store from this thread completed. If
|
||
* hypervisor intends to use this bit, it is
|
||
* responsible for clearing it before performing the
|
||
* xscom operation. NOTE: this bit should always be
|
||
* masked in HMEER
|
||
*
|
||
* 10 reserved reserved
|
||
*
|
||
* 11 proc_rcvy_again Processor recovery occurred again before bit 2
|
||
* was cleared
|
||
*
|
||
* 12-15 reserved reserved
|
||
*
|
||
* 16 scom_fir_hmi An error inject to PC FIR has occurred to set HMI.
|
||
* This error inject can also set FIR(61) to cause
|
||
* recovery.
|
||
*
|
||
* 17 reserved reserved
|
||
*
|
||
* 18 trig_fir_hmi Debug trigger has occurred to set HMI. This
|
||
* trigger can also set FIR(60) to cause recovery
|
||
*
|
||
* 19-20 reserved reserved
|
||
*
|
||
* 21-23 xscom_status If bit 8 is active, the reason will be detailed in
|
||
* these bits. These bits are information only and
|
||
* always masked (mask = ‘0’) If hypervisor intends
|
||
* to use this field, it is responsible for clearing
|
||
* it before performing the xscom operation.
|
||
*
|
||
* 24:63 Not implemented Not implemented.
|
||
*
|
||
* P10 HMEER enabled bits:
|
||
* Name Action
|
||
* malfunction_alert Decode and log FIR bits.
|
||
* proc_rcvy_done Log and continue.
|
||
* tfac_error Log and attempt to recover time facilities.
|
||
* tfx_error Log and attempt to recover time facilities.
|
||
* spurr_scale_limit Log and continue. XXX?
|
||
* proc_rcvy_again Log and continue.
|
||
*/
|
||
|
||
/* Used for tracking cpu threads inside hmi handling. */
#define HMI_STATE_CLEANUP_DONE	0x100
#define CORE_THREAD_MASK	0x0ff
/* Mask covering the t_count threads that belong to subcore s_id. */
#define SUBCORE_THREAD_MASK(s_id, t_count) \
	((((1UL) << (t_count)) - 1) << ((s_id) * (t_count)))
/* Mask selecting a single thread t_id within a core. */
#define SINGLE_THREAD_MASK(t_id)	((1UL) << (t_id))

/*
 * Number of iterations for the various timeouts. We can't use the timebase
 * as it might be broken. We measured experimentally that 40 millions loops
 * of cpu_relax() gives us more than 1s. The margin is comfortable enough.
 */
#define TIMEOUT_LOOPS		40000000

/* TFMR other errors. (other than bit 26 and 45) */
#define SPR_TFMR_OTHER_ERRORS	\
	(SPR_TFMR_TBST_CORRUPT | SPR_TFMR_TB_MISSING_SYNC |	\
	 SPR_TFMR_TB_MISSING_STEP | SPR_TFMR_FW_CONTROL_ERR |	\
	 SPR_TFMR_PURR_PARITY_ERR | SPR_TFMR_SPURR_PARITY_ERR |	\
	 SPR_TFMR_DEC_PARITY_ERR | SPR_TFMR_TFMR_CORRUPT |	\
	 SPR_TFMR_CHIP_TOD_INTERRUPT)

/* TFMR "all core" errors (sent to all threads) */
#define SPR_TFMR_CORE_ERRORS	\
	(SPR_TFMR_TBST_CORRUPT | SPR_TFMR_TB_MISSING_SYNC |	\
	 SPR_TFMR_TB_MISSING_STEP | SPR_TFMR_FW_CONTROL_ERR |	\
	 SPR_TFMR_TFMR_CORRUPT | SPR_TFMR_TB_RESIDUE_ERR |	\
	 SPR_TFMR_HDEC_PARITY_ERROR | SPR_TFMR_TFAC_XFER_ERROR)

/* TFMR "thread" errors */
#define SPR_TFMR_THREAD_ERRORS	\
	(SPR_TFMR_PURR_PARITY_ERR | SPR_TFMR_SPURR_PARITY_ERR |	\
	 SPR_TFMR_DEC_PARITY_ERR)
|
||
|
||
/*
 * Starting from p9, core inits are setup to escalate all core
 * local checkstop to system checkstop. Review this list when that changes.
 */
/* Maps CORE FIR bits to the OPAL core-checkstop reason reported to the host. */
static const struct core_xstop_bit_info {
	uint8_t bit;		/* CORE FIR bit number */
	enum OpalHMI_CoreXstopReason reason;
} xstop_bits[] = {
	{ 3, CORE_CHECKSTOP_IFU_REGFILE },
	{ 5, CORE_CHECKSTOP_IFU_LOGIC },
	{ 8, CORE_CHECKSTOP_PC_DURING_RECOV },
	{ 10, CORE_CHECKSTOP_ISU_REGFILE },
	{ 12, CORE_CHECKSTOP_ISU_LOGIC },
	{ 21, CORE_CHECKSTOP_FXU_LOGIC },
	{ 25, CORE_CHECKSTOP_VSU_LOGIC },
	{ 26, CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE },
	{ 32, CORE_CHECKSTOP_LSU_REGFILE },
	{ 36, CORE_CHECKSTOP_PC_FWD_PROGRESS },
	{ 38, CORE_CHECKSTOP_LSU_LOGIC },
	{ 45, CORE_CHECKSTOP_PC_LOGIC },
	{ 48, CORE_CHECKSTOP_PC_HYP_RESOURCE },
	{ 52, CORE_CHECKSTOP_PC_HANG_RECOV_FAILED },
	{ 54, CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED },
	{ 63, CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ },
};
|
||
|
||
/* Human-readable description of a single CORE FIR bit, used for logging. */
struct core_fir_bit_info {
	uint8_t bit;		/* CORE FIR bit number */
	const char *reason;
};
|
||
|
||
/* P9 CORE FIR bits that indicate recoverable (non-checkstop) errors. */
static const struct core_fir_bit_info p9_recoverable_bits[] = {
	{ 0, "IFU - SRAM (ICACHE parity, etc)" },
	{ 2, "IFU - RegFile" },
	{ 4, "IFU - Logic" },
	{ 9, "ISU - RegFile" },
	{ 11, "ISU - Logic" },
	{ 13, "ISU - Recoverable due to not in MT window" },
	{ 24, "VSU - Logic" },
	{ 27, "VSU - DFU logic" },
	{ 29, "LSU - SRAM (DCACHE parity, etc)" },
	{ 31, "LSU - RegFile" },
	/* The following 3 bits may be set by SRAM errors. */
	{ 33, "LSU - TLB multi hit" },
	{ 34, "LSU - SLB multi hit" },
	{ 35, "LSU - ERAT multi hit" },
	{ 37, "LSU - Logic" },
	{ 39, "LSU - Recoverable due to not in MT window" },
	{ 43, "PC - Thread hang recovery" },
};
|
||
|
||
/* P10 CORE FIR bit decode table, logged verbatim when a bit is found set. */
static const struct core_fir_bit_info p10_core_fir_bits[] = {
	{ 0, "IFU - SRAM recoverable error (ICACHE parity error, etc.)" },
	{ 1, "PC - TC checkstop" },
	{ 2, "IFU - RegFile recoverable error" },
	{ 3, "IFU - RegFile core checkstop" },
	{ 4, "IFU - Logic recoverable error" },
	{ 5, "IFU - Logic core checkstop" },
	{ 7, "VSU - Inference accumulator recoverable error" },
	{ 8, "PC - Recovery core checkstop" },
	{ 9, "VSU - Slice Target File (STF) recoverable error" },
	{ 11, "ISU - Logic recoverable error" },
	{ 12, "ISU - Logic core checkstop" },
	{ 14, "ISU - Machine check received while ME=0 checkstop" },
	{ 15, "ISU - UE from L2" },
	{ 16, "ISU - Number of UEs from L2 above threshold" },
	{ 17, "ISU - UE on CI load" },
	{ 18, "MMU - TLB recoverable error" },
	{ 19, "MMU - SLB error" },
	{ 21, "MMU - CXT recoverable error" },
	{ 22, "MMU - Logic core checkstop" },
	{ 23, "MMU - MMU system checkstop" },
	{ 24, "VSU - Logic recoverable error" },
	{ 25, "VSU - Logic core checkstop" },
	{ 26, "PC - In maint mode and recovery in progress" },
	{ 28, "PC - PC system checkstop" },
	{ 29, "LSU - SRAM recoverable error (DCACHE parity error, etc.)" },
	{ 30, "LSU - Set deleted" },
	{ 31, "LSU - RegFile recoverable error" },
	{ 32, "LSU - RegFile core checkstop" },
	{ 33, "MMU - TLB multi hit error occurred" },
	{ 34, "MMU - SLB multi hit error occurred" },
	{ 35, "LSU - ERAT multi hit error occurred" },
	{ 36, "PC - Forward progress error" },
	{ 37, "LSU - Logic recoverable error" },
	{ 38, "LSU - Logic core checkstop" },
	{ 41, "LSU - System checkstop" },
	{ 43, "PC - Thread hang recoverable error" },
	{ 45, "PC - Logic core checkstop" },
	{ 47, "PC - TimeBase facility checkstop" },
	{ 52, "PC - Hang recovery failed core checkstop" },
	{ 53, "PC - Core internal hang detected" },
	{ 55, "PC - Nest hang detected" },
	{ 56, "PC - Other core chiplet recoverable error" },
	{ 57, "PC - Other core chiplet core checkstop" },
	{ 58, "PC - Other core chiplet system checkstop" },
	{ 59, "PC - SCOM satellite error detected" },
	{ 60, "PC - Debug trigger error inject" },
	{ 61, "PC - SCOM or firmware recoverable error inject" },
	{ 62, "PC - Firmware checkstop error inject" },
	{ 63, "PC - Firmware SPRC / SPRD checkstop" },
};
|
||
|
||
/* Maps NX DMA/engine FIR bits to OPAL nest-accelerator checkstop reasons. */
static const struct nx_xstop_bit_info {
	uint8_t bit;		/* NX FIR bit number */
	enum OpalHMI_NestAccelXstopReason reason;
} nx_dma_xstop_bits[] = {
	{ 1, NX_CHECKSTOP_SHM_INVAL_STATE_ERR },
	{ 15, NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1 },
	{ 16, NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2 },
	{ 20, NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR },
	{ 21, NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR },
	{ 22, NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR },
	{ 23, NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR },
	{ 24, NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR },
	{ 25, NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR },
	{ 26, NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR },
	{ 27, NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR },
	{ 31, NX_CHECKSTOP_DMA_CRB_UE },
	{ 32, NX_CHECKSTOP_DMA_CRB_SUE },
};
|
||
|
||
/* NX PowerBus-interface FIR bits that translate to checkstop reasons. */
static const struct nx_xstop_bit_info nx_pbi_xstop_bits[] = {
	{ 12, NX_CHECKSTOP_PBI_ISN_UE },
};
|
||
|
||
/* Serializes HMI handling across threads. */
static struct lock hmi_lock = LOCK_UNLOCKED;
/* Generation-specific SCOM addresses, resolved by setup_scom_addresses(). */
static uint32_t malf_alert_scom;
static uint32_t nx_status_reg;
static uint32_t nx_dma_engine_fir;
static uint32_t nx_pbi_fir;
|
||
|
||
static int setup_scom_addresses(void)
|
||
{
|
||
switch (proc_gen) {
|
||
case proc_gen_p8:
|
||
malf_alert_scom = P8_MALFUNC_ALERT;
|
||
nx_status_reg = P8_NX_STATUS_REG;
|
||
nx_dma_engine_fir = P8_NX_DMA_ENGINE_FIR;
|
||
nx_pbi_fir = P8_NX_PBI_FIR;
|
||
return 1;
|
||
case proc_gen_p9:
|
||
malf_alert_scom = P9_MALFUNC_ALERT;
|
||
nx_status_reg = P9_NX_STATUS_REG;
|
||
nx_dma_engine_fir = P9_NX_DMA_ENGINE_FIR;
|
||
nx_pbi_fir = P9_NX_PBI_FIR;
|
||
return 1;
|
||
case proc_gen_p10:
|
||
case proc_gen_p11:
|
||
malf_alert_scom = P10_MALFUNC_ALERT;
|
||
nx_status_reg = P10_NX_STATUS_REG;
|
||
nx_dma_engine_fir = P10_NX_DMA_ENGINE_FIR;
|
||
nx_pbi_fir = P10_NX_PBI_FIR;
|
||
return 1;
|
||
default:
|
||
prerror("%s: Unknown CPU type\n", __func__);
|
||
break;
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
/*
 * Queue an HMI event for delivery to the host via the OPAL message queue.
 *
 * @hmi_evt:   event to send; its disposition field is set from @recover.
 * @recover:   1 = recovered, 0 = not recovered, -1 = don't send at all.
 * @out_flags: OPAL_HMI_FLAGS_NEW_EVENT is OR'd in when an event is queued.
 *
 * Returns the result of _opal_queue_msg(), or 0 when recover == -1.
 */
static int queue_hmi_event(struct OpalHMIEvent *hmi_evt, int recover, uint64_t *out_flags)
{
	size_t size;

	/* Don't queue up event if recover == -1 */
	if (recover == -1)
		return 0;

	/* set disposition */
	if (recover == 1)
		hmi_evt->disposition = OpalHMI_DISPOSITION_RECOVERED;
	else if (recover == 0)
		hmi_evt->disposition = OpalHMI_DISPOSITION_NOT_RECOVERED;

	/*
	 * V2 of struct OpalHMIEvent is of (5 * 64 bits) size and well packed
	 * structure. Hence use uint64_t pointer to pass entire structure
	 * using 5 params in generic message format. Instead of hard coding
	 * num_params divide the struct size by 8 bytes to get exact
	 * num_params value.
	 */
	size = ALIGN_UP(sizeof(*hmi_evt), sizeof(u64));

	*out_flags |= OPAL_HMI_FLAGS_NEW_EVENT;

	/* queue up for delivery to host. */
	return _opal_queue_msg(OPAL_MSG_HMI_EVT, NULL, NULL,
			       size, hmi_evt);
}
|
||
|
||
/*
 * Read the CORE FIR of a given core via XSCOM, using the register
 * address appropriate for the current processor generation.
 *
 * Returns the xscom_read() result, or OPAL_HARDWARE for an unknown CPU.
 */
static int read_core_fir(uint32_t chip_id, uint32_t core_id, uint64_t *core_fir)
{
	uint64_t addr;

	switch (proc_gen) {
	case proc_gen_p8:
		addr = XSCOM_ADDR_P8_EX(core_id, P8_CORE_FIR);
		break;
	case proc_gen_p9:
		addr = XSCOM_ADDR_P9_EC(core_id, P9_CORE_FIR);
		break;
	case proc_gen_p10:
	case proc_gen_p11:
		addr = XSCOM_ADDR_P10_EC(core_id, P10_CORE_FIR);
		break;
	default:
		return OPAL_HARDWARE;
	}

	return xscom_read(chip_id, addr, core_fir);
}
|
||
|
||
/*
 * Read the CORE WOF (Whose-On-First) register of a given core via XSCOM.
 * WOF only exists on P9 and later; earlier CPUs get OPAL_HARDWARE.
 *
 * Returns the xscom_read() result, or OPAL_HARDWARE for an unknown CPU.
 */
static int read_core_wof(uint32_t chip_id, uint32_t core_id, uint64_t *core_wof)
{
	uint64_t addr;

	switch (proc_gen) {
	case proc_gen_p9:
		addr = XSCOM_ADDR_P9_EC(core_id, P9_CORE_WOF);
		break;
	case proc_gen_p10:
	case proc_gen_p11:
		addr = XSCOM_ADDR_P10_EC(core_id, P10_CORE_WOF);
		break;
	default:
		return OPAL_HARDWARE;
	}

	return xscom_read(chip_id, addr, core_wof);
}
|
||
|
||
/*
 * Read and decode a core's FIR after a malfunction alert.
 *
 * The core is forced awake (special wakeup) first, since FIR reads are
 * unreliable when stop-state 5 is enabled. Known checkstop bits are
 * accumulated into hmi_evt->u.xstop_error.xstop_reason.
 *
 * Returns true if a known core-checkstop reason was found, or if the FIR
 * could not be read at all (caller should escalate to checkstop); false
 * if the core is asleep or its FIR is clean.
 */
static bool decode_core_fir(struct cpu_thread *cpu,
				struct OpalHMIEvent *hmi_evt)
{
	uint64_t core_fir;
	uint32_t core_id;
	int i, swkup_rc;
	bool found = false;
	int64_t ret;
	const char *loc;

	/* Sanity check */
	if (!cpu || !hmi_evt)
		return false;

	core_id = pir_to_core_id(cpu->pir);

	/* Force the core to wakeup, otherwise reading core_fir is unreliable
	 * if stop-state 5 is enabled.
	 */
	swkup_rc = dctl_set_special_wakeup(cpu);

	/* Get CORE FIR register value. */
	ret = read_core_fir(cpu->chip_id, core_id, &core_fir);

	/* Only undo the wakeup we actually asserted. */
	if (!swkup_rc)
		dctl_clear_special_wakeup(cpu);


	if (ret == OPAL_WRONG_STATE) {
		/*
		 * CPU is asleep, so it probably didn't cause the checkstop.
		 * If no other HMI cause is found a "catchall" checkstop
		 * will be raised, so if this CPU should've been awake the
		 * error will be handled appropriately.
		 */
		prlog(PR_DEBUG,
		      "FIR read failed, chip %d core %d asleep\n",
		      cpu->chip_id, core_id);
		return false;
	} else if (ret != OPAL_SUCCESS) {
		prerror("XSCOM error reading CORE FIR\n");
		/* If the FIR can't be read, we should checkstop. */
		return true;
	}

	/* Clean FIR: this core did not cause the alert. */
	if (!core_fir)
		return false;

	loc = chip_loc_code(cpu->chip_id);
	prlog(PR_INFO, "[Loc: %s]: CHIP ID: %x, CORE ID: %x, FIR: %016llx\n",
	      loc ? loc : "Not Available",
	      cpu->chip_id, core_id, core_fir);

	/* On P10/P11, log a human-readable decode of each set FIR bit. */
	if (proc_gen == proc_gen_p10 || proc_gen == proc_gen_p11) {
		for (i = 0; i < ARRAY_SIZE(p10_core_fir_bits); i++) {
			if (core_fir & PPC_BIT(p10_core_fir_bits[i].bit))
				prlog(PR_INFO, " %s\n", p10_core_fir_bits[i].reason);
		}
	}

	/* Check CORE FIR bits and populate HMI event with error info. */
	for (i = 0; i < ARRAY_SIZE(xstop_bits); i++) {
		if (core_fir & PPC_BIT(xstop_bits[i].bit)) {
			found = true;
			hmi_evt->u.xstop_error.xstop_reason
				|= cpu_to_be32(xstop_bits[i].reason);
		}
	}
	return found;
}
|
||
|
||
/*
 * Scan every primary core's FIR looking for the cause of a core checkstop,
 * queuing one fatal HMI event per checkstopped core.
 */
static void find_core_checkstop_reason(struct OpalHMIEvent *hmi_evt,
				       uint64_t *out_flags)
{
	struct cpu_thread *cpu;

	/* Initialize HMI event */
	hmi_evt->severity = OpalHMI_SEV_FATAL;
	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
	hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_CORE;

	/*
	 * Check CORE FIRs and find the reason for core checkstop.
	 * Send a separate HMI event for each core that has checkstopped.
	 */
	for_each_cpu(cpu) {
		/* GARDed CPUs are marked unavailable. Skip them. */
		if (cpu->state == cpu_state_unavailable)
			continue;

		/* Only check on primaries (ie. core), not threads */
		if (cpu->is_secondary)
			continue;

		/* Initialize xstop_error fields. */
		hmi_evt->u.xstop_error.xstop_reason = 0;
		hmi_evt->u.xstop_error.u.pir = cpu_to_be32(cpu->pir);

		if (decode_core_fir(cpu, hmi_evt))
			queue_hmi_event(hmi_evt, 0, out_flags);
	}
}
|
||
|
||
static void find_capp_checkstop_reason(int flat_chip_id,
|
||
struct OpalHMIEvent *hmi_evt,
|
||
uint64_t *out_flags)
|
||
{
|
||
struct capp_info info;
|
||
struct phb *phb;
|
||
uint64_t capp_fir;
|
||
uint64_t capp_fir_mask;
|
||
uint64_t capp_fir_action0;
|
||
uint64_t capp_fir_action1;
|
||
uint64_t reg;
|
||
int64_t rc;
|
||
|
||
/* CAPP exists on P8 and P9 only */
|
||
if (proc_gen != proc_gen_p8 && proc_gen != proc_gen_p9)
|
||
return;
|
||
|
||
/* Find the CAPP on the chip associated with the HMI. */
|
||
for_each_phb(phb) {
|
||
/* get the CAPP info */
|
||
rc = capp_get_info(flat_chip_id, phb, &info);
|
||
if (rc == OPAL_PARAMETER)
|
||
continue;
|
||
|
||
if (xscom_read(flat_chip_id, info.capp_fir_reg, &capp_fir) ||
|
||
xscom_read(flat_chip_id, info.capp_fir_mask_reg,
|
||
&capp_fir_mask) ||
|
||
xscom_read(flat_chip_id, info.capp_fir_action0_reg,
|
||
&capp_fir_action0) ||
|
||
xscom_read(flat_chip_id, info.capp_fir_action1_reg,
|
||
&capp_fir_action1)) {
|
||
prerror("CAPP: Couldn't read CAPP#%d (PHB:#%x) FIR registers by XSCOM!\n",
|
||
info.capp_index, info.phb_index);
|
||
continue;
|
||
}
|
||
|
||
if (!(capp_fir & ~capp_fir_mask))
|
||
continue;
|
||
|
||
prlog(PR_DEBUG, "CAPP#%d (PHB:#%x): FIR 0x%016llx mask 0x%016llx\n",
|
||
info.capp_index, info.phb_index, capp_fir,
|
||
capp_fir_mask);
|
||
prlog(PR_DEBUG, "CAPP#%d (PHB:#%x): ACTION0 0x%016llx, ACTION1 0x%016llx\n",
|
||
info.capp_index, info.phb_index, capp_fir_action0,
|
||
capp_fir_action1);
|
||
|
||
/*
|
||
* If this bit is set (=1) a Recoverable Error has been
|
||
* detected
|
||
*/
|
||
xscom_read(flat_chip_id, info.capp_err_status_ctrl_reg, ®);
|
||
if ((reg & PPC_BIT(0)) != 0) {
|
||
phb_lock(phb);
|
||
phb->ops->set_capp_recovery(phb);
|
||
phb_unlock(phb);
|
||
|
||
hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
|
||
hmi_evt->type = OpalHMI_ERROR_CAPP_RECOVERY;
|
||
queue_hmi_event(hmi_evt, 1, out_flags);
|
||
|
||
return;
|
||
}
|
||
}
|
||
}
|
||
|
||
/*
 * Determine why the NX unit on @flat_chip_id checkstopped and queue a
 * fatal HMI event describing the reason(s).
 *
 * Bails out silently unless NX_HMI_ACTIVE is set in the NX status
 * register. Decodes both the DMA/engine FIR and the PowerBus-interface
 * FIR into xstop_reason bits, then pokes NXDMAENGFIR[38] so PRD performs
 * its service action.
 */
static void find_nx_checkstop_reason(int flat_chip_id,
				     struct OpalHMIEvent *hmi_evt,
				     uint64_t *out_flags)
{
	uint64_t nx_status;
	uint64_t nx_dma_fir;
	uint64_t nx_pbi_fir_val;
	int i;

	/* Get NX status register value. */
	if (xscom_read(flat_chip_id, nx_status_reg, &nx_status) != 0) {
		prerror("XSCOM error reading NX_STATUS_REG\n");
		return;
	}

	/* Check if NX has driven an HMI interrupt. */
	if (!(nx_status & NX_HMI_ACTIVE))
		return;

	/* Initialize HMI event */
	hmi_evt->severity = OpalHMI_SEV_FATAL;
	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
	hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NX;
	hmi_evt->u.xstop_error.u.chip_id = cpu_to_be32(flat_chip_id);

	/* Get DMA & Engine FIR data register value. */
	if (xscom_read(flat_chip_id, nx_dma_engine_fir, &nx_dma_fir) != 0) {
		prerror("XSCOM error reading NX_DMA_ENGINE_FIR\n");
		return;
	}

	/* Get PowerBus Interface FIR data register value. */
	if (xscom_read(flat_chip_id, nx_pbi_fir, &nx_pbi_fir_val) != 0) {
		prerror("XSCOM error reading NX_PBI_FIR\n");
		return;
	}

	/* Find NX checkstop reason and populate HMI event with error info. */
	for (i = 0; i < ARRAY_SIZE(nx_dma_xstop_bits); i++)
		if (nx_dma_fir & PPC_BIT(nx_dma_xstop_bits[i].bit))
			hmi_evt->u.xstop_error.xstop_reason
				|= cpu_to_be32(nx_dma_xstop_bits[i].reason);

	for (i = 0; i < ARRAY_SIZE(nx_pbi_xstop_bits); i++)
		if (nx_pbi_fir_val & PPC_BIT(nx_pbi_xstop_bits[i].bit))
			hmi_evt->u.xstop_error.xstop_reason
				|= cpu_to_be32(nx_pbi_xstop_bits[i].reason);

	/*
	 * Set NXDMAENGFIR[38] to signal PRD that service action is required.
	 * Without this inject, PRD will not be able to do NX unit checkstop
	 * error analysis. NXDMAENGFIR[38] is a spare bit and used to report
	 * a software initiated attention.
	 *
	 * The behavior of this bit and all FIR bits are documented in
	 * RAS spreadsheet.
	 */
	xscom_write(flat_chip_id, nx_dma_engine_fir, PPC_BIT(38));

	/* Send an HMI event. */
	queue_hmi_event(hmi_evt, 0, out_flags);
}
|
||
|
||
/*
 * Store an encoded NPU checkstop reason in the first free (zero) byte of
 * the 32-bit reason word. If all four byte slots are already used, the
 * reason is silently dropped.
 */
static void add_npu_xstop_reason(uint32_t *xstop_reason, uint8_t reason)
{
	uint8_t *slot = (uint8_t *) xstop_reason;
	const uint8_t *end = slot + sizeof(*xstop_reason);

	while (slot < end && *slot != 0)
		slot++;

	if (slot < end)
		*slot = reason;
}
|
||
|
||
/*
 * Encode every set bit of an NPU FIR into the 32-bit xstop reason word.
 *
 * There are three 64-bit FIRs but the xstop reason field of the hmi
 * event is only 32-bit. Encode which FIR bit is set as:
 * - 2 bits for the FIR number
 * - 6 bits for the bit number (0 -> 63)
 *
 * So we could even encode up to 4 reasons for the HMI, if that can
 * ever happen. Bits are scanned from the most significant down, matching
 * the original ilog2()-based loop.
 */
static void encode_npu_xstop_reason(uint32_t *xstop_reason,
				uint64_t fir, int fir_number)
{
	int bit;
	uint8_t reason;

	for (bit = 63; bit >= 0; bit--) {
		if (!(fir & (1ULL << bit)))
			continue;
		reason = fir_number << 6;
		reason |= (63 - bit); // IBM numbering
		add_npu_xstop_reason(xstop_reason, reason);
	}
}
|
||
|
||
/*
 * Check the FIR register(s) of an NPU/PAU for fatal errors.
 *
 * P8 NVLink NPUs expose a single FIR; NPU2 and PAU units have three.
 * For each FIR, a bit is considered fatal when it is set, unmasked and
 * both ACTION0 and ACTION1 are set for it. Fatal bits are logged and
 * (except on P8) encoded into @xstop_reason. When anything fatal was
 * found, unit-specific debug registers are dumped.
 *
 * Returns true if at least one fatal FIR bit was found.
 */
static bool npu_fir_errors(struct phb *phb, int flat_chip_id,
			   uint32_t *xstop_reason)
{
	uint64_t fir, fir_mask;
	uint64_t fir_action0, fir_action1;
	uint64_t fir_reg, fir_mask_reg;
	uint64_t fir_action0_reg, fir_action1_reg;
	uint64_t fatal_errors;
	uint64_t xscom_base;
	bool fir_errors = false;
	int fir_regs;
	const char *loc;
	struct npu *npu;
	struct npu2 *npu2 = NULL;
	struct npu2_dev *dev;
	struct pau *pau = NULL;

	/* One FIR on P8 NVLink (pcie_v3), three on NPU2/PAU. */
	fir_regs = (phb->phb_type == phb_type_pcie_v3) ? 1 : 3;

	for (uint32_t i = 0; i < fir_regs; i++) {
		/* Pick the register offsets and SCOM base for this unit type. */
		switch (phb->phb_type) {
		case phb_type_pcie_v3:
			fir_reg = NX_FIR;
			fir_mask_reg = NX_FIR_MASK;
			fir_action0_reg = NX_FIR_ACTION0;
			fir_action1_reg = NX_FIR_ACTION1;

			npu = phb_to_npu(phb);
			if (npu != NULL)
				xscom_base = npu->at_xscom;
			else
				continue;
			break;
		case phb_type_npu_v2:
			fir_reg = NPU2_FIR(i);
			fir_mask_reg = NPU2_FIR_MASK(i);
			fir_action0_reg = NPU2_FIR_ACTION0(i);
			fir_action1_reg = NPU2_FIR_ACTION1(i);
			npu2 = phb_to_npu2_nvlink(phb);
			xscom_base = npu2->xscom_base;
			break;
		case phb_type_npu_v2_opencapi:
			fir_reg = NPU2_FIR(i);
			fir_mask_reg = NPU2_FIR_MASK(i);
			fir_action0_reg = NPU2_FIR_ACTION0(i);
			fir_action1_reg = NPU2_FIR_ACTION1(i);
			dev = phb_to_npu2_dev_ocapi(phb);
			npu2 = dev->npu;
			xscom_base = npu2->xscom_base;
			break;
		case phb_type_pau_opencapi:
			fir_reg = PAU_FIR(i);
			fir_mask_reg = PAU_FIR_MASK(i);
			fir_action0_reg = PAU_FIR_ACTION0(i);
			fir_action1_reg = PAU_FIR_ACTION1(i);
			pau = ((struct pau_dev *)(pau_phb_to_opencapi_dev(phb)))->pau;
			xscom_base = pau->xscom_base;
			break;
		default:
			continue;
		}

		if (xscom_read(flat_chip_id, xscom_base + fir_reg, &fir) ||
		    xscom_read(flat_chip_id, xscom_base + fir_mask_reg, &fir_mask) ||
		    xscom_read(flat_chip_id, xscom_base + fir_action0_reg, &fir_action0) ||
		    xscom_read(flat_chip_id, xscom_base + fir_action1_reg, &fir_action1)) {
			prerror("HMI: Couldn't read NPU/PAU FIR register%d with XSCOM\n", i);
			continue;
		}

		/* Fatal = set, unmasked, and both action bits set. */
		fatal_errors = fir & ~fir_mask & fir_action0 & fir_action1;

		if (fatal_errors) {
			loc = chip_loc_code(flat_chip_id);
			if (!loc)
				loc = "Not Available";
			prlog(PR_ERR, "NPU/PAU: [Loc: %s] P:%d FIR#%d "
			      "FIR 0x%016llx mask 0x%016llx\n",
			      loc, flat_chip_id, i, fir, fir_mask);
			prlog(PR_ERR, "NPU/PAU: [Loc: %s] P:%d ACTION0 "
			      "0x%016llx, ACTION1 0x%016llx\n",
			      loc, flat_chip_id, fir_action0, fir_action1);
			/* P8 NPU reasons aren't encoded into xstop_reason. */
			if (phb->phb_type != phb_type_pcie_v3)
				encode_npu_xstop_reason(xstop_reason,
							fatal_errors,
							i);
			fir_errors = true;
		}
	}

	/* dump registers */
	if (fir_errors) {
		switch (phb->phb_type) {
		case phb_type_npu_v2:
		case phb_type_npu_v2_opencapi:
			npu2_dump_scoms(npu2, flat_chip_id);
			break;
		case phb_type_pau_opencapi:
			pau_opencapi_dump_scoms(pau);
			break;
		default:
			break;
		}

		prlog(PR_ERR, " _________________________ \n");
		prlog(PR_ERR, "< It's Debug time! >\n");
		prlog(PR_ERR, " ------------------------- \n");
		prlog(PR_ERR, " \\ ,__, \n");
		prlog(PR_ERR, " \\ (oo)____ \n");
		prlog(PR_ERR, " (__) )\\ \n");
		prlog(PR_ERR, " ||--|| * \n");
	}

	return fir_errors;
}
|
||
|
||
/*
 * Check every NPU/PAU on the chip that raised the HMI and queue a
 * recoverable warning event for any whose FIR shows fatal errors.
 *
 * No-op on CPU types without an NPU/PAU. On P8 NVLink NPUs the unit is
 * additionally fenced, since it cannot recover.
 */
static void find_npu_checkstop_reason(int flat_chip_id,
				      struct OpalHMIEvent *hmi_evt,
				      uint64_t *out_flags)
{
	struct phb *phb;
	struct dt_node *dn;
	uint32_t xstop_reason = 0;

	/* Only check for NPU errors if the chip has a NPU/PAU */
	if ((PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P8NVL) &&
	    (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P9) &&
	    (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P10))
		return;

	/* Find the NPU/PAU on the chip associated with the HMI. */
	for_each_phb(phb) {
		dn = phb->dt_node;

		if (!(dt_node_is_compatible(dn, "ibm,power8-npu-pciex") ||
		      dt_node_is_compatible(dn, "ibm,power9-npu-pciex") ||
		      dt_node_is_compatible(dn, "ibm,power9-npu-opencapi-pciex") ||
		      dt_node_is_compatible(dn, "ibm,power10-pau-opencapi-pciex")))
			continue;

		if (dt_get_chip_id(dn) != flat_chip_id)
			continue;

		/* Read all the registers necessary to find a checkstop condition. */
		if (!npu_fir_errors(phb, flat_chip_id, &xstop_reason))
			continue;

		if (phb->phb_type == phb_type_pcie_v3) {
			/* Set the NPU to fenced since it can't recover. */
			npu_set_fence_state(phb_to_npu(phb), true);
		}

		/* Set up the HMI event */
		hmi_evt->severity = OpalHMI_SEV_WARNING;
		hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
		hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
		hmi_evt->u.xstop_error.xstop_reason = cpu_to_be32(xstop_reason);
		hmi_evt->u.xstop_error.u.chip_id = cpu_to_be32(flat_chip_id);

		/* Marking the event as recoverable so that we don't crash */
		queue_hmi_event(hmi_evt, 1, out_flags);
	}
}
|
||
|
||
/*
 * Decode a malfunction alert: read the malf_alert SCOM, and for every chip
 * bit set, clear that bit and probe the CAPP/NX/NPU units on that chip for
 * the checkstop cause. Also scans the cores, and falls back to an UNKNOWN
 * checkstop event if no specific reason was identified.
 *
 * @hmi_evt:   event template used for every queued event
 * @out_flags: OR-accumulated OPAL_HMI_FLAGS_* returned to the caller
 */
static void decode_malfunction(struct OpalHMIEvent *hmi_evt, uint64_t *out_flags)
{
	int i;
	uint64_t malf_alert, flags;

	flags = 0;

	if (!setup_scom_addresses()) {
		prerror("Failed to setup scom addresses\n");
		/* Send an unknown HMI event. */
		hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_UNKNOWN;
		hmi_evt->u.xstop_error.xstop_reason = 0;
		queue_hmi_event(hmi_evt, false, out_flags);
		return;
	}

	xscom_read(this_cpu()->chip_id, malf_alert_scom, &malf_alert);

	if (!malf_alert)
		return;

	/* One malf_alert bit per chip; clear each bit before decoding it
	 * so we don't re-handle it on the next HMI. The write acts as an
	 * AND-mask, hence ~PPC_BIT(i). */
	for (i = 0; i < 64; i++) {
		if (malf_alert & PPC_BIT(i)) {
			xscom_write(this_cpu()->chip_id, malf_alert_scom,
								~PPC_BIT(i));
			find_capp_checkstop_reason(i, hmi_evt, &flags);
			/* NX checkstop decoding does not apply on P10/P11 */
			if (proc_gen != proc_gen_p10 && proc_gen != proc_gen_p11)
				find_nx_checkstop_reason(i, hmi_evt, &flags);
			find_npu_checkstop_reason(i, hmi_evt, &flags);
		}
	}

	find_core_checkstop_reason(hmi_evt, &flags);

	/*
	 * If we fail to find checkstop reason, send an unknown HMI event.
	 */
	if (!(flags & OPAL_HMI_FLAGS_NEW_EVENT)) {
		hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_UNKNOWN;
		hmi_evt->u.xstop_error.xstop_reason = 0;
		queue_hmi_event(hmi_evt, false, &flags);
	}
	*out_flags |= flags;
}
|
||
|
||
/*
|
||
* This will "rendez-vous" all threads on the core to the rendez-vous
|
||
* id "sig". You need to make sure that "sig" is different from the
|
||
* previous rendez vous. The sig value must be between 0 and 7 with
|
||
* boot time being set to 0.
|
||
*
|
||
* Note: in theory, we could just use a flip flop "sig" in the thread
|
||
* structure (binary rendez-vous with no argument). This is a bit more
|
||
* debuggable and better at handling timeouts (arguably).
|
||
*
|
||
 * This should be called with no lock held
|
||
*/
|
||
static void hmi_rendez_vous(uint32_t sig)
{
	struct cpu_thread *t = this_cpu();
	uint32_t my_id = cpu_get_thread_index(t);
	/* Each thread owns a 4-bit field in *sptr: 3 bits of "sig" plus
	 * one exit bit (0x8). */
	uint32_t my_shift = my_id << 2;
	uint32_t *sptr = t->core_hmi_state_ptr;
	uint32_t val, prev, shift, i;
	uint64_t timeout;

	assert(sig <= 0x7);

	/*
	 * Mark ourselves as having reached the rendez vous point with
	 * the exit bit cleared
	 */
	do {
		val = prev = *sptr;
		val &= ~(0xfu << my_shift);
		val |= sig << my_shift;
	} while (cmpxchg32(sptr, prev, val) != prev);

	/*
	 * Wait for everybody else to reach that point, ignore the
	 * exit bit as another thread could have already set it.
	 */
	for (i = 0; i < cpu_thread_count; i++) {
		shift = i << 2;

		timeout = TIMEOUT_LOOPS;
		/* Compare only the low 3 bits here (mask 0x7): the peer may
		 * already have set its exit bit. */
		while (((*sptr >> shift) & 0x7) != sig && --timeout)
			cpu_relax();
		if (!timeout)
			prlog(PR_ERR, "Rendez-vous stage 1 timeout, CPU 0x%x"
			      " waiting for thread %d (sptr=%08x)\n",
			      t->pir, i, *sptr);
	}

	/* Set the exit bit */
	do {
		val = prev = *sptr;
		val &= ~(0xfu << my_shift);
		val |= (sig | 8) << my_shift;
	} while (cmpxchg32(sptr, prev, val) != prev);

	/* At this point, we need to wait for everybody else to have a value
	 * that is *not* sig. IE. they either have set the exit bit *or* they
	 * have changed the rendez-vous (meaning they have moved on to another
	 * rendez vous point).
	 */
	for (i = 0; i < cpu_thread_count; i++) {
		shift = i << 2;

		timeout = TIMEOUT_LOOPS;
		/* Full 4-bit compare (mask 0xf) so a set exit bit counts
		 * as "moved past". */
		while (((*sptr >> shift) & 0xf) == sig && --timeout)
			cpu_relax();
		if (!timeout)
			prlog(PR_ERR, "Rendez-vous stage 2 timeout, CPU 0x%x"
			      " waiting for thread %d (sptr=%08x)\n",
			      t->pir, i, *sptr);
	}
}
|
||
|
||
/*
 * Emit a PR_DEBUG line identifying the HMI source by location code, chip,
 * core and thread. When the HMER indicates a timer facility error, the
 * current TFMR value is included in the message.
 */
static void hmi_print_debug(const uint8_t *msg, uint64_t hmer)
{
	struct cpu_thread *me = this_cpu();
	uint32_t core = pir_to_core_id(me->pir);
	uint32_t thread = cpu_get_thread_index(me);
	const char *where = chip_loc_code(me->chip_id);

	if (!where)
		where = "Not Available";

	/* Also covers P10 SPR_HMER_TFAC_SHADOW_XFER_ERROR */
	if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) {
		prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: TFMR(%016lx) %s\n",
		      where, me->chip_id, core, thread,
		      mfspr(SPR_TFMR), msg);
	} else {
		prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: %s\n",
		      where, me->chip_id, core, thread, msg);
	}
}
|
||
|
||
/*
 * Handle thread-local timer facility errors (PURR/SPURR/DEC parity).
 *
 * Returns 1 if the local errors were recovered, 0 otherwise. If the DEC
 * was lost, OPAL_HMI_FLAGS_DEC_LOST is set in *out_flags so the OS knows
 * to reprogram its decrementer.
 */
static int handle_thread_tfac_error(uint64_t tfmr, uint64_t *out_flags)
{
	int recovered;

	if (tfmr & SPR_TFMR_DEC_PARITY_ERR)
		*out_flags |= OPAL_HMI_FLAGS_DEC_LOST;

	recovered = tfmr_recover_local_errors(tfmr) ? 1 : 0;

	/* Strip the thread-local error bits from our (by-value) copy;
	 * the caller's tfmr is not affected. */
	tfmr &= ~(SPR_TFMR_PURR_PARITY_ERR |
		  SPR_TFMR_SPURR_PARITY_ERR |
		  SPR_TFMR_DEC_PARITY_ERR);

	return recovered;
}
|
||
|
||
static int64_t opal_handle_hmi(void);
|
||
|
||
/* Job body queued on secondary threads: run the full HMI handler there. */
static void opal_handle_hmi_job(void *data __unused)
{
	opal_handle_hmi();
}
|
||
|
||
/*
|
||
* Queue hmi handling job If secondaries are still in OPAL
|
||
* This function is called by thread 0.
|
||
*/
|
||
static struct cpu_job **hmi_kick_secondaries(void)
|
||
{
|
||
struct cpu_thread *ts = this_cpu();
|
||
struct cpu_job **hmi_jobs = NULL;
|
||
int job_sz = sizeof(struct cpu_job *) * cpu_thread_count;
|
||
int i;
|
||
|
||
for (i = 1; i < cpu_thread_count; i++) {
|
||
ts = next_cpu(ts);
|
||
|
||
/* Is this thread still in OPAL ? */
|
||
if (ts->state == cpu_state_active) {
|
||
if (!hmi_jobs) {
|
||
hmi_jobs = zalloc(job_sz);
|
||
assert(hmi_jobs);
|
||
}
|
||
|
||
prlog(PR_DEBUG, "Sending hmi job to thread %d\n", i);
|
||
hmi_jobs[i] = cpu_queue_job(ts, "handle_hmi_job",
|
||
opal_handle_hmi_job, NULL);
|
||
}
|
||
}
|
||
return hmi_jobs;
|
||
}
|
||
|
||
/*
 * Handle core-level (shared) timer facility errors. All threads of the
 * core run this concurrently and synchronize through four rendez-vous
 * stages: (1) gather + corrupt-TFMR recovery, (2) core error cleanup,
 * (3) clear error conditions, then thread 0 alone resyncs the timebase,
 * (4) final join.
 *
 * Returns 1 on recovery, 0 on unrecoverable error, -1 if nothing was
 * attempted. Sets OPAL_HMI_FLAGS_HDEC_LOST / _TB_RESYNC in *out_flags
 * as appropriate.
 */
static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
{
	struct cpu_thread *t, *t0;
	int recover = -1;
	struct cpu_job **hmi_jobs = NULL;
	bool hmi_with_no_error = false;

	t = this_cpu();
	t0 = find_cpu_by_pir(cpu_get_thread0(t));

	/* If the siblings are in the OS, pull them into OPAL so they can
	 * take part in the rendez-vous. Only thread 0 does this. */
	if (t == t0 && t0->state == cpu_state_os)
		hmi_jobs = hmi_kick_secondaries();

	/*
	 * Handle special case: If TB is in invalid state and no TB error
	 * reported in TFMR for this HMI, then treat this as TFMR corrupt error
	 * to force the recovery procedure recover_corrupt_tfmr(). This will
	 * also reset the core level TB errors including Missing step. Do this
	 * only on thread 0, otherwise every thread will repeat the same
	 * procedure unnecessarily.
	 */
	if (t == t0 && !(tfmr & SPR_TFMR_CORE_ERRORS) && this_cpu()->tb_invalid) {
		tfmr |= SPR_TFMR_TFMR_CORRUPT;
		hmi_with_no_error = true;
	}

	/* Rendez vous all threads */
	hmi_rendez_vous(1);

	/* We use a lock here as some of the TFMR bits are shared and I
	 * prefer avoiding doing the cleanup simultaneously.
	 */
	lock(&hmi_lock);

	/* First handle corrupt TFMR otherwise we can't trust anything.
	 * We'll use a lock here so that the threads don't try to do it at
	 * the same time
	 */
	if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
		/* Check if it's still in error state */
		if (hmi_with_no_error || mfspr(SPR_TFMR) & SPR_TFMR_TFMR_CORRUPT)
			if (!recover_corrupt_tfmr()) {
				unlock(&hmi_lock);
				recover = 0;
				goto error_out;
			}

		/* Re-read: the recovery rewrote the register */
		tfmr = mfspr(SPR_TFMR);

		/* We could have got new thread errors in the meantime */
		if (tfmr & SPR_TFMR_THREAD_ERRORS) {
			recover = handle_thread_tfac_error(tfmr, out_flags);
			tfmr &= ~SPR_TFMR_THREAD_ERRORS;
		}
		if (!recover) {
			unlock(&hmi_lock);
			goto error_out;
		}
	}

	/* Tell the OS ... */
	if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
		*out_flags |= OPAL_HMI_FLAGS_HDEC_LOST;

	/* Cleanup bad HDEC or TB on all threads or subcores before we clear
	 * the error conditions
	 */
	tfmr_cleanup_core_errors(tfmr);

	/* Unlock before next rendez-vous */
	unlock(&hmi_lock);

	/* Second rendez vous, ensure the above cleanups are all done before
	 * we proceed further
	 */
	hmi_rendez_vous(2);

	/* We can now clear the error conditions in the core. */
	recover = tfmr_clear_core_errors(tfmr);
	if (recover == 0)
		goto error_out;

	/* Third rendez-vous. We could in theory do the timebase resync as
	 * part of the previous one, but I prefer having all the error
	 * conditions cleared before we start trying.
	 */
	hmi_rendez_vous(3);

	/* Now perform the actual TB recovery on thread 0 */
	if (t == t0)
		recover = chiptod_recover_tb_errors(&this_cpu()->tb_resynced);

error_out:
	/* Last rendez-vous */
	hmi_rendez_vous(4);

	/* Now all threads have gone past rendez-vous 3 and not yet past another
	 * rendez-vous 1, so the value of tb_resynced of thread 0 of the core
	 * contains an accurate indication as to whether the timebase was lost.
	 */
	if (t0->tb_resynced)
		*out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;

	/* Thread 0 waits for the secondaries' jobs to finish and frees
	 * the job array */
	if (t == t0 && hmi_jobs) {
		int i;
		for (i = 1; i < cpu_thread_count; i++)
			if (hmi_jobs[i])
				cpu_wait_job(hmi_jobs[i], true);
		free(hmi_jobs);
	}

	return recover;
}
|
||
|
||
/*
 * Read thread 0's TFMR from a secondary thread via the indirect SPR
 * access SCOMs (SPRC selects the register, SPRD holds the value).
 * hmi_lock serializes the SPRC/SPRD pair against other threads.
 */
static uint64_t read_tfmr_t0(void)
{
	uint64_t tfmr_t0;
	uint32_t chip_id = this_cpu()->chip_id;
	uint32_t core_id = pir_to_core_id(this_cpu()->pir);

	lock(&hmi_lock);

	xscom_write(chip_id, XSCOM_ADDR_P9_EC(core_id, P9_SCOM_SPRC),
		    SETFIELD(P9_SCOMC_SPR_SELECT, 0, P9_SCOMC_TFMR_T0));
	xscom_read(chip_id, XSCOM_ADDR_P9_EC(core_id, P9_SCOM_SPRD),
		   &tfmr_t0);
	unlock(&hmi_lock);
	return tfmr_t0;
}
|
||
|
||
/* P9 errata: In theory, an HDEC error is sent to all threads. However,
|
||
* due to an errata on P9 where TFMR bit 26 (HDEC parity) cannot be
|
||
* cleared on thread 1..3, I am not confident we can do a rendez-vous
|
||
* in all cases.
|
||
*
|
||
* Our current approach is to ignore that error unless it is present
|
||
* on thread 0 TFMR. Also, ignore TB residue error due to a similar
|
||
* errata as above.
|
||
*/
|
||
/*
 * P9 errata workaround: on secondary threads, drop HDEC parity and TB
 * residue errors from *tfmr unless thread 0's TFMR also reports them
 * (see the comment above for the rationale).
 */
static void validate_latched_errors(uint64_t *tfmr)
{
	const uint64_t latched = SPR_TFMR_HDEC_PARITY_ERROR |
		SPR_TFMR_TB_RESIDUE_ERR;
	uint64_t t0_tfmr;

	/* Thread 0's view is authoritative; nothing to filter on the
	 * primary thread or when neither bit is set. */
	if (!this_cpu()->is_secondary || !(*tfmr & latched))
		return;

	t0_tfmr = read_tfmr_t0();

	if (!(t0_tfmr & SPR_TFMR_HDEC_PARITY_ERROR))
		*tfmr &= ~SPR_TFMR_HDEC_PARITY_ERROR;

	if (!(t0_tfmr & SPR_TFMR_TB_RESIDUE_ERR))
		*tfmr &= ~SPR_TFMR_TB_RESIDUE_ERR;
}
|
||
|
||
/*
 * Top-level timer facility (TFAC) error handler: dispatch thread-local,
 * core-level and TOD errors based on the TFMR contents, queue an
 * OpalHMI_ERROR_TFAC event if anything was attempted, and record the
 * final TB validity in this_cpu()->tb_invalid and *out_flags.
 *
 * Returns 1 on recovery, 0 on unrecoverable error, -1 if no known error
 * was found.
 */
static int handle_tfac_errors(struct OpalHMIEvent *hmi_evt, uint64_t *out_flags)
{
	int recover = -1;
	uint64_t tfmr = mfspr(SPR_TFMR);

	/* Initialize the hmi event with old value of TFMR */
	hmi_evt->tfmr = cpu_to_be64(tfmr);

	/* A TFMR parity/corrupt error makes us ignore all the local stuff.*/
	if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
		/* Mark TB as invalid for now as we don't trust TFMR, we'll fix
		 * it up later
		 */
		this_cpu()->tb_invalid = true;
		goto bad_tfmr;
	}

	this_cpu()->tb_invalid = !(tfmr & SPR_TFMR_TB_VALID);

	/* P9-only errata filtering, see validate_latched_errors() */
	if (proc_gen == proc_gen_p9)
		validate_latched_errors(&tfmr);

	/* First, handle thread local errors */
	if (tfmr & SPR_TFMR_THREAD_ERRORS) {
		recover = handle_thread_tfac_error(tfmr, out_flags);
		tfmr &= ~SPR_TFMR_THREAD_ERRORS;
	}

 bad_tfmr:

	/* Let's see if we still have a all-core error to deal with, if
	 * not, we just bail out
	 */
	if (tfmr & SPR_TFMR_CORE_ERRORS) {
		int recover2;

		/* Only update "recover" if it's not already 0 (non-recovered)
		 */
		recover2 = handle_all_core_tfac_error(tfmr, out_flags);
		if (recover != 0)
			recover = recover2;
	} else if (tfmr & SPR_TFMR_CHIP_TOD_INTERRUPT) {
		int recover2;

		/*
		 * There are some TOD errors which do not affect working of
		 * TOD and TB. They stay in valid state. Hence we don't need
		 * rendez vous.
		 *
		 * TOD errors that affect TOD/TB will report a global error
		 * on TFMR along with bit 51, and they will go in rendez vous.
		 */
		recover2 = chiptod_recover_tod_errors();
		if (recover != 0)
			recover = recover2;
	} else if (this_cpu()->tb_invalid) {
		int recover2;

		/*
		 * This shouldn't happen, TB is invalid and no global error was
		 * reported. However, On p10, in a very rare situation when
		 * core is waking up from stop2 or higher stop state, timer
		 * facility goes into error state due to Missing step, causing
		 * an HMI with no error reason set in TFMR register other than
		 * TFMR[41]=0 (tb_valid) and TFMR[28:31]=9 (tbst_encoded).
		 * Ideally, "Missing step" error should be reported in
		 * TFMR[44]=1. It looks like in this rare case, while
		 * generating HMI, HW fails to sync up the TFMR register with
		 * the core which is waking up from stop2.
		 *
		 * To be able to recover, follow down to recovery method as if
		 * we got core level TB error and treat this as TFMR corrupt
		 * error and reset all core errors including Missing step.
		 */

		prlog(PR_ERR, "HMI: TB invalid without core error reported ! "
			"CPU=%x, TFMR=0x%016lx\n", this_cpu()->pir,
			mfspr(SPR_TFMR));
		recover2 = handle_all_core_tfac_error(tfmr, out_flags);
		if (recover != 0)
			recover = recover2;
	}

	/* Queue a TFAC event only if some recovery was actually attempted */
	if (recover != -1 && hmi_evt) {
		hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
		hmi_evt->type = OpalHMI_ERROR_TFAC;
		queue_hmi_event(hmi_evt, recover, out_flags);
	}

	/* Set the TB state looking at TFMR register before we head out. */
	this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);

	if (this_cpu()->tb_invalid) {
		*out_flags |= OPAL_HMI_FLAGS_TOD_TB_FAIL;
		prlog(PR_WARNING, "Failed to get TB in running state! "
			"CPU=%x, TFMR=%016lx\n", this_cpu()->pir,
			mfspr(SPR_TFMR));
	}

	return recover;
}
|
||
|
||
/*
 * Decode and handle every error class reported in @hmer, queuing HMI
 * events for the OS and clearing the handled (sticky) HMER bits.
 *
 * @hmer:      HMER value sampled by the caller
 * @hmi_evt:   event template, may be NULL (no events are queued then)
 * @out_flags: OR-accumulated OPAL_HMI_FLAGS_* returned to the OS
 *
 * Returns 1 if everything was recovered, 0 on an unrecoverable error
 * (malfunction alert / hypervisor resource error).
 *
 * Fixes vs. previous revision: the PROC_RECV_AGAIN debug message was
 * built from two string literals missing the separating space (it read
 * "...again beforebit2 was cleared") and carried a stray '\n' that
 * hmi_print_debug() already appends; also three "hmi_evt->type = X,"
 * comma-operator statements are now plain semicolon statements.
 */
static int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
				uint64_t *out_flags)
{
	struct cpu_thread *cpu = this_cpu();
	int recover = 1;
	uint64_t handled = 0;

	prlog(PR_DEBUG, "Received HMI interrupt: HMER = 0x%016llx\n", hmer);
	/* Initialize the hmi event with old value of HMER */
	if (hmi_evt)
		hmi_evt->hmer = cpu_to_be64(hmer);

	/* Handle Timer/TOD errors separately. These are cleared from HMER
	 * up front (the bits are sticky) and not added to "handled". */
	if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) {
		hmi_print_debug("Timer Facility Error", hmer);
		handled = hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR);
		mtspr(SPR_HMER, ~handled);
		recover = handle_tfac_errors(hmi_evt, out_flags);
		handled = 0;
	}

	lock(&hmi_lock);
	/*
	 * Not all HMIs would move TB into invalid state. Set the TB state
	 * looking at TFMR register. TFMR will tell us correct state of
	 * TB register.
	 */
	if (hmer & SPR_HMER_PROC_RECV_DONE) {
		uint32_t chip_id = pir_to_chip_id(cpu->pir);
		uint32_t core_id = pir_to_core_id(cpu->pir);
		uint64_t core_wof;

		hmi_print_debug("Processor recovery occurred.", hmer);
		if (!read_core_wof(chip_id, core_id, &core_wof)) {
			int i;

			prlog(PR_DEBUG, "Core WOF = 0x%016llx recovered error:\n", core_wof);
			if (proc_gen <= proc_gen_p9) {
				for (i = 0; i < ARRAY_SIZE(p9_recoverable_bits); i++) {
					if (core_wof & PPC_BIT(p9_recoverable_bits[i].bit))
						prlog(PR_DEBUG, " %s\n", p9_recoverable_bits[i].reason);
				}
			} else if (proc_gen == proc_gen_p10 || proc_gen == proc_gen_p11) {
				for (i = 0; i < ARRAY_SIZE(p10_core_fir_bits); i++) {
					if (core_wof & PPC_BIT(p10_core_fir_bits[i].bit))
						prlog(PR_DEBUG, " %s\n", p10_core_fir_bits[i].reason);
				}
			}
		}

		handled |= SPR_HMER_PROC_RECV_DONE;
		/* Only one event per core: thread 0 reports it */
		if (cpu_is_thread0(cpu) && hmi_evt) {
			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
			queue_hmi_event(hmi_evt, recover, out_flags);
		}
	}

	if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_PROC_RECV_ERROR_MASKED)) {
		handled |= SPR_HMER_PROC_RECV_ERROR_MASKED;
		if (cpu_is_thread0(cpu) && hmi_evt) {
			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_MASKED;
			queue_hmi_event(hmi_evt, recover, out_flags);
		}
		hmi_print_debug("Processor recovery Done (masked).", hmer);
	}

	if (hmer & SPR_HMER_PROC_RECV_AGAIN) {
		handled |= SPR_HMER_PROC_RECV_AGAIN;
		if (cpu_is_thread0(cpu) && hmi_evt) {
			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE_AGAIN;
			queue_hmi_event(hmi_evt, recover, out_flags);
		}
		hmi_print_debug("Processor recovery occurred again before "
				"bit2 was cleared", hmer);
	}

	/* XXX: what to do with this? */
	if (hmer & SPR_HMER_SPURR_SCALE_LIMIT) {
		handled |= SPR_HMER_SPURR_SCALE_LIMIT;
		if (cpu_is_thread0(cpu) && hmi_evt) {
			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
			queue_hmi_event(hmi_evt, recover, out_flags);
		}
		hmi_print_debug("Turbo versus nominal frequency exceeded limit.", hmer);
	}

	/* Assert if we see malfunction alert, we can not continue. */
	if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
		handled |= SPR_HMER_MALFUNCTION_ALERT;

		hmi_print_debug("Malfunction Alert", hmer);
		recover = 0;
		if (hmi_evt)
			decode_malfunction(hmi_evt, out_flags);
	}

	/* Assert if we see Hypervisor resource error, we can not continue. */
	if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_HYP_RESOURCE_ERR)) {
		handled |= SPR_HMER_HYP_RESOURCE_ERR;

		hmi_print_debug("Hypervisor resource error", hmer);
		recover = 0;
		if (hmi_evt) {
			hmi_evt->severity = OpalHMI_SEV_FATAL;
			hmi_evt->type = OpalHMI_ERROR_HYP_RESOURCE;
			queue_hmi_event(hmi_evt, recover, out_flags);
		}
	}

	/* XXX: what to do with this? */
	if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_THD_WAKE_BLOCKED_TM_SUSPEND)) {
		handled |= SPR_HMER_THD_WAKE_BLOCKED_TM_SUSPEND;
		hmer &= ~SPR_HMER_THD_WAKE_BLOCKED_TM_SUSPEND;

		hmi_print_debug("Attempted to wake thread when threads in TM suspend mode.", hmer);
		if (hmi_evt) {
			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
			queue_hmi_event(hmi_evt, recover, out_flags);
		}
	}

	if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_TRIG_FIR_HMI)) {
		handled |= SPR_HMER_TRIG_FIR_HMI;
		hmer &= ~SPR_HMER_TRIG_FIR_HMI;

		hmi_print_debug("Clearing unknown debug trigger", hmer);
		if (hmi_evt) {
			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
			hmi_evt->type = OpalHMI_ERROR_DEBUG_TRIG_FIR;
			queue_hmi_event(hmi_evt, recover, out_flags);
		}
	}
	if ((proc_gen == proc_gen_p10 || proc_gen == proc_gen_p11)
			&& (hmer & SPR_HMER_P10_TRIG_FIR_HMI)) {
		handled |= SPR_HMER_P10_TRIG_FIR_HMI;
		hmer &= ~SPR_HMER_P10_TRIG_FIR_HMI;

		hmi_print_debug("Clearing unknown debug trigger", hmer);
		if (hmi_evt) {
			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
			hmi_evt->type = OpalHMI_ERROR_DEBUG_TRIG_FIR;
			queue_hmi_event(hmi_evt, recover, out_flags);
		}
	}

	if (recover == 0)
		disable_fast_reboot("Unrecoverable HMI");
	/*
	 * HMER bits are sticky, once set to 1 they remain set to 1 until
	 * they are set to 0. Reset the error source bit to 0, otherwise
	 * we keep getting HMI interrupt again and again. Writing to HMER
	 * acts as an AND, so we write mask of all 1's except for the bits
	 * we want to clear.
	 */
	mtspr(SPR_HMER, ~handled);
	unlock(&hmi_lock);
	return recover;
}
|
||
|
||
static int64_t opal_handle_hmi(void)
|
||
{
|
||
uint64_t hmer, dummy_flags;
|
||
struct OpalHMIEvent hmi_evt;
|
||
|
||
/*
|
||
* Compiled time check to see size of OpalHMIEvent do not exceed
|
||
* that of struct opal_msg.
|
||
*/
|
||
BUILD_ASSERT(sizeof(struct opal_msg) >= sizeof(struct OpalHMIEvent));
|
||
|
||
memset(&hmi_evt, 0, sizeof(struct OpalHMIEvent));
|
||
hmi_evt.version = OpalHMIEvt_V2;
|
||
|
||
hmer = mfspr(SPR_HMER); /* Get HMER register value */
|
||
handle_hmi_exception(hmer, &hmi_evt, &dummy_flags);
|
||
|
||
return OPAL_SUCCESS;
|
||
}
|
||
opal_call(OPAL_HANDLE_HMI, opal_handle_hmi, 0);
|
||
|
||
/*
 * OPAL_HANDLE_HMI2 entry point: like opal_handle_hmi(), but returns the
 * OPAL_HMI_FLAGS_* accumulated during handling to the OS (big-endian).
 */
static int64_t opal_handle_hmi2(__be64 *out_flags)
{
	struct OpalHMIEvent hmi_evt;
	uint64_t flags = 0;

	/*
	 * Compile-time check that an OpalHMIEvent fits inside an opal_msg.
	 */
	BUILD_ASSERT(sizeof(struct opal_msg) >= sizeof(struct OpalHMIEvent));

	memset(&hmi_evt, 0, sizeof(hmi_evt));
	hmi_evt.version = OpalHMIEvt_V2;

	/* Sample HMER, run the full handler and report flags to the OS */
	handle_hmi_exception(mfspr(SPR_HMER), &hmi_evt, &flags);
	*out_flags = cpu_to_be64(flags);

	return OPAL_SUCCESS;
}
|
||
opal_call(OPAL_HANDLE_HMI2, opal_handle_hmi2, 1);
|