From dc50eab76b709d68175a358d6e23a5a3890764d3 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 18 May 2024 19:39:57 +0200 Subject: Merging upstream version 6.7.7. Signed-off-by: Daniel Baumann --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 45 ++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index ffb49b2d53..19161916ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -28,6 +28,7 @@ #include #include "ta_ras_if.h" #include "amdgpu_ras_eeprom.h" +#include "amdgpu_smuio.h" struct amdgpu_iv_entry; @@ -319,6 +320,12 @@ enum amdgpu_ras_ret { AMDGPU_RAS_PT, }; +enum amdgpu_ras_error_query_mode { + AMDGPU_RAS_INVALID_ERROR_QUERY = 0, + AMDGPU_RAS_DIRECT_ERROR_QUERY = 1, + AMDGPU_RAS_FIRMWARE_ERROR_QUERY = 2, +}; + /* ras error status reisger fields */ #define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT 0x0 #define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK 0x00000001L @@ -389,9 +396,12 @@ struct amdgpu_ras { /* ras infrastructure */ /* for ras itself. */ uint32_t features; + uint32_t schema; struct list_head head; /* sysfs */ struct device_attribute features_attr; + struct device_attribute version_attr; + struct device_attribute schema_attr; struct bin_attribute badpages_attr; struct dentry *de_ras_eeprom_table; /* block array */ @@ -430,23 +440,41 @@ struct amdgpu_ras { /* Indicates smu whether need update bad channel info */ bool update_channel_flag; + /* Record status of smu mca debug mode */ + bool is_mca_debug_mode; /* Record special requirements of gpu reset caller */ uint32_t gpu_reset_flags; }; struct ras_fs_data { - char sysfs_name[32]; + char sysfs_name[48]; char debugfs_name[32]; }; +struct ras_err_info { + struct amdgpu_smuio_mcm_config_info mcm_info; + u64 ce_count; + u64 ue_count; +}; + +struct ras_err_node { + struct list_head node; + struct ras_err_info err_info; +}; + struct ras_err_data { unsigned long ue_count; unsigned long ce_count; unsigned long err_addr_cnt; struct eeprom_table_record *err_addr; + u32 err_list_count; + struct list_head err_node_list; }; +#define for_each_ras_error(err_node, err_data) \ + list_for_each_entry(err_node, &(err_data)->err_node_list, node) + struct ras_err_handler_data { /* point to bad page records array */ struct eeprom_table_record *bps; @@ -691,6 +719,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev); int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info); +int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, + enum amdgpu_ras_block block); int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, enum amdgpu_ras_block block); @@ -743,6 +773,11 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); +void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); +bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev); +bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, + unsigned int *mode); + int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object *ras_block_obj); void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev); @@ -767,4 +802,12 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, const struct amdgpu_ras_err_status_reg_entry *reg_list, uint32_t reg_list_size, uint32_t instance); + +int amdgpu_ras_error_data_init(struct ras_err_data *err_data); +void amdgpu_ras_error_data_fini(struct ras_err_data *err_data); +int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, + struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count); +int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, + struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count); + #endif -- cgit v1.2.3