summaryrefslogtreecommitdiffstats
path: root/drivers/accel/habanalabs/common
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/accel/habanalabs/common')
-rw-r--r--drivers/accel/habanalabs/common/device.c8
-rw-r--r--drivers/accel/habanalabs/common/firmware_if.c123
-rw-r--r--drivers/accel/habanalabs/common/habanalabs.h15
-rw-r--r--drivers/accel/habanalabs/common/habanalabs_drv.c34
-rw-r--r--drivers/accel/habanalabs/common/habanalabs_ioctl.c53
-rw-r--r--drivers/accel/habanalabs/common/hwmon.c4
-rw-r--r--drivers/accel/habanalabs/common/memory.c7
-rw-r--r--drivers/accel/habanalabs/common/sysfs.c39
8 files changed, 187 insertions, 96 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 83821e1757..a73bd4be94 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1051,10 +1051,12 @@ static int hl_device_eq_heartbeat_check(struct hl_device *hdev)
if (!prop->cpucp_info.eq_health_check_supported)
return 0;
- if (hdev->eq_heartbeat_received)
+ if (hdev->eq_heartbeat_received) {
hdev->eq_heartbeat_received = false;
- else
+ } else {
+ dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
return -EIO;
+ }
return 0;
}
@@ -2038,7 +2040,7 @@ device_reset:
if (ctx)
hl_ctx_put(ctx);
- return hl_device_reset(hdev, flags);
+ return hl_device_reset(hdev, flags | HL_DRV_RESET_HARD);
}
static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask)
diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 47e8384134..3558a6a8e1 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -646,39 +646,27 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
return rc;
}
-static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
- u32 sts_val)
+static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, u32 sts_val)
{
bool err_exists = false;
if (!(err_val & CPU_BOOT_ERR0_ENABLED))
return false;
- if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) {
- dev_err(hdev->dev,
- "Device boot error - DRAM initialization failed\n");
- err_exists = true;
- }
+ if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL)
+ dev_err(hdev->dev, "Device boot error - DRAM initialization failed\n");
- if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) {
+ if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED)
dev_err(hdev->dev, "Device boot error - FIT image corrupted\n");
- err_exists = true;
- }
- if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) {
- dev_err(hdev->dev,
- "Device boot error - Thermal Sensor initialization failed\n");
- err_exists = true;
- }
+ if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL)
+ dev_err(hdev->dev, "Device boot error - Thermal Sensor initialization failed\n");
if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) {
if (hdev->bmc_enable) {
- dev_err(hdev->dev,
- "Device boot error - Skipped waiting for BMC\n");
- err_exists = true;
+ dev_err(hdev->dev, "Device boot error - Skipped waiting for BMC\n");
} else {
- dev_info(hdev->dev,
- "Device boot message - Skipped waiting for BMC\n");
+ dev_info(hdev->dev, "Device boot message - Skipped waiting for BMC\n");
/* This is an info so we don't want it to disable the
* device
*/
@@ -686,48 +674,29 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
}
}
- if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) {
- dev_err(hdev->dev,
- "Device boot error - Serdes data from BMC not available\n");
- err_exists = true;
- }
+ if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY)
+ dev_err(hdev->dev, "Device boot error - Serdes data from BMC not available\n");
- if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) {
- dev_err(hdev->dev,
- "Device boot error - NIC F/W initialization failed\n");
- err_exists = true;
- }
+ if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL)
+ dev_err(hdev->dev, "Device boot error - NIC F/W initialization failed\n");
- if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) {
- dev_err(hdev->dev,
- "Device boot warning - security not ready\n");
- err_exists = true;
- }
+ if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY)
+ dev_err(hdev->dev, "Device boot warning - security not ready\n");
- if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) {
+ if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL)
dev_err(hdev->dev, "Device boot error - security failure\n");
- err_exists = true;
- }
- if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) {
+ if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL)
dev_err(hdev->dev, "Device boot error - eFuse failure\n");
- err_exists = true;
- }
- if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) {
+ if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL)
dev_err(hdev->dev, "Device boot error - Failed to load preboot secondary image\n");
- err_exists = true;
- }
- if (err_val & CPU_BOOT_ERR0_PLL_FAIL) {
+ if (err_val & CPU_BOOT_ERR0_PLL_FAIL)
dev_err(hdev->dev, "Device boot error - PLL failure\n");
- err_exists = true;
- }
- if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL) {
+ if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL)
dev_err(hdev->dev, "Device boot error - Failed to set threshold for temperature sensor\n");
- err_exists = true;
- }
if (err_val & CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL) {
/* Ignore this bit, don't prevent driver loading */
@@ -735,52 +704,32 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
err_val &= ~CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL;
}
- if (err_val & CPU_BOOT_ERR0_BINNING_FAIL) {
+ if (err_val & CPU_BOOT_ERR0_BINNING_FAIL)
dev_err(hdev->dev, "Device boot error - binning failure\n");
- err_exists = true;
- }
if (sts_val & CPU_BOOT_DEV_STS0_ENABLED)
dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val);
+ if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED)
+ dev_err(hdev->dev, "Device boot warning - Skipped DRAM initialization\n");
+
+ if (err_val & CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL)
+ dev_err(hdev->dev, "Device boot error - ARC memory scrub failed\n");
+
+ /* All warnings should go here in order not to reach the unknown error validation */
if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) {
dev_err(hdev->dev, "Device boot error - EEPROM failure detected\n");
err_exists = true;
}
- /* All warnings should go here in order not to reach the unknown error validation */
- if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
- dev_warn(hdev->dev,
- "Device boot warning - Skipped DRAM initialization\n");
- /* This is a warning so we don't want it to disable the
- * device
- */
- err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED;
- }
+ if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL)
+ dev_warn(hdev->dev, "Device boot warning - Failed to load preboot primary image\n");
- if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) {
- dev_warn(hdev->dev,
- "Device boot warning - Failed to load preboot primary image\n");
- /* This is a warning so we don't want it to disable the
- * device as we have a secondary preboot image
- */
- err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL;
- }
-
- if (err_val & CPU_BOOT_ERR0_TPM_FAIL) {
- dev_warn(hdev->dev,
- "Device boot warning - TPM failure\n");
- /* This is a warning so we don't want it to disable the
- * device
- */
- err_val &= ~CPU_BOOT_ERR0_TPM_FAIL;
- }
+ if (err_val & CPU_BOOT_ERR0_TPM_FAIL)
+ dev_warn(hdev->dev, "Device boot warning - TPM failure\n");
- if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) {
- dev_err(hdev->dev,
- "Device boot error - unknown ERR0 error 0x%08x\n", err_val);
+ if (err_val & CPU_BOOT_ERR_FATAL_MASK)
err_exists = true;
- }
/* return error only if it's in the predefined mask */
if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) &
@@ -3295,6 +3244,14 @@ int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_in
HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC);
}
+int hl_fw_get_dev_info_signed(struct hl_device *hdev,
+ struct cpucp_dev_info_signed *dev_info_signed, u32 nonce)
+{
+ return hl_fw_get_sec_attest_data(hdev, CPUCP_PACKET_INFO_SIGNED_GET, dev_info_signed,
+ sizeof(struct cpucp_dev_info_signed), nonce,
+ HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC);
+}
+
int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type sub_opcode,
dma_addr_t buff, u32 *size)
{
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index d0fd77bb6a..41c7aac2ff 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -2547,7 +2547,7 @@ struct hl_state_dump_specs {
* DEVICES
*/
-#define HL_STR_MAX 32
+#define HL_STR_MAX 64
#define HL_DEV_STS_MAX (HL_DEVICE_STATUS_LAST + 1)
@@ -3521,6 +3521,9 @@ struct hl_device {
u8 heartbeat;
};
+/* Retrieve PCI device name in case of a PCI device or dev name in simulator */
+#define HL_DEV_NAME(hdev) \
+ ((hdev)->pdev ? dev_name(&(hdev)->pdev->dev) : "NA-DEVICE")
/**
* struct hl_cs_encaps_sig_handle - encapsulated signals handle structure
@@ -3596,6 +3599,14 @@ static inline bool hl_is_fw_sw_ver_below(struct hl_device *hdev, u32 fw_sw_major
return false;
}
+static inline bool hl_is_fw_sw_ver_equal_or_greater(struct hl_device *hdev, u32 fw_sw_major,
+ u32 fw_sw_minor)
+{
+ return (hdev->fw_sw_major_ver > fw_sw_major ||
+ (hdev->fw_sw_major_ver == fw_sw_major &&
+ hdev->fw_sw_minor_ver >= fw_sw_minor));
+}
+
/*
* Kernel module functions that can be accessed by entire module
*/
@@ -3956,6 +3967,8 @@ long hl_fw_get_max_power(struct hl_device *hdev);
void hl_fw_set_max_power(struct hl_device *hdev);
int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_info *sec_attest_info,
u32 nonce);
+int hl_fw_get_dev_info_signed(struct hl_device *hdev,
+ struct cpucp_dev_info_signed *dev_info_signed, u32 nonce);
int hl_set_voltage(struct hl_device *hdev, int sensor_index, u32 attr, long value);
int hl_set_current(struct hl_device *hdev, int sensor_index, u32 attr, long value);
int hl_set_power(struct hl_device *hdev, int sensor_index, u32 attr, long value);
diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c
index 51fb04bbe3..e542fd40e1 100644
--- a/drivers/accel/habanalabs/common/habanalabs_drv.c
+++ b/drivers/accel/habanalabs/common/habanalabs_drv.c
@@ -673,6 +673,38 @@ static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
return PCI_ERS_RESULT_RECOVERED;
}
+static void hl_pci_reset_prepare(struct pci_dev *pdev)
+{
+ struct hl_device *hdev;
+
+ hdev = pci_get_drvdata(pdev);
+ if (!hdev)
+ return;
+
+ hdev->disabled = true;
+}
+
+static void hl_pci_reset_done(struct pci_dev *pdev)
+{
+ struct hl_device *hdev;
+ u32 flags;
+
+ hdev = pci_get_drvdata(pdev);
+ if (!hdev)
+ return;
+
+ /*
+ * Schedule a thread to trigger hard reset.
+ * The reason for this handler, is for rare cases where the driver is up
+ * and FLR occurs. This is valid only when working with no VM, so FW handles FLR
+ * and resets the device. FW will go back preboot stage, so driver needs to perform
+ * hard reset in order to load FW fit again.
+ */
+ flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW;
+
+ hl_device_reset(hdev, flags);
+}
+
static const struct dev_pm_ops hl_pm_ops = {
.suspend = hl_pmops_suspend,
.resume = hl_pmops_resume,
@@ -682,6 +714,8 @@ static const struct pci_error_handlers hl_pci_err_handler = {
.error_detected = hl_pci_err_detected,
.slot_reset = hl_pci_err_slot_reset,
.resume = hl_pci_err_resume,
+ .reset_prepare = hl_pci_reset_prepare,
+ .reset_done = hl_pci_reset_done,
};
static struct pci_driver hl_pci_driver = {
diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
index a7cd625d82..1dd6e23172 100644
--- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
@@ -19,6 +19,9 @@
#include <asm/msr.h>
+/* make sure there is space for all the signed info */
+static_assert(sizeof(struct cpucp_info) <= SEC_DEV_INFO_BUF_SZ);
+
static u32 hl_debug_struct_size[HL_DEBUG_OP_TIMESTAMP + 1] = {
[HL_DEBUG_OP_ETR] = sizeof(struct hl_debug_params_etr),
[HL_DEBUG_OP_ETF] = sizeof(struct hl_debug_params_etf),
@@ -719,6 +722,53 @@ free_sec_attest_info:
return rc;
}
+static int dev_info_signed(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+ void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+ struct cpucp_dev_info_signed *dev_info_signed;
+ struct hl_info_signed *info;
+ u32 max_size = args->return_size;
+ int rc;
+
+ if ((!max_size) || (!out))
+ return -EINVAL;
+
+ dev_info_signed = kzalloc(sizeof(*dev_info_signed), GFP_KERNEL);
+ if (!dev_info_signed)
+ return -ENOMEM;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ rc = -ENOMEM;
+ goto free_dev_info_signed;
+ }
+
+ rc = hl_fw_get_dev_info_signed(hpriv->hdev,
+ dev_info_signed, args->sec_attest_nonce);
+ if (rc)
+ goto free_info;
+
+ info->nonce = le32_to_cpu(dev_info_signed->nonce);
+ info->info_sig_len = dev_info_signed->info_sig_len;
+ info->pub_data_len = le16_to_cpu(dev_info_signed->pub_data_len);
+ info->certificate_len = le16_to_cpu(dev_info_signed->certificate_len);
+ info->dev_info_len = sizeof(struct cpucp_info);
+ memcpy(&info->info_sig, &dev_info_signed->info_sig, sizeof(info->info_sig));
+ memcpy(&info->public_data, &dev_info_signed->public_data, sizeof(info->public_data));
+ memcpy(&info->certificate, &dev_info_signed->certificate, sizeof(info->certificate));
+ memcpy(&info->dev_info, &dev_info_signed->info, info->dev_info_len);
+
+ rc = copy_to_user(out, info, min_t(size_t, max_size, sizeof(*info))) ? -EFAULT : 0;
+
+free_info:
+ kfree(info);
+free_dev_info_signed:
+ kfree(dev_info_signed);
+
+ return rc;
+}
+
+
static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
int rc;
@@ -1089,6 +1139,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_FW_GENERIC_REQ:
return send_fw_generic_request(hdev, args);
+ case HL_INFO_DEV_SIGNED:
+ return dev_info_signed(hpriv, args);
+
default:
dev_err(dev, "Invalid request %d\n", args->op);
rc = -EINVAL;
diff --git a/drivers/accel/habanalabs/common/hwmon.c b/drivers/accel/habanalabs/common/hwmon.c
index 8598056216..1ee2ee07e9 100644
--- a/drivers/accel/habanalabs/common/hwmon.c
+++ b/drivers/accel/habanalabs/common/hwmon.c
@@ -578,10 +578,6 @@ int hl_get_temperature(struct hl_device *hdev,
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
-
- dev_dbg(hdev->dev, "get temp, ctl 0x%x, sensor %d, type %d\n",
- pkt.ctl, pkt.sensor_index, pkt.type);
-
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
0, &result);
diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c
index 0b8689fe0b..3348ad12c2 100644
--- a/drivers/accel/habanalabs/common/memory.c
+++ b/drivers/accel/habanalabs/common/memory.c
@@ -955,8 +955,8 @@ static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
(i + 1) == phys_pg_pack->npages);
if (rc) {
dev_err(hdev->dev,
- "map failed for handle %u, npages: %llu, mapped: %llu",
- phys_pg_pack->handle, phys_pg_pack->npages,
+ "map failed (%d) for handle %u, npages: %llu, mapped: %llu\n",
+ rc, phys_pg_pack->handle, phys_pg_pack->npages,
mapped_pg_cnt);
goto err;
}
@@ -1186,7 +1186,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device
rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack);
if (rc) {
- dev_err(hdev->dev, "mapping page pack failed for handle %u\n", handle);
+ dev_err(hdev->dev, "mapping page pack failed (%d) for handle %u\n",
+ rc, handle);
mutex_unlock(&hdev->mmu_lock);
goto map_err;
}
diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c
index 2786063730..8a9f988321 100644
--- a/drivers/accel/habanalabs/common/sysfs.c
+++ b/drivers/accel/habanalabs/common/sysfs.c
@@ -8,6 +8,7 @@
#include "habanalabs.h"
#include <linux/pci.h>
+#include <linux/types.h>
static ssize_t clk_max_freq_mhz_show(struct device *dev, struct device_attribute *attr, char *buf)
{
@@ -80,12 +81,27 @@ static ssize_t vrm_ver_show(struct device *dev, struct device_attribute *attr, c
{
struct hl_device *hdev = dev_get_drvdata(dev);
struct cpucp_info *cpucp_info;
+ u32 infineon_second_stage_version;
+ u32 infineon_second_stage_first_instance;
+ u32 infineon_second_stage_second_instance;
+ u32 infineon_second_stage_third_instance;
+ u32 mask = 0xff;
cpucp_info = &hdev->asic_prop.cpucp_info;
+ infineon_second_stage_version = le32_to_cpu(cpucp_info->infineon_second_stage_version);
+ infineon_second_stage_first_instance = infineon_second_stage_version & mask;
+ infineon_second_stage_second_instance =
+ (infineon_second_stage_version >> 8) & mask;
+ infineon_second_stage_third_instance =
+ (infineon_second_stage_version >> 16) & mask;
+
if (cpucp_info->infineon_second_stage_version)
- return sprintf(buf, "%#04x %#04x\n", le32_to_cpu(cpucp_info->infineon_version),
- le32_to_cpu(cpucp_info->infineon_second_stage_version));
+ return sprintf(buf, "%#04x %#04x:%#04x:%#04x\n",
+ le32_to_cpu(cpucp_info->infineon_version),
+ infineon_second_stage_first_instance,
+ infineon_second_stage_second_instance,
+ infineon_second_stage_third_instance);
else
return sprintf(buf, "%#04x\n", le32_to_cpu(cpucp_info->infineon_version));
}
@@ -386,6 +402,21 @@ static ssize_t security_enabled_show(struct device *dev,
return sprintf(buf, "%d\n", hdev->asic_prop.fw_security_enabled);
}
+static ssize_t module_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hl_device *hdev = dev_get_drvdata(dev);
+
+ return sprintf(buf, "%u\n", le32_to_cpu(hdev->asic_prop.cpucp_info.card_location));
+}
+
+static ssize_t parent_device_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct hl_device *hdev = dev_get_drvdata(dev);
+
+ return sprintf(buf, "%s\n", HL_DEV_NAME(hdev));
+}
+
static DEVICE_ATTR_RO(armcp_kernel_ver);
static DEVICE_ATTR_RO(armcp_ver);
static DEVICE_ATTR_RO(cpld_ver);
@@ -405,6 +436,8 @@ static DEVICE_ATTR_RO(thermal_ver);
static DEVICE_ATTR_RO(uboot_ver);
static DEVICE_ATTR_RO(fw_os_ver);
static DEVICE_ATTR_RO(security_enabled);
+static DEVICE_ATTR_RO(module_id);
+static DEVICE_ATTR_RO(parent_device);
static struct bin_attribute bin_attr_eeprom = {
.attr = {.name = "eeprom", .mode = (0444)},
@@ -430,6 +463,8 @@ static struct attribute *hl_dev_attrs[] = {
&dev_attr_uboot_ver.attr,
&dev_attr_fw_os_ver.attr,
&dev_attr_security_enabled.attr,
+ &dev_attr_module_id.attr,
+ &dev_attr_parent_device.attr,
NULL,
};