From 50ba0232fd5312410f1b65247e774244f89a628e Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 18 May 2024 20:50:36 +0200 Subject: Merging upstream version 6.8.9. Signed-off-by: Daniel Baumann --- drivers/accel/ivpu/ivpu_pm.c | 154 +++++++++++++++++++++++++++++++------------ 1 file changed, 111 insertions(+), 43 deletions(-) (limited to 'drivers/accel/ivpu/ivpu_pm.c') diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index e9b16cbc2..a15d30d09 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -13,8 +13,10 @@ #include "ivpu_drv.h" #include "ivpu_hw.h" #include "ivpu_fw.h" +#include "ivpu_fw_log.h" #include "ivpu_ipc.h" #include "ivpu_job.h" +#include "ivpu_jsm_msg.h" #include "ivpu_mmu.h" #include "ivpu_pm.h" @@ -22,6 +24,10 @@ static bool ivpu_disable_recovery; module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644); MODULE_PARM_DESC(disable_recovery, "Disables recovery when VPU hang is detected"); +static unsigned long ivpu_tdr_timeout_ms; +module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644); +MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default"); + #define PM_RESCHEDULE_LIMIT 5 static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev) @@ -52,11 +58,14 @@ static int ivpu_suspend(struct ivpu_device *vdev) { int ret; + /* Save PCI state before powering down as it sometimes gets corrupted if NPU hangs */ + pci_save_state(to_pci_dev(vdev->drm.dev)); + ret = ivpu_shutdown(vdev); - if (ret) { + if (ret) ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret); - return ret; - } + + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); return ret; } @@ -66,30 +75,38 @@ static int ivpu_resume(struct ivpu_device *vdev) int ret; retry: + pci_restore_state(to_pci_dev(vdev->drm.dev)); + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0); + ret = ivpu_hw_power_up(vdev); if (ret) { ivpu_err(vdev, "Failed to power up HW: %d\n", ret); - return ret; + goto err_power_down; } ret = ivpu_mmu_enable(vdev); if (ret) { ivpu_err(vdev, "Failed to resume MMU: %d\n", ret); - ivpu_hw_power_down(vdev); - return ret; + goto err_power_down; } ret = ivpu_boot(vdev); - if (ret) { - ivpu_mmu_disable(vdev); - ivpu_hw_power_down(vdev); - if (!ivpu_fw_is_cold_boot(vdev)) { - ivpu_warn(vdev, "Failed to resume the FW: %d. Retrying cold boot..\n", ret); - ivpu_pm_prepare_cold_boot(vdev); - goto retry; - } else { - ivpu_err(vdev, "Failed to resume the FW: %d\n", ret); - } + if (ret) + goto err_mmu_disable; + + return 0; + +err_mmu_disable: + ivpu_mmu_disable(vdev); +err_power_down: + ivpu_hw_power_down(vdev); + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); + + if (!ivpu_fw_is_cold_boot(vdev)) { + ivpu_pm_prepare_cold_boot(vdev); + goto retry; + } else { + ivpu_err(vdev, "Failed to resume the FW: %d\n", ret); } return ret; @@ -102,22 +119,37 @@ static void ivpu_pm_recovery_work(struct work_struct *work) char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL}; int ret; -retry: - ret = pci_try_reset_function(to_pci_dev(vdev->drm.dev)); - if (ret == -EAGAIN && !drm_dev_is_unplugged(&vdev->drm)) { - cond_resched(); - goto retry; - } + ivpu_err(vdev, "Recovering the VPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter)); + + ret = pm_runtime_resume_and_get(vdev->drm.dev); + if (ret) + ivpu_err(vdev, "Failed to resume VPU: %d\n", ret); + + ivpu_fw_log_dump(vdev); + + atomic_inc(&vdev->pm->reset_counter); + atomic_set(&vdev->pm->reset_pending, 1); + down_write(&vdev->pm->reset_lock); + + ivpu_suspend(vdev); + ivpu_pm_prepare_cold_boot(vdev); + ivpu_jobs_abort_all(vdev); + + ret = ivpu_resume(vdev); + if (ret) + ivpu_err(vdev, "Failed to resume NPU: %d\n", ret); - if (ret && ret != -EAGAIN) - ivpu_err(vdev, "Failed to reset VPU: %d\n", ret); + up_write(&vdev->pm->reset_lock); + atomic_set(&vdev->pm->reset_pending, 0); kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt); + pm_runtime_mark_last_busy(vdev->drm.dev); + pm_runtime_put_autosuspend(vdev->drm.dev); } -void ivpu_pm_schedule_recovery(struct ivpu_device *vdev) +void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason) { - struct ivpu_pm_info *pm = vdev->pm; + ivpu_err(vdev, "Recovery triggered by %s\n", reason); if (ivpu_disable_recovery) { ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n"); @@ -129,13 +161,35 @@ void ivpu_pm_schedule_recovery(struct ivpu_device *vdev) return; } - /* Schedule recovery if it's not in progress */ - if (atomic_cmpxchg(&pm->in_reset, 0, 1) == 0) { - ivpu_hw_irq_disable(vdev); - queue_work(system_long_wq, &pm->recovery_work); + /* Trigger recovery if it's not in progress */ + if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) { + ivpu_hw_diagnose_failure(vdev); + ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */ + queue_work(system_long_wq, &vdev->pm->recovery_work); } } +static void ivpu_job_timeout_work(struct work_struct *work) +{ + struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work); + struct ivpu_device *vdev = pm->vdev; + + ivpu_pm_trigger_recovery(vdev, "TDR"); +} + +void ivpu_start_job_timeout_detection(struct ivpu_device *vdev) +{ + unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr; + + /* No-op if already queued */ + queue_delayed_work(system_wq, &vdev->pm->job_timeout_work, msecs_to_jiffies(timeout_ms)); +} + +void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev) +{ + cancel_delayed_work_sync(&vdev->pm->job_timeout_work); +} + int ivpu_pm_suspend_cb(struct device *dev) { struct drm_device *drm = dev_get_drvdata(dev); @@ -153,12 +207,11 @@ int ivpu_pm_suspend_cb(struct device *dev) } } + ivpu_jsm_pwr_d0i3_enter(vdev); + ivpu_suspend(vdev); ivpu_pm_prepare_warm_boot(vdev); - pci_save_state(to_pci_dev(dev)); - pci_set_power_state(to_pci_dev(dev), PCI_D3hot); - ivpu_dbg(vdev, PM, "Suspend done.\n"); return 0; @@ -172,9 +225,6 @@ int ivpu_pm_resume_cb(struct device *dev) ivpu_dbg(vdev, PM, "Resume..\n"); - pci_set_power_state(to_pci_dev(dev), PCI_D0); - pci_restore_state(to_pci_dev(dev)); - ret = ivpu_resume(vdev); if (ret) ivpu_err(vdev, "Failed to resume: %d\n", ret); @@ -188,8 +238,12 @@ int ivpu_pm_runtime_suspend_cb(struct device *dev) { struct drm_device *drm = dev_get_drvdata(dev); struct ivpu_device *vdev = to_ivpu_device(drm); + bool hw_is_idle = true; int ret; + drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa)); + drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work)); + ivpu_dbg(vdev, PM, "Runtime suspend..\n"); if (!ivpu_hw_is_idle(vdev) && vdev->pm->suspend_reschedule_counter) { @@ -200,12 +254,18 @@ int ivpu_pm_runtime_suspend_cb(struct device *dev) return -EAGAIN; } + if (!vdev->pm->suspend_reschedule_counter) + hw_is_idle = false; + else if (ivpu_jsm_pwr_d0i3_enter(vdev)) + hw_is_idle = false; + ret = ivpu_suspend(vdev); if (ret) ivpu_err(vdev, "Failed to set suspend VPU: %d\n", ret); - if (!vdev->pm->suspend_reschedule_counter) { - ivpu_warn(vdev, "VPU failed to enter idle, force suspended.\n"); + if (!hw_is_idle) { + ivpu_err(vdev, "VPU failed to enter idle, force suspended.\n"); + ivpu_fw_log_dump(vdev); ivpu_pm_prepare_cold_boot(vdev); } else { ivpu_pm_prepare_warm_boot(vdev); @@ -266,11 +326,12 @@ void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev) { struct ivpu_device *vdev = pci_get_drvdata(pdev); - pm_runtime_get_sync(vdev->drm.dev); - ivpu_dbg(vdev, PM, "Pre-reset..\n"); atomic_inc(&vdev->pm->reset_counter); - atomic_set(&vdev->pm->in_reset, 1); + atomic_set(&vdev->pm->reset_pending, 1); + + pm_runtime_get_sync(vdev->drm.dev); + down_write(&vdev->pm->reset_lock); ivpu_prepare_for_reset(vdev); ivpu_hw_reset(vdev); ivpu_pm_prepare_cold_boot(vdev); @@ -287,9 +348,11 @@ void ivpu_pm_reset_done_cb(struct pci_dev *pdev) ret = ivpu_resume(vdev); if (ret) ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret); - atomic_set(&vdev->pm->in_reset, 0); + up_write(&vdev->pm->reset_lock); + atomic_set(&vdev->pm->reset_pending, 0); ivpu_dbg(vdev, PM, "Post-reset done.\n"); + pm_runtime_mark_last_busy(vdev->drm.dev); pm_runtime_put_autosuspend(vdev->drm.dev); } @@ -302,8 +365,12 @@ void ivpu_pm_init(struct ivpu_device *vdev) pm->vdev = vdev; pm->suspend_reschedule_counter = PM_RESCHEDULE_LIMIT; - atomic_set(&pm->in_reset, 0); + init_rwsem(&pm->reset_lock); + atomic_set(&pm->reset_pending, 0); + atomic_set(&pm->reset_counter, 0); + INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work); + INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work); if (ivpu_disable_recovery) delay = -1; @@ -318,6 +385,7 @@ void ivpu_pm_init(struct ivpu_device *vdev) void ivpu_pm_cancel_recovery(struct ivpu_device *vdev) { + drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work)); cancel_work_sync(&vdev->pm->recovery_work); } -- cgit v1.2.3