summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c146
1 files changed, 134 insertions, 12 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7f48c7ec4..d0afb9ba3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -162,6 +162,65 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
static DEVICE_ATTR(pcie_replay_count, 0444,
amdgpu_device_get_pcie_replay_count, NULL);
+static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t ppos, size_t count)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct drm_device *ddev = dev_get_drvdata(dev);
+ struct amdgpu_device *adev = drm_to_adev(ddev);
+ ssize_t bytes_read;
+
+ switch (ppos) {
+ case AMDGPU_SYS_REG_STATE_XGMI:
+ bytes_read = amdgpu_asic_get_reg_state(
+ adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
+ break;
+ case AMDGPU_SYS_REG_STATE_WAFL:
+ bytes_read = amdgpu_asic_get_reg_state(
+ adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
+ break;
+ case AMDGPU_SYS_REG_STATE_PCIE:
+ bytes_read = amdgpu_asic_get_reg_state(
+ adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
+ break;
+ case AMDGPU_SYS_REG_STATE_USR:
+ bytes_read = amdgpu_asic_get_reg_state(
+ adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
+ break;
+ case AMDGPU_SYS_REG_STATE_USR_1:
+ bytes_read = amdgpu_asic_get_reg_state(
+ adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return bytes_read;
+}
+
+BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
+ AMDGPU_SYS_REG_STATE_END);
+
+int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
+{
+ int ret;
+
+ if (!amdgpu_asic_get_reg_state_supported(adev))
+ return 0;
+
+ ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
+
+ return ret;
+}
+
+void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
+{
+ if (!amdgpu_asic_get_reg_state_supported(adev))
+ return;
+ sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
+}
+
/**
* DOC: board_info
*
@@ -1541,7 +1600,7 @@ bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
if (adev->mman.keep_stolen_vga_memory)
return false;
- return adev->ip_versions[DCE_HWIP][0] >= IP_VERSION(3, 0, 0);
+ return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}
/*
@@ -1552,11 +1611,15 @@ bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
* https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
* https://gitlab.freedesktop.org/drm/amd/-/issues/2663
*/
-static bool amdgpu_device_pcie_dynamic_switching_supported(void)
+static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
struct cpuinfo_x86 *c = &cpu_data(0);
+ /* eGPU change speeds based on USB4 fabric conditions */
+ if (dev_is_removable(adev->dev))
+ return true;
+
if (c->x86_vendor == X86_VENDOR_INTEL)
return false;
#endif
@@ -2389,7 +2452,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
- if (!amdgpu_device_pcie_dynamic_switching_supported())
+ if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
total = true;
@@ -2567,7 +2630,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
break;
}
- r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+ r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
DRM_SCHED_PRIORITY_COUNT,
ring->num_hw_submission, 0,
timeout, adev->reset_domain->wq,
@@ -2670,6 +2733,12 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
goto init_failed;
}
}
+
+ r = amdgpu_seq64_init(adev);
+ if (r) {
+ DRM_ERROR("allocate seq64 failed %d\n", r);
+ goto init_failed;
+ }
}
}
@@ -3132,6 +3201,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
amdgpu_device_wb_fini(adev);
amdgpu_device_mem_scratch_fini(adev);
amdgpu_ib_pool_fini(adev);
+ amdgpu_seq64_fini(adev);
}
r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
@@ -4202,6 +4272,7 @@ fence_driver_init:
"Could not create amdgpu board attributes\n");
amdgpu_fru_sysfs_init(adev);
+ amdgpu_reg_state_sysfs_init(adev);
if (IS_ENABLED(CONFIG_PERF_EVENTS))
r = amdgpu_pmu_init(adev);
@@ -4324,6 +4395,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
amdgpu_fru_sysfs_fini(adev);
+ amdgpu_reg_state_sysfs_fini(adev);
+
/* disable ras feature must before hw fini */
amdgpu_ras_pre_fini(adev);
@@ -4451,6 +4524,8 @@ int amdgpu_device_prepare(struct drm_device *dev)
if (r)
goto unprepare;
+ flush_delayed_work(&adev->gfx.gfx_off_delay_work);
+
for (i = 0; i < adev->num_ip_blocks; i++) {
if (!adev->ip_blocks[i].status.valid)
continue;
@@ -4954,7 +5029,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!amdgpu_ring_sched_ready(ring))
continue;
spin_lock(&ring->sched.job_list_lock);
@@ -5093,7 +5168,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!amdgpu_ring_sched_ready(ring))
continue;
/* Clear job fence from fence drv to avoid force_completion
@@ -5560,7 +5635,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!amdgpu_ring_sched_ready(ring))
continue;
drm_sched_stop(&ring->sched, job ? &job->base : NULL);
@@ -5629,7 +5704,7 @@ skip_hw_reset:
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!amdgpu_ring_sched_ready(ring))
continue;
drm_sched_start(&ring->sched, true);
@@ -5691,6 +5766,39 @@ skip_sched_resume:
}
/**
+ * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
+ *
+ * @adev: amdgpu_device pointer
+ * @speed: pointer to the speed of the link
+ * @width: pointer to the width of the link
+ *
+ * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
+ * first physical partner to an AMD dGPU.
+ * This will exclude any virtual switches and links.
+ */
+static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
+ enum pci_bus_speed *speed,
+ enum pcie_link_width *width)
+{
+ struct pci_dev *parent = adev->pdev;
+
+ if (!speed || !width)
+ return;
+
+ *speed = PCI_SPEED_UNKNOWN;
+ *width = PCIE_LNK_WIDTH_UNKNOWN;
+
+ while ((parent = pci_upstream_bridge(parent))) {
+ /* skip upstream/downstream switches internal to dGPU*/
+ if (parent->vendor == PCI_VENDOR_ID_ATI)
+ continue;
+ *speed = pcie_get_speed_cap(parent);
+ *width = pcie_get_width_cap(parent);
+ break;
+ }
+}
+
+/**
* amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
*
* @adev: amdgpu_device pointer
@@ -5723,8 +5831,8 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
return;
- pcie_bandwidth_available(adev->pdev, NULL,
- &platform_speed_cap, &platform_link_width);
+ amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
+ &platform_link_width);
if (adev->pm.pcie_gen_mask == 0) {
/* asic caps */
@@ -5951,7 +6059,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!amdgpu_ring_sched_ready(ring))
continue;
drm_sched_stop(&ring->sched, NULL);
@@ -6001,6 +6109,20 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
struct amdgpu_reset_context reset_context;
u32 memsize;
struct list_head device_list;
+ struct amdgpu_hive_info *hive;
+ int hive_ras_recovery = 0;
+ struct amdgpu_ras *ras;
+
+ /* PCI error slot reset should be skipped During RAS recovery */
+ hive = amdgpu_get_xgmi_hive(adev);
+ if (hive) {
+ hive_ras_recovery = atomic_read(&hive->ras_recovery);
+ amdgpu_put_xgmi_hive(hive);
+ }
+ ras = amdgpu_ras_get_context(adev);
+ if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) &&
+ ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
+ return PCI_ERS_RESULT_RECOVERED;
DRM_INFO("PCI error: slot reset callback!!\n");
@@ -6079,7 +6201,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!amdgpu_ring_sched_ready(ring))
continue;
drm_sched_start(&ring->sched, true);