From 50ba0232fd5312410f1b65247e774244f89a628e Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 18 May 2024 20:50:36 +0200 Subject: Merging upstream version 6.8.9. Signed-off-by: Daniel Baumann --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 146 ++++++++++++++++++++++++++--- 1 file changed, 134 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7f48c7ec41..d0afb9ba37 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -162,6 +162,65 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, static DEVICE_ATTR(pcie_replay_count, 0444, amdgpu_device_get_pcie_replay_count, NULL); +static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t ppos, size_t count) +{ + struct device *dev = kobj_to_dev(kobj); + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(ddev); + ssize_t bytes_read; + + switch (ppos) { + case AMDGPU_SYS_REG_STATE_XGMI: + bytes_read = amdgpu_asic_get_reg_state( + adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); + break; + case AMDGPU_SYS_REG_STATE_WAFL: + bytes_read = amdgpu_asic_get_reg_state( + adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); + break; + case AMDGPU_SYS_REG_STATE_PCIE: + bytes_read = amdgpu_asic_get_reg_state( + adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); + break; + case AMDGPU_SYS_REG_STATE_USR: + bytes_read = amdgpu_asic_get_reg_state( + adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); + break; + case AMDGPU_SYS_REG_STATE_USR_1: + bytes_read = amdgpu_asic_get_reg_state( + adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); + break; + default: + return -EINVAL; + } + + return bytes_read; +} + +BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, + AMDGPU_SYS_REG_STATE_END); + +int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) +{ + int ret; + + if (!amdgpu_asic_get_reg_state_supported(adev)) + return 0; + + ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); + + return ret; +} + +void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) +{ + if (!amdgpu_asic_get_reg_state_supported(adev)) + return; + sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); +} + /** * DOC: board_info * @@ -1541,7 +1600,7 @@ bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) if (adev->mman.keep_stolen_vga_memory) return false; - return adev->ip_versions[DCE_HWIP][0] >= IP_VERSION(3, 0, 0); + return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); } /* @@ -1552,11 +1611,15 @@ bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 */ -static bool amdgpu_device_pcie_dynamic_switching_supported(void) +static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) { #if IS_ENABLED(CONFIG_X86) struct cpuinfo_x86 *c = &cpu_data(0); + /* eGPU change speeds based on USB4 fabric conditions */ + if (dev_is_removable(adev->dev)) + return true; + if (c->x86_vendor == X86_VENDOR_INTEL) return false; #endif @@ -2389,7 +2452,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) adev->pm.pp_feature &= ~PP_GFXOFF_MASK; if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; - if (!amdgpu_device_pcie_dynamic_switching_supported()) + if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; total = true; @@ -2567,7 +2630,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) break; } - r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, + r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, DRM_SCHED_PRIORITY_COUNT, ring->num_hw_submission, 0, timeout, adev->reset_domain->wq, @@ -2670,6 +2733,12 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) goto init_failed; } } + + r = amdgpu_seq64_init(adev); + if (r) { + DRM_ERROR("allocate seq64 failed %d\n", r); + goto init_failed; + } } } @@ -3132,6 +3201,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) amdgpu_device_wb_fini(adev); amdgpu_device_mem_scratch_fini(adev); amdgpu_ib_pool_fini(adev); + amdgpu_seq64_fini(adev); } r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); @@ -4202,6 +4272,7 @@ fence_driver_init: "Could not create amdgpu board attributes\n"); amdgpu_fru_sysfs_init(adev); + amdgpu_reg_state_sysfs_init(adev); if (IS_ENABLED(CONFIG_PERF_EVENTS)) r = amdgpu_pmu_init(adev); @@ -4324,6 +4395,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); amdgpu_fru_sysfs_fini(adev); + amdgpu_reg_state_sysfs_fini(adev); + /* disable ras feature must before hw fini */ amdgpu_ras_pre_fini(adev); @@ -4451,6 +4524,8 @@ int amdgpu_device_prepare(struct drm_device *dev) if (r) goto unprepare; + flush_delayed_work(&adev->gfx.gfx_off_delay_work); + for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.valid) continue; @@ -4954,7 +5029,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev) for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!amdgpu_ring_sched_ready(ring)) continue; spin_lock(&ring->sched.job_list_lock); @@ -5093,7 +5168,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!amdgpu_ring_sched_ready(ring)) continue; /* Clear job fence from fence drv to avoid force_completion @@ -5560,7 +5635,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!amdgpu_ring_sched_ready(ring)) continue; drm_sched_stop(&ring->sched, job ? &job->base : NULL); @@ -5629,7 +5704,7 @@ skip_hw_reset: for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!amdgpu_ring_sched_ready(ring)) continue; drm_sched_start(&ring->sched, true); @@ -5690,6 +5765,39 @@ skip_sched_resume: return r; } +/** + * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner + * + * @adev: amdgpu_device pointer + * @speed: pointer to the speed of the link + * @width: pointer to the width of the link + * + * Evaluate the hierarchy to find the speed and bandwidth capabilities of the + * first physical partner to an AMD dGPU. + * This will exclude any virtual switches and links. + */ +static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, + enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + struct pci_dev *parent = adev->pdev; + + if (!speed || !width) + return; + + *speed = PCI_SPEED_UNKNOWN; + *width = PCIE_LNK_WIDTH_UNKNOWN; + + while ((parent = pci_upstream_bridge(parent))) { + /* skip upstream/downstream switches internal to dGPU*/ + if (parent->vendor == PCI_VENDOR_ID_ATI) + continue; + *speed = pcie_get_speed_cap(parent); + *width = pcie_get_width_cap(parent); + break; + } +} + /** * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot * @@ -5723,8 +5831,8 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) return; - pcie_bandwidth_available(adev->pdev, NULL, - &platform_speed_cap, &platform_link_width); + amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, + &platform_link_width); if (adev->pm.pcie_gen_mask == 0) { /* asic caps */ @@ -5951,7 +6059,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!amdgpu_ring_sched_ready(ring)) continue; drm_sched_stop(&ring->sched, NULL); @@ -6001,6 +6109,20 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) struct amdgpu_reset_context reset_context; u32 memsize; struct list_head device_list; + struct amdgpu_hive_info *hive; + int hive_ras_recovery = 0; + struct amdgpu_ras *ras; + + /* PCI error slot reset should be skipped During RAS recovery */ + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + hive_ras_recovery = atomic_read(&hive->ras_recovery); + amdgpu_put_xgmi_hive(hive); + } + ras = amdgpu_ras_get_context(adev); + if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) && + ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) + return PCI_ERS_RESULT_RECOVERED; DRM_INFO("PCI error: slot reset callback!!\n"); @@ -6079,7 +6201,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev) for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!amdgpu_ring_sched_ready(ring)) continue; drm_sched_start(&ring->sched, true); -- cgit v1.2.3