On 2024-03-13 5:41, Lijo Lazar wrote:
Check if the device is present in the bus before trying to recover. It
could be that device itself is lost from the bus in some hang
situations.

Signed-off-by: Lijo Lazar <lijo.la...@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++
  1 file changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1e9454e6e4cb..b37113b79483 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5536,6 +5536,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 }
 
+static int amdgpu_device_health_check(struct list_head *device_list_handle)
+{
+       struct amdgpu_device *tmp_adev;
+       int ret = 0;
+       u32 status;
+
+       list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+               pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
+               if (PCI_POSSIBLE_ERROR(status)) {
+                       dev_err(tmp_adev->dev, "device lost from bus!");
+                       ret = -ENODEV;

You could just return here. What's the point of looking for other devices if you're going to return an error anyway?

Regards,
  Felix


+               }
+       }
+
+       return ret;
+}
+
  /**
   * amdgpu_device_gpu_recover - reset the asic and recover scheduler
   *
@@ -5607,6 +5624,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                device_list_handle = &device_list;
        }
 
+       if (!amdgpu_sriov_vf(adev)) {
+               r = amdgpu_device_health_check(device_list_handle);
+               if (r)
+                       goto end_reset;
+       }
+
        /* We need to lock reset domain only once both for XGMI and single device */
        tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
                                    reset_list);
@@ -5772,6 +5795,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                                            reset_list);
        amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+end_reset:
        if (hive) {
                mutex_unlock(&hive->hive_lock);
                amdgpu_put_xgmi_hive(hive);

Reply via email to