On 1/20/21 9:12 AM, Horace Chen wrote:
Fix a race when jobs on two rings time out simultaneously.

If two rings time out at the same time,
amdgpu_device_gpu_recover() is re-entered. The
adev->gmc.xgmi.head node then gets grabbed by two local linked
lists, which can leave wild pointers behind while iterating.

Lock the device early to prevent the node from being added to
two different lists.

Also increase karma for the skipped job, since that job also
timed out and should be marked guilty.

Signed-off-by: Horace Chen <horace.c...@amd.com>
---
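
For reference, a minimal user-space sketch of the failure mode (illustrative
only, not part of the patch; the tiny list helpers below just mimic the
kernel list API): a node whose single embedded list head is grabbed by two
lists at once ends up linked only into the second one, so iterating the
first list walks into memory it no longer owns.

#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void list_init(struct list_head *h)
{
	h->next = h->prev = h;
}

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

struct fake_adev {
	int id;
	struct list_head head;	/* stands in for adev->gmc.xgmi.head */
};

int main(void)
{
	struct fake_adev dev = { .id = 0 };
	struct list_head list_a, list_b;

	list_init(&list_a);
	list_init(&list_b);
	list_init(&dev.head);

	/* Two concurrent recoveries each splice the same node onto their
	 * own local list ... */
	list_add_tail(&dev.head, &list_a);
	/* ... and the second add silently rewires the node onto list_b. */
	list_add_tail(&dev.head, &list_b);

	/* list_a still points at the node, but the node's links now lead
	 * into list_b, so an iteration over list_a never finds its way
	 * back to the list head. */
	printf("list_a.next->next == &list_a? %s\n",
	       list_a.next->next == &list_a ? "yes" : "no (corrupted)");
	return 0;
}
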
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 70 +++++++++++++++++++---
  1 file changed, 61 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4d434803fb49..d59d3182ac2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4460,6 +4460,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
        up_write(&adev->reset_sem);
  }
+/*
+ * Lock up a list of amdgpu devices in a hive safely. If this is not a hive
+ * with multiple nodes, it is the same as amdgpu_device_lock_adev.
+ *
+ * Unlock does not require a roll back.
+ */
+static bool amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
+{
+       struct amdgpu_device *tmp_adev = NULL;
+
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               if (!hive) {
+                       dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
+                       return false;
+               }
+               list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+                       if (!amdgpu_device_lock_adev(tmp_adev, hive))
+                               goto roll_back;
+               }
+               return true;
+       } else {
+               return amdgpu_device_lock_adev(adev, hive);
+       }
+roll_back:
+       if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
+               /*
+                * If the lock iteration breaks in the middle of a hive,
+                * there may be a race, or a hive device may have locked
+                * up independently. We may or may not be in trouble, so
+                * try to roll back the locks and give a warning.
+                */
+               dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
+               list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+                       amdgpu_device_unlock_adev(tmp_adev);
+               }
+       }
+       return false;
+}
+
  static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
  {
        struct pci_dev *p = NULL;
@@ -4573,11 +4613,32 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                        DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
                                job ? job->base.id : -1, hive->hive_id);
                        amdgpu_put_xgmi_hive(hive);
+                       if (job)
+                               drm_sched_increase_karma(&job->base);
                        return 0;
                }
                mutex_lock(&hive->hive_lock);
        }
+       /*
+        * Lock the device before we try to operate on the linked list.
+        * If we didn't get the device lock, don't touch the linked list,
+        * since others may be iterating over it.
+        */
+       if (!amdgpu_device_lock_hive_adev(adev, hive)) {
+               dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
+                                       job ? job->base.id : -1);
+
+               if (adev->gmc.xgmi.num_physical_nodes > 1 && !hive)
+                       r = -ENODEV;
+               else
+                       r = 0;


You can just change the amdgpu_device_lock_hive_adev return type to int
instead of duplicating the error-code logic here, maybe returning -EAGAIN
for an actual locking failure.
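
Something along these lines, perhaps (rough, untested sketch reusing the
helpers this patch already adds):

static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev,
					struct amdgpu_hive_info *hive)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive) {
			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
			return -ENODEV;
		}
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			if (!amdgpu_device_lock_adev(tmp_adev, hive))
				goto roll_back;
		}
		return 0;
	}

	return amdgpu_device_lock_adev(adev, hive) ? 0 : -EAGAIN;

roll_back:
	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
		/* Same rollback-and-warn path as in the patch. */
		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head)
			amdgpu_device_unlock_adev(tmp_adev);
	}
	return -EAGAIN;
}

Then amdgpu_device_gpu_recover() can just check the returned code instead of
repeating the num_physical_nodes/hive check:

	r = amdgpu_device_lock_hive_adev(adev, hive);
	if (r) {
		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
			 job ? job->base.id : -1);
		/* Even though we skip this reset, the job still timed out. */
		if (job)
			drm_sched_increase_karma(&job->base);
		/* A plain lock race is not an error for the caller. */
		if (r == -EAGAIN)
			r = 0;
		goto skip_recovery;
	}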

Andrey


+               /* even though we skipped this reset, we still need to mark the job as guilty */
+               if (job)
+                       drm_sched_increase_karma(&job->base);
+               goto skip_recovery;
+       }
+
        /*
         * Build list of devices to reset.
         * In case we are in XGMI hive mode, resort the device list
@@ -4585,8 +4646,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         */
        INIT_LIST_HEAD(&device_list);
        if (adev->gmc.xgmi.num_physical_nodes > 1) {
-               if (!hive)
-                       return -ENODEV;
                if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
                        list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
                device_list_handle = &hive->device_list;
@@ -4597,13 +4656,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
        /* block all schedulers and reset given job's ring */
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-               if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
-                       dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
-                                 job ? job->base.id : -1);
-                       r = 0;
-                       goto skip_recovery;
-               }
-
                /*
                 * Try to put the audio codec into suspend state
                 * before gpu reset started.