CPU0:   hang_detect_work → directly calls reset_work()
CPU1:   evict_all → queues reset_work (via workqueue)

There is a possibility of two reset threads running at the same time.
To avoid that, add a per-queue-manager flag so that a duplicate, concurrent reset exits early.

Signed-off-by: Sunil Khatri <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 16 ++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h |  1 +
 2 files changed, 17 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 0a1fc45f5b4e..1440f51b667f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -109,6 +109,19 @@ static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
        if (!amdgpu_gpu_recovery)
                return;
 
+       /*
+        * Prevent concurrent/duplicate reset executions. Both hang_detect_work
+        * (direct call) and evict_all (via schedule+flush_work) can invoke this
+        * function simultaneously. Use an atomic test-and-set so only the first
+        * caller proceeds; the second exits early.
+        *
+        * Note: amdgpu_in_reset() cannot be used here because in_gpu_reset is
+        * only set deep inside amdgpu_device_gpu_recover(), well after we've
+        * already entered this function.
+        */
+       if (atomic_cmpxchg(&uq_mgr->reset_in_progress, 0, 1) != 0)
+               return;
+
        /*
         * Iterate through all queue types to detect and reset problematic queues
         * Process each queue type in the defined order
@@ -145,6 +158,8 @@ static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
 
                amdgpu_device_gpu_recover(adev, NULL, &reset_context);
        }
+
+       atomic_set(&uq_mgr->reset_in_progress, 0);
 }
 
 static void amdgpu_userq_hang_detect_work(struct work_struct *work)
@@ -1304,6 +1319,7 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
 
        INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
        INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work);
+       atomic_set(&userq_mgr->reset_in_progress, 0);
        return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 49b33e2d6932..2748ecc0f6c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -129,6 +129,7 @@ struct amdgpu_userq_mgr {
         * Reset work which is used when eviction fails.
         */
        struct work_struct              reset_work;
+       atomic_t                        reset_in_progress;
        atomic_t                        userq_count[AMDGPU_RING_TYPE_MAX];
 };
 
-- 
2.34.1

Reply via email to