amdgpu: rework userq reset work handling

Khatri, Sunil Mon, 11 May 2026 10:59:14 -0700


On 11-05-2026 11:20 pm, Christian König wrote:

On 4/23/26 12:43, Khatri, Sunil wrote:

On 21-04-2026 06:25 pm, Christian König wrote:

It is illegal to schedule reset work from another reset work!

Fix this by scheduling the userq reset work directly on the work queue
of the reset domain.

Not fully tested, I leave that to the IGT test cases.

Signed-off-by: Christian König <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  1 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  3 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 84 +++++++++++-----------
  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  | 16 ++++-
  4 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 39894e38fee4..17341e384caf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1191,7 +1191,6 @@ struct amdgpu_device {
        bool                            apu_prefer_gtt;

bool userq_halt_for_enforce_isolation;

-       struct work_struct              userq_reset_work;
        struct amdgpu_uid *uid_info;

struct amdgpu_uma_carveout_info uma_info;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b11c4b5fa8fc..cf61be17e061 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3786,7 +3786,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        }

INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

-       INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);

amdgpu_coredump_init(adev);@@ -5477,7 +5476,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)

        if (!amdgpu_sriov_vf(adev))
                cancel_work(&adev->reset_work);
  #endif
-       cancel_work(&adev->userq_reset_work);
+       amdgpu_userq_mgr_cancel_reset_work(adev);

if (adev->kfd.dev)

                cancel_work(&adev->kfd.reset_work);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 0a4c39d83adc..ad6dac17dd21 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -82,19 +82,11 @@ static bool amdgpu_userq_is_reset_type_supported(struct 
amdgpu_device *adev,
        return false;
  }

-static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)

-{
-       if (amdgpu_device_should_recover_gpu(adev)) {
-               amdgpu_reset_domain_schedule(adev->reset_domain,
-                                            &adev->userq_reset_work);
-               /* Wait for the reset job to complete */
-               flush_work(&adev->userq_reset_work);
-       }
-}
-
-static int
-amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
+static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
  {
+       struct amdgpu_userq_mgr *uq_mgr =
+               container_of(work, struct amdgpu_userq_mgr,
+                            reset_work);
        struct amdgpu_device *adev = uq_mgr->adev;
        const int queue_types[] = {
                AMDGPU_RING_TYPE_COMPUTE,
@@ -103,12 +95,11 @@ amdgpu_userq_detect_and_reset_queues(struct 
amdgpu_userq_mgr *uq_mgr)
        };
        const int num_queue_types = ARRAY_SIZE(queue_types);
        bool gpu_reset = false;
-       int r = 0;
-       int i;
+       int i, r;

if (unlikely(adev->debug_disable_gpu_ring_reset)) {

                dev_err(adev->dev, "userq reset disabled by debug mask\n");
-               return 0;
+               return;
        }

/*

@@ -116,7 +107,7 @@ amdgpu_userq_detect_and_reset_queues(struct 
amdgpu_userq_mgr *uq_mgr)
         * skip all reset detection logic
         */
        if (!amdgpu_gpu_recovery)
-               return 0;
+               return;

/*

         * Iterate through all queue types to detect and reset problematic 
queues
@@ -141,10 +132,19 @@ amdgpu_userq_detect_and_reset_queues(struct 
amdgpu_userq_mgr *uq_mgr)
                }
        }

- if (gpu_reset)

-               amdgpu_userq_gpu_reset(adev);
+       if (gpu_reset) {
+               struct amdgpu_reset_context reset_context;

- return r;

+               memset(&reset_context, 0, sizeof(reset_context));
+
+               reset_context.method = AMD_RESET_METHOD_NONE;
+               reset_context.reset_req_dev = adev;
+               reset_context.src = AMDGPU_RESET_SRC_USERQ;
+               set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+               /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+               amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+       }
  }

static void amdgpu_userq_hang_detect_work(struct work_struct *work)

The function and the work handler for are using the same name and it causes 
confusion to understand.
queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
                            msecs_to_jiffies(timeout_ms)); The queued item here 
call the work item where the function name is same , so its better if we can 
keep a different name

Mhm, usually it is good practice for naming the function and the work item 
similary.

Why do you see an issue with that?

There is no issue but i got confused with hang_detect_work thenamdgpu_userq_hang_detect_work and i guess there are more similarnames... But it's ok, functionality wise it looks good to me.


Reviewed-by: Sunil Khatri <[email protected]>

Regards
Sunil Khatri


Thanks,
Christian.

Regards
Sunil Khatri

@@ -153,7 +153,11 @@ static void amdgpu_userq_hang_detect_work(struct 
work_struct *work)
                container_of(work, struct amdgpu_usermode_queue,
                             hang_detect_work.work);

- amdgpu_userq_detect_and_reset_queues(queue->userq_mgr);

+       /*
+        * Don't schedule the work here! Scheduling or queue work from one reset
+        * handler to another is illegal if you don't take extra precautions!
+        */
+       amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work);
  }

/*

@@ -182,8 +186,8 @@ void amdgpu_userq_start_hang_detect_work(struct 
amdgpu_usermode_queue *queue)
                break;
        }

- schedule_delayed_work(&queue->hang_detect_work,

-                    msecs_to_jiffies(timeout_ms));
+       queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
+                          msecs_to_jiffies(timeout_ms));
  }

void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell)

@@ -1256,28 +1260,13 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
        if (ret) {
                drm_file_err(uq_mgr->file,
                             "Couldn't unmap all the queues, eviction failed 
ret=%d\n", ret);
-               amdgpu_userq_detect_and_reset_queues(uq_mgr);
+               amdgpu_reset_domain_schedule(uq_mgr->adev->reset_domain,
+                                            &uq_mgr->reset_work);
+               flush_work(&uq_mgr->reset_work);

Flush work is called here with userq_mutex held? Is it ok to run for that long 
time and not sure about it but the flush_work might try to take the userq_mutex 
again, that was problem initially during reset.

        }
        return ret;
  }

-void amdgpu_userq_reset_work(struct work_struct *work)

-{
-       struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
-                                                 userq_reset_work);
-       struct amdgpu_reset_context reset_context;
-
-       memset(&reset_context, 0, sizeof(reset_context));
-
-       reset_context.method = AMD_RESET_METHOD_NONE;
-       reset_context.reset_req_dev = adev;
-       reset_context.src = AMDGPU_RESET_SRC_USERQ;
-       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-       /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
-
-       amdgpu_device_gpu_recover(adev, NULL, &reset_context);
-}
-
  static void
  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
  {
@@ -1311,9 +1300,24 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr 
*userq_mgr, struct drm_file *f
        userq_mgr->file = file_priv;

INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);

+       INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work);
        return 0;
  }

+void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev)

+{
+       struct xarray *xa = &adev->userq_doorbell_xa;
+       struct amdgpu_usermode_queue *queue;
+       unsigned long flags, queue_id;
+
+       xa_lock_irqsave(xa, flags);
+       xa_for_each(xa, queue_id, queue) {
+               cancel_delayed_work(&queue->hang_detect_work);
+               cancel_work(&queue->userq_mgr->reset_work);
+       }
+       xa_unlock_irqrestore(xa, flags);
+}
+
  void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr)
  {
        cancel_delayed_work_sync(&userq_mgr->resume_work);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 85f460e7c31b..49b33e2d6932 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -84,7 +84,13 @@ struct amdgpu_usermode_queue {
        u32                     xcp_id;
        int                     priority;
        struct dentry           *debugfs_queue;
-       struct delayed_work hang_detect_work;
+
+       /**
+        * @hang_detect_work:
+        *
+        * Delayed work which runs when userq_fences time out.
+        */
+       struct delayed_work     hang_detect_work;
        struct kref             refcount;

struct list_head userq_va_list;

@@ -116,6 +122,13 @@ struct amdgpu_userq_mgr {
        struct amdgpu_device            *adev;
        struct delayed_work             resume_work;
        struct drm_file                 *file;
+
+       /**
+        * @reset_work:
+        *
+        * Reset work which is used when eviction fails.
+        */
+       struct work_struct              reset_work;
        atomic_t                        userq_count[AMDGPU_RING_TYPE_MAX];
  };

@@ -134,6 +147,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file *filp

  int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file 
*file_priv,
                          struct amdgpu_device *adev);

+void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev);

  void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr);
  void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr);

Re: [PATCH 08/11] drm/amdgpu: rework userq reset work handling

Reply via email to