amdgpu: rework userq reset work handling

Christian König Mon, 11 May 2026 10:50:49 -0700

On 4/23/26 12:43, Khatri, Sunil wrote:
> 
> On 21-04-2026 06:25 pm, Christian König wrote:
>> It is illegal to schedule reset work from another reset work!
>>
>> Fix this by scheduling the userq reset work directly on the work queue
>> of the reset domain.
>>
>> Not fully tested, I leave that to the IGT test cases.
>>
>> Signed-off-by: Christian König <[email protected]>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  1 -
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  3 +-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 84 +++++++++++-----------
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  | 16 ++++-
>>  4 files changed, 60 insertions(+), 44 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 39894e38fee4..17341e384caf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1191,7 +1191,6 @@ struct amdgpu_device {
>>      bool                            apu_prefer_gtt;
>>  
>>      bool                            userq_halt_for_enforce_isolation;
>> -    struct work_struct              userq_reset_work;
>>      struct amdgpu_uid *uid_info;
>>  
>>      struct amdgpu_uma_carveout_info uma_info;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index b11c4b5fa8fc..cf61be17e061 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3786,7 +3786,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>      }
>>  
>>      INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>> -    INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>>  
>>      amdgpu_coredump_init(adev);
>>  
>> @@ -5477,7 +5476,7 @@ static inline void 
>> amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>>      if (!amdgpu_sriov_vf(adev))
>>              cancel_work(&adev->reset_work);
>>  #endif
>> -    cancel_work(&adev->userq_reset_work);
>> +    amdgpu_userq_mgr_cancel_reset_work(adev);
>>  
>>      if (adev->kfd.dev)
>>              cancel_work(&adev->kfd.reset_work);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>> index 0a4c39d83adc..ad6dac17dd21 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>> @@ -82,19 +82,11 @@ static bool amdgpu_userq_is_reset_type_supported(struct 
>> amdgpu_device *adev,
>>      return false;
>>  }
>>  
>> -static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
>> -{
>> -    if (amdgpu_device_should_recover_gpu(adev)) {
>> -            amdgpu_reset_domain_schedule(adev->reset_domain,
>> -                                         &adev->userq_reset_work);
>> -            /* Wait for the reset job to complete */
>> -            flush_work(&adev->userq_reset_work);
>> -    }
>> -}
>> -
>> -static int
>> -amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>> +static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
>>  {
>> +    struct amdgpu_userq_mgr *uq_mgr =
>> +            container_of(work, struct amdgpu_userq_mgr,
>> +                         reset_work);
>>      struct amdgpu_device *adev = uq_mgr->adev;
>>      const int queue_types[] = {
>>              AMDGPU_RING_TYPE_COMPUTE,
>> @@ -103,12 +95,11 @@ amdgpu_userq_detect_and_reset_queues(struct 
>> amdgpu_userq_mgr *uq_mgr)
>>      };
>>      const int num_queue_types = ARRAY_SIZE(queue_types);
>>      bool gpu_reset = false;
>> -    int r = 0;
>> -    int i;
>> +    int i, r;
>>  
>>      if (unlikely(adev->debug_disable_gpu_ring_reset)) {
>>              dev_err(adev->dev, "userq reset disabled by debug mask\n");
>> -            return 0;
>> +            return;
>>      }
>>  
>>      /*
>> @@ -116,7 +107,7 @@ amdgpu_userq_detect_and_reset_queues(struct 
>> amdgpu_userq_mgr *uq_mgr)
>>       * skip all reset detection logic
>>       */
>>      if (!amdgpu_gpu_recovery)
>> -            return 0;
>> +            return;
>>  
>>      /*
>>       * Iterate through all queue types to detect and reset problematic 
>> queues
>> @@ -141,10 +132,19 @@ amdgpu_userq_detect_and_reset_queues(struct 
>> amdgpu_userq_mgr *uq_mgr)
>>              }
>>      }
>>  
>> -    if (gpu_reset)
>> -            amdgpu_userq_gpu_reset(adev);
>> +    if (gpu_reset) {
>> +            struct amdgpu_reset_context reset_context;
>>  
>> -    return r;
>> +            memset(&reset_context, 0, sizeof(reset_context));
>> +
>> +            reset_context.method = AMD_RESET_METHOD_NONE;
>> +            reset_context.reset_req_dev = adev;
>> +            reset_context.src = AMDGPU_RESET_SRC_USERQ;
>> +            set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>> +            /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
>> +
>> +            amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>> +    }
>>  }
>>  
>>  static void amdgpu_userq_hang_detect_work(struct work_struct *work)
> The function and the work handler for are using the same name and it causes 
> confusion to understand.
> queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
>                            msecs_to_jiffies(timeout_ms)); The queued item 
> here call the work item where the function name is same , so its better if we 
> can keep a different name


Mhm, usually it is good practice for naming the function and the work item 
similary.

Why do you see an issue with that?

Thanks,
Christian.

> 
> Regards
> Sunil Khatri
> 
>> @@ -153,7 +153,11 @@ static void amdgpu_userq_hang_detect_work(struct 
>> work_struct *work)
>>              container_of(work, struct amdgpu_usermode_queue,
>>                           hang_detect_work.work);
>>  
>> -    amdgpu_userq_detect_and_reset_queues(queue->userq_mgr);
>> +    /*
>> +     * Don't schedule the work here! Scheduling or queue work from one reset
>> +     * handler to another is illegal if you don't take extra precautions!
>> +     */
>> +    amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work);
>>  }
>>  
>>  /*
>> @@ -182,8 +186,8 @@ void amdgpu_userq_start_hang_detect_work(struct 
>> amdgpu_usermode_queue *queue)
>>              break;
>>      }
>>  
>> -    schedule_delayed_work(&queue->hang_detect_work,
>> -                 msecs_to_jiffies(timeout_ms));
>> +    queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
>> +                       msecs_to_jiffies(timeout_ms));
>>  }
>>  
>>  void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 
>> doorbell)
>> @@ -1256,28 +1260,13 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr 
>> *uq_mgr)
>>      if (ret) {
>>              drm_file_err(uq_mgr->file,
>>                           "Couldn't unmap all the queues, eviction failed 
>> ret=%d\n", ret);
>> -            amdgpu_userq_detect_and_reset_queues(uq_mgr);
>> +            amdgpu_reset_domain_schedule(uq_mgr->adev->reset_domain,
>> +                                         &uq_mgr->reset_work);
>> +            flush_work(&uq_mgr->reset_work);
> Flush work is called here with userq_mutex held? Is it ok to run for that 
> long time and not sure about it but the flush_work might try to take the 
> userq_mutex again, that was problem initially during reset.
>>      }
>>      return ret;
>>  }
>>  
>> -void amdgpu_userq_reset_work(struct work_struct *work)
>> -{
>> -    struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
>> -                                              userq_reset_work);
>> -    struct amdgpu_reset_context reset_context;
>> -
>> -    memset(&reset_context, 0, sizeof(reset_context));
>> -
>> -    reset_context.method = AMD_RESET_METHOD_NONE;
>> -    reset_context.reset_req_dev = adev;
>> -    reset_context.src = AMDGPU_RESET_SRC_USERQ;
>> -    set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>> -    /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
>> -
>> -    amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>> -}
>> -
>>  static void
>>  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>>  {
>> @@ -1311,9 +1300,24 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr 
>> *userq_mgr, struct drm_file *f
>>      userq_mgr->file = file_priv;
>>  
>>      INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
>> +    INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work);
>>      return 0;
>>  }
>>  
>> +void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev)
>> +{
>> +    struct xarray *xa = &adev->userq_doorbell_xa;
>> +    struct amdgpu_usermode_queue *queue;
>> +    unsigned long flags, queue_id;
>> +
>> +    xa_lock_irqsave(xa, flags);
>> +    xa_for_each(xa, queue_id, queue) {
>> +            cancel_delayed_work(&queue->hang_detect_work);
>> +            cancel_work(&queue->userq_mgr->reset_work);
>> +    }
>> +    xa_unlock_irqrestore(xa, flags);
>> +}
>> +
>>  void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr)
>>  {
>>      cancel_delayed_work_sync(&userq_mgr->resume_work);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>> index 85f460e7c31b..49b33e2d6932 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>> @@ -84,7 +84,13 @@ struct amdgpu_usermode_queue {
>>      u32                     xcp_id;
>>      int                     priority;
>>      struct dentry           *debugfs_queue;
>> -    struct delayed_work hang_detect_work;
>> +
>> +    /**
>> +     * @hang_detect_work:
>> +     *
>> +     * Delayed work which runs when userq_fences time out.
>> +     */
>> +    struct delayed_work     hang_detect_work;
>>      struct kref             refcount;
>>  
>>      struct list_head        userq_va_list;
>> @@ -116,6 +122,13 @@ struct amdgpu_userq_mgr {
>>      struct amdgpu_device            *adev;
>>      struct delayed_work             resume_work;
>>      struct drm_file                 *file;
>> +
>> +    /**
>> +     * @reset_work:
>> +     *
>> +     * Reset work which is used when eviction fails.
>> +     */
>> +    struct work_struct              reset_work;
>>      atomic_t                        userq_count[AMDGPU_RING_TYPE_MAX];
>>  };
>>  
>> @@ -134,6 +147,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void 
>> *data, struct drm_file *filp
>>  int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct 
>> drm_file *file_priv,
>>                        struct amdgpu_device *adev);
>>  
>> +void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev);
>>  void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr);
>>  void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr);
>>

Re: [PATCH 08/11] drm/amdgpu: rework userq reset work handling

Reply via email to