On 4/23/26 12:43, Khatri, Sunil wrote:
>
> On 21-04-2026 06:25 pm, Christian König wrote:
>> It is illegal to schedule reset work from another reset work!
>>
>> Fix this by scheduling the userq reset work directly on the work queue
>> of the reset domain.
>>
>> Not fully tested, I leave that to the IGT test cases.
>>
>> Signed-off-by: Christian König <[email protected]>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 -
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 84 +++++++++++-----------
>> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 16 ++++-
>> 4 files changed, 60 insertions(+), 44 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 39894e38fee4..17341e384caf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1191,7 +1191,6 @@ struct amdgpu_device {
>> bool apu_prefer_gtt;
>>
>> bool userq_halt_for_enforce_isolation;
>> - struct work_struct userq_reset_work;
>> struct amdgpu_uid *uid_info;
>>
>> struct amdgpu_uma_carveout_info uma_info;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index b11c4b5fa8fc..cf61be17e061 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3786,7 +3786,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>> }
>>
>> INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>> - INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>>
>> amdgpu_coredump_init(adev);
>>
>> @@ -5477,7 +5476,7 @@ static inline void
>> amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>> if (!amdgpu_sriov_vf(adev))
>> cancel_work(&adev->reset_work);
>> #endif
>> - cancel_work(&adev->userq_reset_work);
>> + amdgpu_userq_mgr_cancel_reset_work(adev);
>>
>> if (adev->kfd.dev)
>> cancel_work(&adev->kfd.reset_work);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>> index 0a4c39d83adc..ad6dac17dd21 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>> @@ -82,19 +82,11 @@ static bool amdgpu_userq_is_reset_type_supported(struct
>> amdgpu_device *adev,
>> return false;
>> }
>>
>> -static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
>> -{
>> - if (amdgpu_device_should_recover_gpu(adev)) {
>> - amdgpu_reset_domain_schedule(adev->reset_domain,
>> - &adev->userq_reset_work);
>> - /* Wait for the reset job to complete */
>> - flush_work(&adev->userq_reset_work);
>> - }
>> -}
>> -
>> -static int
>> -amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>> +static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
>> {
>> + struct amdgpu_userq_mgr *uq_mgr =
>> + container_of(work, struct amdgpu_userq_mgr,
>> + reset_work);
>> struct amdgpu_device *adev = uq_mgr->adev;
>> const int queue_types[] = {
>> AMDGPU_RING_TYPE_COMPUTE,
>> @@ -103,12 +95,11 @@ amdgpu_userq_detect_and_reset_queues(struct
>> amdgpu_userq_mgr *uq_mgr)
>> };
>> const int num_queue_types = ARRAY_SIZE(queue_types);
>> bool gpu_reset = false;
>> - int r = 0;
>> - int i;
>> + int i, r;
>>
>> if (unlikely(adev->debug_disable_gpu_ring_reset)) {
>> dev_err(adev->dev, "userq reset disabled by debug mask\n");
>> - return 0;
>> + return;
>> }
>>
>> /*
>> @@ -116,7 +107,7 @@ amdgpu_userq_detect_and_reset_queues(struct
>> amdgpu_userq_mgr *uq_mgr)
>> * skip all reset detection logic
>> */
>> if (!amdgpu_gpu_recovery)
>> - return 0;
>> + return;
>>
>> /*
>> * Iterate through all queue types to detect and reset problematic
>> queues
>> @@ -141,10 +132,19 @@ amdgpu_userq_detect_and_reset_queues(struct
>> amdgpu_userq_mgr *uq_mgr)
>> }
>> }
>>
>> - if (gpu_reset)
>> - amdgpu_userq_gpu_reset(adev);
>> + if (gpu_reset) {
>> + struct amdgpu_reset_context reset_context;
>>
>> - return r;
>> + memset(&reset_context, 0, sizeof(reset_context));
>> +
>> + reset_context.method = AMD_RESET_METHOD_NONE;
>> + reset_context.reset_req_dev = adev;
>> + reset_context.src = AMDGPU_RESET_SRC_USERQ;
>> + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>> + /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
>> +
>> + amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>> + }
>> }
>>
>> static void amdgpu_userq_hang_detect_work(struct work_struct *work)
> The function and the work handler for are using the same name and it causes
> confusion to understand.
> queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
> msecs_to_jiffies(timeout_ms)); The queued item
> here call the work item where the function name is same , so its better if we
> can keep a different name
Mhm, usually it is good practice for naming the function and the work item
similary.
Why do you see an issue with that?
Thanks,
Christian.
>
> Regards
> Sunil Khatri
>
>> @@ -153,7 +153,11 @@ static void amdgpu_userq_hang_detect_work(struct
>> work_struct *work)
>> container_of(work, struct amdgpu_usermode_queue,
>> hang_detect_work.work);
>>
>> - amdgpu_userq_detect_and_reset_queues(queue->userq_mgr);
>> + /*
>> + * Don't schedule the work here! Scheduling or queue work from one reset
>> + * handler to another is illegal if you don't take extra precautions!
>> + */
>> + amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work);
>> }
>>
>> /*
>> @@ -182,8 +186,8 @@ void amdgpu_userq_start_hang_detect_work(struct
>> amdgpu_usermode_queue *queue)
>> break;
>> }
>>
>> - schedule_delayed_work(&queue->hang_detect_work,
>> - msecs_to_jiffies(timeout_ms));
>> + queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
>> + msecs_to_jiffies(timeout_ms));
>> }
>>
>> void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32
>> doorbell)
>> @@ -1256,28 +1260,13 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr
>> *uq_mgr)
>> if (ret) {
>> drm_file_err(uq_mgr->file,
>> "Couldn't unmap all the queues, eviction failed
>> ret=%d\n", ret);
>> - amdgpu_userq_detect_and_reset_queues(uq_mgr);
>> + amdgpu_reset_domain_schedule(uq_mgr->adev->reset_domain,
>> + &uq_mgr->reset_work);
>> + flush_work(&uq_mgr->reset_work);
> Flush work is called here with userq_mutex held? Is it ok to run for that
> long time and not sure about it but the flush_work might try to take the
> userq_mutex again, that was problem initially during reset.
>> }
>> return ret;
>> }
>>
>> -void amdgpu_userq_reset_work(struct work_struct *work)
>> -{
>> - struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
>> - userq_reset_work);
>> - struct amdgpu_reset_context reset_context;
>> -
>> - memset(&reset_context, 0, sizeof(reset_context));
>> -
>> - reset_context.method = AMD_RESET_METHOD_NONE;
>> - reset_context.reset_req_dev = adev;
>> - reset_context.src = AMDGPU_RESET_SRC_USERQ;
>> - set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
>> - /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
>> -
>> - amdgpu_device_gpu_recover(adev, NULL, &reset_context);
>> -}
>> -
>> static void
>> amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>> {
>> @@ -1311,9 +1300,24 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr
>> *userq_mgr, struct drm_file *f
>> userq_mgr->file = file_priv;
>>
>> INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
>> + INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work);
>> return 0;
>> }
>>
>> +void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev)
>> +{
>> + struct xarray *xa = &adev->userq_doorbell_xa;
>> + struct amdgpu_usermode_queue *queue;
>> + unsigned long flags, queue_id;
>> +
>> + xa_lock_irqsave(xa, flags);
>> + xa_for_each(xa, queue_id, queue) {
>> + cancel_delayed_work(&queue->hang_detect_work);
>> + cancel_work(&queue->userq_mgr->reset_work);
>> + }
>> + xa_unlock_irqrestore(xa, flags);
>> +}
>> +
>> void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr)
>> {
>> cancel_delayed_work_sync(&userq_mgr->resume_work);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>> index 85f460e7c31b..49b33e2d6932 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>> @@ -84,7 +84,13 @@ struct amdgpu_usermode_queue {
>> u32 xcp_id;
>> int priority;
>> struct dentry *debugfs_queue;
>> - struct delayed_work hang_detect_work;
>> +
>> + /**
>> + * @hang_detect_work:
>> + *
>> + * Delayed work which runs when userq_fences time out.
>> + */
>> + struct delayed_work hang_detect_work;
>> struct kref refcount;
>>
>> struct list_head userq_va_list;
>> @@ -116,6 +122,13 @@ struct amdgpu_userq_mgr {
>> struct amdgpu_device *adev;
>> struct delayed_work resume_work;
>> struct drm_file *file;
>> +
>> + /**
>> + * @reset_work:
>> + *
>> + * Reset work which is used when eviction fails.
>> + */
>> + struct work_struct reset_work;
>> atomic_t userq_count[AMDGPU_RING_TYPE_MAX];
>> };
>>
>> @@ -134,6 +147,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void
>> *data, struct drm_file *filp
>> int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct
>> drm_file *file_priv,
>> struct amdgpu_device *adev);
>>
>> +void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev);
>> void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr);
>> void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr);
>>