On 3/20/26 11:02, Khatri, Sunil wrote:
> 
> On 20-03-2026 03:16 pm, Christian König wrote:
>> On 3/20/26 10:41, Sunil Khatri wrote:
>>> There is a possibility of deadlock when last reference to a queue is
>>> put in certain situations where mutex is already help when calling
>>> the amdgpu_userq_destroy.
>>  From functions
>> As far as I can see that is illegal to begin with. Why are we doing that?
> This is to fix the deadlock that prike shared and many other places where 
> deadlock could still be caused.
> There is a possibility of amdgpu_userq_put being called for last reference 
> from amdgpu_userq_restore_worker or amdgpu_eviction_fence_suspend_worker via 
> amdgpu_evf_mgr_shutdown
> and all these functions already hold the userq_mutex and on last reference 
> when destroy is called it again takes userq_mutex and causing deadlock.
> 
> Thats why when we are dropping the reference we pass the information of the 
> handled could be called with lock already taken and hence the handling.

Well that sounds like completely broken handling.

Why are dropping an userqueu reference while holding the lock in the first 
place?

Regards,
Christian.

> 
> Regards
> Sunil khatri
> 
>>
>> Regards,
>> Christian.
>>
>>> So based on the thread where it could be
>>> locked we pass the locked information in the destroy functionality
>>> to avoid taking the lock again.
>>>
>>> Cc: Liang, Prike <[email protected]>
>>> Suggested-by: Liang, Prike <[email protected]>
>>> Signed-off-by: Sunil Khatri <[email protected]>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c     | 52 +++++++++++++------
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h     |  2 +-
>>>   .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |  4 +-
>>>   3 files changed, 40 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>>> index ced9ade44be4..9482664e9c2c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>>> @@ -617,13 +617,17 @@ amdgpu_userq_get_doorbell_index(struct 
>>> amdgpu_userq_mgr *uq_mgr,
>>>   }
>>>     static int
>>> -amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct 
>>> amdgpu_usermode_queue *queue)
>>> +amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct 
>>> amdgpu_usermode_queue *queue,
>>> +             bool locked)
>>>   {
>>>       struct amdgpu_device *adev = uq_mgr->adev;
>>>       int r = 0;
>>>   -    cancel_delayed_work_sync(&uq_mgr->resume_work);
>>> +    /* It safe to unlock since we are in destroy and the queue ref is only 
>>> this */
>>> +    if (locked)
>>> +        mutex_unlock(&uq_mgr->userq_mutex);
>>>   +    cancel_delayed_work_sync(&uq_mgr->resume_work);
>>>       /* Cancel any pending hang detection work and cleanup */
>>>       cancel_delayed_work_sync(&queue->hang_detect_work);
>>>   @@ -657,13 +661,27 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr 
>>> *uq_mgr, struct amdgpu_usermode_que
>>>           queue->state = AMDGPU_USERQ_STATE_HUNG;
>>>       }
>>>       amdgpu_userq_cleanup(queue);
>>> -    mutex_unlock(&uq_mgr->userq_mutex);
>>> +
>>> +    if (!locked)
>>> +        mutex_unlock(&uq_mgr->userq_mutex);
>>>         pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
>>>         return r;
>>>   }
>>>   +static void amdgpu_userq_kref_destroy_locked(struct kref *kref)
>>> +{
>>> +    int r;
>>> +    struct amdgpu_usermode_queue *queue =
>>> +        container_of(kref, struct amdgpu_usermode_queue, refcount);
>>> +    struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
>>> +
>>> +    r = amdgpu_userq_destroy(uq_mgr, queue, true);
>>> +    if (r)
>>> +        drm_file_err(uq_mgr->file, "Failed to destroy usermode queue 
>>> %d\n", r);
>>> +}
>>> +
>>>   static void amdgpu_userq_kref_destroy(struct kref *kref)
>>>   {
>>>       int r;
>>> @@ -671,7 +689,7 @@ static void amdgpu_userq_kref_destroy(struct kref *kref)
>>>           container_of(kref, struct amdgpu_usermode_queue, refcount);
>>>       struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
>>>   -    r = amdgpu_userq_destroy(uq_mgr, queue);
>>> +    r = amdgpu_userq_destroy(uq_mgr, queue, false);
>>>       if (r)
>>>           drm_file_err(uq_mgr->file, "Failed to destroy usermode queue 
>>> %d\n", r);
>>>   }
>>> @@ -689,10 +707,14 @@ struct amdgpu_usermode_queue *amdgpu_userq_get(struct 
>>> amdgpu_userq_mgr *uq_mgr,
>>>       return queue;
>>>   }
>>>   -void amdgpu_userq_put(struct amdgpu_usermode_queue *queue)
>>> +void amdgpu_userq_put(struct amdgpu_usermode_queue *queue, bool locked)
>>>   {
>>> -    if (queue)
>>> -        kref_put(&queue->refcount, amdgpu_userq_kref_destroy);
>>> +    if (queue) {
>>> +        if (locked)
>>> +            kref_put(&queue->refcount, amdgpu_userq_kref_destroy_locked);
>>> +        else
>>> +            kref_put(&queue->refcount, amdgpu_userq_kref_destroy);
>>> +    }
>>>   }
>>>     static int amdgpu_userq_priority_permit(struct drm_file *filp,
>>> @@ -978,7 +1000,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void 
>>> *data,
>>>           if (!queue)
>>>               return -ENOENT;
>>>   -        amdgpu_userq_put(queue);
>>> +        amdgpu_userq_put(queue, false);
>>>           break;
>>>       }
>>>   @@ -1007,7 +1029,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr 
>>> *uq_mgr)
>>>               drm_file_err(uq_mgr->file,
>>>                        "trying restore queue without va mapping\n");
>>>               queue->state = AMDGPU_USERQ_STATE_INVALID_VA;
>>> -            amdgpu_userq_put(queue);
>>> +            amdgpu_userq_put(queue, true);
>>>               continue;
>>>           }
>>>   @@ -1015,7 +1037,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr 
>>> *uq_mgr)
>>>           if (r)
>>>               ret = r;
>>>   -        amdgpu_userq_put(queue);
>>> +        amdgpu_userq_put(queue, true);
>>>       }
>>>         if (ret)
>>> @@ -1258,7 +1280,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr 
>>> *uq_mgr)
>>>           r = amdgpu_userq_preempt_helper(queue);
>>>           if (r)
>>>               ret = r;
>>> -        amdgpu_userq_put(queue);
>>> +        amdgpu_userq_put(queue, true);
>>>       }
>>>         if (ret)
>>> @@ -1298,17 +1320,17 @@ amdgpu_userq_wait_for_signal(struct 
>>> amdgpu_userq_mgr *uq_mgr)
>>>           struct dma_fence *f = queue->last_fence;
>>>             if (!f || dma_fence_is_signaled(f)) {
>>> -            amdgpu_userq_put(queue);
>>> +            amdgpu_userq_put(queue, true);
>>>               continue;
>>>           }
>>>           ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
>>>           if (ret <= 0) {
>>>               drm_file_err(uq_mgr->file, "Timed out waiting for 
>>> fence=%llu:%llu\n",
>>>                        f->context, f->seqno);
>>> -            amdgpu_userq_put(queue);
>>> +            amdgpu_userq_put(queue, true);
>>>               return -ETIMEDOUT;
>>>           }
>>> -        amdgpu_userq_put(queue);
>>> +        amdgpu_userq_put(queue, true);
>>>       }
>>>         return 0;
>>> @@ -1366,7 +1388,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr 
>>> *userq_mgr)
>>>           if (!queue)
>>>               break;
>>>   -        amdgpu_userq_put(queue);
>>> +        amdgpu_userq_put(queue, false);
>>>       }
>>>         xa_destroy(&userq_mgr->userq_xa);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>>> index f0abc16d02cc..2a496e74ec6a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>>> @@ -116,7 +116,7 @@ struct amdgpu_db_info {
>>>   };
>>>     struct amdgpu_usermode_queue *amdgpu_userq_get(struct amdgpu_userq_mgr 
>>> *uq_mgr, u32 qid);
>>> -void amdgpu_userq_put(struct amdgpu_usermode_queue *queue);
>>> +void amdgpu_userq_put(struct amdgpu_usermode_queue *queue, bool locked);
>>>     int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct 
>>> drm_file *filp);
>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
>>> index 18390d37a7e0..10e08cb6bd13 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
>>> @@ -612,7 +612,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, 
>>> void *data,
>>>       kfree(syncobj_handles);
>>>         if (queue)
>>> -        amdgpu_userq_put(queue);
>>> +        amdgpu_userq_put(queue, false);
>>>         return r;
>>>   }
>>> @@ -914,7 +914,7 @@ amdgpu_userq_wait_return_fence_info(struct drm_file 
>>> *filp,
>>>           r = 0;
>>>     put_waitq:
>>> -    amdgpu_userq_put(waitq);
>>> +    amdgpu_userq_put(waitq, false);
>>>     free_fences:
>>>       while (num_fences--)

Reply via email to