On 3/20/26 11:53, Khatri, Sunil wrote:
>
> On 20-03-2026 03:43 pm, Christian König wrote:
>>
>> On 3/20/26 11:02, Khatri, Sunil wrote:
>>> On 20-03-2026 03:16 pm, Christian König wrote:
>>>> On 3/20/26 10:41, Sunil Khatri wrote:
>>>>> There is a possibility of deadlock when the last reference to a queue is
>>>>> put in certain situations where the mutex is already held when calling
>>>>> amdgpu_userq_destroy.
>>>> From functions
>>>> As far as I can see that is illegal to begin with. Why are we doing that?
>>> This is to fix the deadlock that prike shared and many other places where
>>> deadlock could still be caused.
>>> There is a possibility of amdgpu_userq_put being called for last reference
>>> from amdgpu_userq_restore_worker or amdgpu_eviction_fence_suspend_worker
>>> via amdgpu_evf_mgr_shutdown
>>> and all these functions already hold the userq_mutex and on last reference
>>> when destroy is called it again takes userq_mutex and causing deadlock.
>>>
>>> That's why, when we are dropping the reference, we pass the information that
>>> the handler could be called with the lock already taken, and hence the special handling.
>> Well that sounds like completely broken handling.
>>
>> Why are we dropping a userq reference while holding the lock in the first
>> place?
> We are doing it within the locked state in most places in the code.
> few examples:
> static void amdgpu_userq_restore_worker(struct work_struct *work)
> {
> struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work);
> struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
> struct dma_fence *ev_fence;
> int ret;
>
> mutex_lock(&uq_mgr->userq_mutex);
> ev_fence = amdgpu_evf_mgr_get_fence(&fpriv->evf_mgr);
> if (!dma_fence_is_signaled(ev_fence))
> goto unlock;
>
> ret = amdgpu_userq_vm_validate(uq_mgr);
> if (ret) {
> drm_file_err(uq_mgr->file, "Failed to validate BOs to restore\n");
> goto unlock;
> }
>
> // Here the restore all is going through all the queues one by one by
> doing get and put
That is ok, the question is why the heck is doing that get/put? See we need
get/put because we are *not* holding the lock.
When we are holding the lock no get/put is needed at all.
Regards,
Christian.
> and doing the restore of each queue. Now, during put, if it is the last reference
> (due to a race with another thread also putting), we will
> call destroy with the lock taken, which causes a deadlock.
> ret = amdgpu_userq_restore_all(uq_mgr);
> if (ret) {
> drm_file_err(uq_mgr->file, "Failed to restore all queues\n");
> goto unlock;
> }
>
> unlock:
> mutex_unlock(&uq_mgr->userq_mutex);
> dma_fence_put(ev_fence);
> }
>
> Another example:
> static void
> amdgpu_eviction_fence_suspend_worker(struct work_struct *work)
> {
> struct amdgpu_eviction_fence_mgr *evf_mgr =
> container_of(work, struct amdgpu_eviction_fence_mgr,
> suspend_work);
> struct amdgpu_fpriv *fpriv =
> container_of(evf_mgr, struct amdgpu_fpriv, evf_mgr);
> struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr;
> struct dma_fence *ev_fence;
> bool cookie;
>
> mutex_lock(&uq_mgr->userq_mutex);
>
> /*
> * This is intentionally after taking the userq_mutex since we do
> * allocate memory while holding this lock, but only after ensuring that
> * the eviction fence is signaled.
> */
> cookie = dma_fence_begin_signalling();
>
> ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr);
>
>
> /* Here in userq evict we do a ref get and put at various places while the
> mutex is already taken */
>
> amdgpu_userq_evict(uq_mgr, !evf_mgr->shutdown);
>
> /*
> * Signaling the eviction fence must be done while holding the
> * userq_mutex. Otherwise we won't resume the queues before issuing the
> * next fence.
> */
> dma_fence_signal(ev_fence);
> dma_fence_end_signalling(cookie);
> dma_fence_put(ev_fence);
> mutex_unlock(&uq_mgr->userq_mutex);
>
> }
>
> Regards
> Sunil.
>
>
>>
>> Regards,
>> Christian.
>>
>>> Regards
>>> Sunil khatri
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> So based on the thread where it could be
>>>>> locked we pass the locked information in the destroy functionality
>>>>> to avoid taking the lock again.
>>>>>
>>>>> Cc: Liang, Prike <[email protected]>
>>>>> Suggested-by: Liang, Prike <[email protected]>
>>>>> Signed-off-by: Sunil Khatri <[email protected]>
>>>>> ---
>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 52 +++++++++++++------
>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 2 +-
>>>>> .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 4 +-
>>>>> 3 files changed, 40 insertions(+), 18 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>>>>> index ced9ade44be4..9482664e9c2c 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>>>>> @@ -617,13 +617,17 @@ amdgpu_userq_get_doorbell_index(struct
>>>>> amdgpu_userq_mgr *uq_mgr,
>>>>> }
>>>>> static int
>>>>> -amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct
>>>>> amdgpu_usermode_queue *queue)
>>>>> +amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct
>>>>> amdgpu_usermode_queue *queue,
>>>>> + bool locked)
>>>>> {
>>>>> struct amdgpu_device *adev = uq_mgr->adev;
>>>>> int r = 0;
>>>>> - cancel_delayed_work_sync(&uq_mgr->resume_work);
>>>>> + /* It safe to unlock since we are in destroy and the queue ref is
>>>>> only this */
>>>>> + if (locked)
>>>>> + mutex_unlock(&uq_mgr->userq_mutex);
>>>>> + cancel_delayed_work_sync(&uq_mgr->resume_work);
>>>>> /* Cancel any pending hang detection work and cleanup */
>>>>> cancel_delayed_work_sync(&queue->hang_detect_work);
>>>>> @@ -657,13 +661,27 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr
>>>>> *uq_mgr, struct amdgpu_usermode_que
>>>>> queue->state = AMDGPU_USERQ_STATE_HUNG;
>>>>> }
>>>>> amdgpu_userq_cleanup(queue);
>>>>> - mutex_unlock(&uq_mgr->userq_mutex);
>>>>> +
>>>>> + if (!locked)
>>>>> + mutex_unlock(&uq_mgr->userq_mutex);
>>>>> pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
>>>>> return r;
>>>>> }
>>>>> +static void amdgpu_userq_kref_destroy_locked(struct kref *kref)
>>>>> +{
>>>>> + int r;
>>>>> + struct amdgpu_usermode_queue *queue =
>>>>> + container_of(kref, struct amdgpu_usermode_queue, refcount);
>>>>> + struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
>>>>> +
>>>>> + r = amdgpu_userq_destroy(uq_mgr, queue, true);
>>>>> + if (r)
>>>>> + drm_file_err(uq_mgr->file, "Failed to destroy usermode queue
>>>>> %d\n", r);
>>>>> +}
>>>>> +
>>>>> static void amdgpu_userq_kref_destroy(struct kref *kref)
>>>>> {
>>>>> int r;
>>>>> @@ -671,7 +689,7 @@ static void amdgpu_userq_kref_destroy(struct kref
>>>>> *kref)
>>>>> container_of(kref, struct amdgpu_usermode_queue, refcount);
>>>>> struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
>>>>> - r = amdgpu_userq_destroy(uq_mgr, queue);
>>>>> + r = amdgpu_userq_destroy(uq_mgr, queue, false);
>>>>> if (r)
>>>>> drm_file_err(uq_mgr->file, "Failed to destroy usermode queue
>>>>> %d\n", r);
>>>>> }
>>>>> @@ -689,10 +707,14 @@ struct amdgpu_usermode_queue
>>>>> *amdgpu_userq_get(struct amdgpu_userq_mgr *uq_mgr,
>>>>> return queue;
>>>>> }
>>>>> -void amdgpu_userq_put(struct amdgpu_usermode_queue *queue)
>>>>> +void amdgpu_userq_put(struct amdgpu_usermode_queue *queue, bool locked)
>>>>> {
>>>>> - if (queue)
>>>>> - kref_put(&queue->refcount, amdgpu_userq_kref_destroy);
>>>>> + if (queue) {
>>>>> + if (locked)
>>>>> + kref_put(&queue->refcount, amdgpu_userq_kref_destroy_locked);
>>>>> + else
>>>>> + kref_put(&queue->refcount, amdgpu_userq_kref_destroy);
>>>>> + }
>>>>> }
>>>>> static int amdgpu_userq_priority_permit(struct drm_file *filp,
>>>>> @@ -978,7 +1000,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void
>>>>> *data,
>>>>> if (!queue)
>>>>> return -ENOENT;
>>>>> - amdgpu_userq_put(queue);
>>>>> + amdgpu_userq_put(queue, false);
>>>>> break;
>>>>> }
>>>>> @@ -1007,7 +1029,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr
>>>>> *uq_mgr)
>>>>> drm_file_err(uq_mgr->file,
>>>>> "trying restore queue without va mapping\n");
>>>>> queue->state = AMDGPU_USERQ_STATE_INVALID_VA;
>>>>> - amdgpu_userq_put(queue);
>>>>> + amdgpu_userq_put(queue, true);
>>>>> continue;
>>>>> }
>>>>> @@ -1015,7 +1037,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr
>>>>> *uq_mgr)
>>>>> if (r)
>>>>> ret = r;
>>>>> - amdgpu_userq_put(queue);
>>>>> + amdgpu_userq_put(queue, true);
>>>>> }
>>>>> if (ret)
>>>>> @@ -1258,7 +1280,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr
>>>>> *uq_mgr)
>>>>> r = amdgpu_userq_preempt_helper(queue);
>>>>> if (r)
>>>>> ret = r;
>>>>> - amdgpu_userq_put(queue);
>>>>> + amdgpu_userq_put(queue, true);
>>>>> }
>>>>> if (ret)
>>>>> @@ -1298,17 +1320,17 @@ amdgpu_userq_wait_for_signal(struct
>>>>> amdgpu_userq_mgr *uq_mgr)
>>>>> struct dma_fence *f = queue->last_fence;
>>>>> if (!f || dma_fence_is_signaled(f)) {
>>>>> - amdgpu_userq_put(queue);
>>>>> + amdgpu_userq_put(queue, true);
>>>>> continue;
>>>>> }
>>>>> ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
>>>>> if (ret <= 0) {
>>>>> drm_file_err(uq_mgr->file, "Timed out waiting for
>>>>> fence=%llu:%llu\n",
>>>>> f->context, f->seqno);
>>>>> - amdgpu_userq_put(queue);
>>>>> + amdgpu_userq_put(queue, true);
>>>>> return -ETIMEDOUT;
>>>>> }
>>>>> - amdgpu_userq_put(queue);
>>>>> + amdgpu_userq_put(queue, true);
>>>>> }
>>>>> return 0;
>>>>> @@ -1366,7 +1388,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr
>>>>> *userq_mgr)
>>>>> if (!queue)
>>>>> break;
>>>>> - amdgpu_userq_put(queue);
>>>>> + amdgpu_userq_put(queue, false);
>>>>> }
>>>>> xa_destroy(&userq_mgr->userq_xa);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>>>>> index f0abc16d02cc..2a496e74ec6a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>>>>> @@ -116,7 +116,7 @@ struct amdgpu_db_info {
>>>>> };
>>>>> struct amdgpu_usermode_queue *amdgpu_userq_get(struct
>>>>> amdgpu_userq_mgr *uq_mgr, u32 qid);
>>>>> -void amdgpu_userq_put(struct amdgpu_usermode_queue *queue);
>>>>> +void amdgpu_userq_put(struct amdgpu_usermode_queue *queue, bool locked);
>>>>> int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct
>>>>> drm_file *filp);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
>>>>> index 18390d37a7e0..10e08cb6bd13 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
>>>>> @@ -612,7 +612,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev,
>>>>> void *data,
>>>>> kfree(syncobj_handles);
>>>>> if (queue)
>>>>> - amdgpu_userq_put(queue);
>>>>> + amdgpu_userq_put(queue, false);
>>>>> return r;
>>>>> }
>>>>> @@ -914,7 +914,7 @@ amdgpu_userq_wait_return_fence_info(struct drm_file
>>>>> *filp,
>>>>> r = 0;
>>>>> put_waitq:
>>>>> - amdgpu_userq_put(waitq);
>>>>> + amdgpu_userq_put(waitq, false);
>>>>> free_fences:
>>>>> while (num_fences--)