On 3/20/26 10:41, Sunil Khatri wrote: > There is a possibility of deadlock when the last reference to a queue is > put in certain situations where the mutex is already held when calling > the amdgpu_userq_destroy.
As far as I can see that is illegal to begin with. Why are we doing that? Regards, Christian. > So based on the thread where it could be > locked we pass the locked information in the destroy functionality > to avoid taking the lock again. > > Cc: Liang, Prike <[email protected]> > Suggested-by: Liang, Prike <[email protected]> > Signed-off-by: Sunil Khatri <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 52 +++++++++++++------ > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 2 +- > .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 4 +- > 3 files changed, 40 insertions(+), 18 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > index ced9ade44be4..9482664e9c2c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > @@ -617,13 +617,17 @@ amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr > *uq_mgr, > } > > static int > -amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct > amdgpu_usermode_queue *queue) > +amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct > amdgpu_usermode_queue *queue, > + bool locked) > { > struct amdgpu_device *adev = uq_mgr->adev; > int r = 0; > > - cancel_delayed_work_sync(&uq_mgr->resume_work); > + /* It safe to unlock since we are in destroy and the queue ref is only > this */ > + if (locked) > + mutex_unlock(&uq_mgr->userq_mutex); > > + cancel_delayed_work_sync(&uq_mgr->resume_work); > /* Cancel any pending hang detection work and cleanup */ > cancel_delayed_work_sync(&queue->hang_detect_work); > > @@ -657,13 +661,27 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, > struct amdgpu_usermode_que > queue->state = AMDGPU_USERQ_STATE_HUNG; > } > amdgpu_userq_cleanup(queue); > - mutex_unlock(&uq_mgr->userq_mutex); > + > + if (!locked) > + mutex_unlock(&uq_mgr->userq_mutex); > > pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); > > return r; > } > > +static void 
amdgpu_userq_kref_destroy_locked(struct kref *kref) > +{ > + int r; > + struct amdgpu_usermode_queue *queue = > + container_of(kref, struct amdgpu_usermode_queue, refcount); > + struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr; > + > + r = amdgpu_userq_destroy(uq_mgr, queue, true); > + if (r) > + drm_file_err(uq_mgr->file, "Failed to destroy usermode queue > %d\n", r); > +} > + > static void amdgpu_userq_kref_destroy(struct kref *kref) > { > int r; > @@ -671,7 +689,7 @@ static void amdgpu_userq_kref_destroy(struct kref *kref) > container_of(kref, struct amdgpu_usermode_queue, refcount); > struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr; > > - r = amdgpu_userq_destroy(uq_mgr, queue); > + r = amdgpu_userq_destroy(uq_mgr, queue, false); > if (r) > drm_file_err(uq_mgr->file, "Failed to destroy usermode queue > %d\n", r); > } > @@ -689,10 +707,14 @@ struct amdgpu_usermode_queue *amdgpu_userq_get(struct > amdgpu_userq_mgr *uq_mgr, > return queue; > } > > -void amdgpu_userq_put(struct amdgpu_usermode_queue *queue) > +void amdgpu_userq_put(struct amdgpu_usermode_queue *queue, bool locked) > { > - if (queue) > - kref_put(&queue->refcount, amdgpu_userq_kref_destroy); > + if (queue) { > + if (locked) > + kref_put(&queue->refcount, > amdgpu_userq_kref_destroy_locked); > + else > + kref_put(&queue->refcount, amdgpu_userq_kref_destroy); > + } > } > > static int amdgpu_userq_priority_permit(struct drm_file *filp, > @@ -978,7 +1000,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void > *data, > if (!queue) > return -ENOENT; > > - amdgpu_userq_put(queue); > + amdgpu_userq_put(queue, false); > break; > } > > @@ -1007,7 +1029,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr > *uq_mgr) > drm_file_err(uq_mgr->file, > "trying restore queue without va > mapping\n"); > queue->state = AMDGPU_USERQ_STATE_INVALID_VA; > - amdgpu_userq_put(queue); > + amdgpu_userq_put(queue, true); > continue; > } > > @@ -1015,7 +1037,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr > 
*uq_mgr) > if (r) > ret = r; > > - amdgpu_userq_put(queue); > + amdgpu_userq_put(queue, true); > } > > if (ret) > @@ -1258,7 +1280,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) > r = amdgpu_userq_preempt_helper(queue); > if (r) > ret = r; > - amdgpu_userq_put(queue); > + amdgpu_userq_put(queue, true); > } > > if (ret) > @@ -1298,17 +1320,17 @@ amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr > *uq_mgr) > struct dma_fence *f = queue->last_fence; > > if (!f || dma_fence_is_signaled(f)) { > - amdgpu_userq_put(queue); > + amdgpu_userq_put(queue, true); > continue; > } > ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100)); > if (ret <= 0) { > drm_file_err(uq_mgr->file, "Timed out waiting for > fence=%llu:%llu\n", > f->context, f->seqno); > - amdgpu_userq_put(queue); > + amdgpu_userq_put(queue, true); > return -ETIMEDOUT; > } > - amdgpu_userq_put(queue); > + amdgpu_userq_put(queue, true); > } > > return 0; > @@ -1366,7 +1388,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr > *userq_mgr) > if (!queue) > break; > > - amdgpu_userq_put(queue); > + amdgpu_userq_put(queue, false); > } > > xa_destroy(&userq_mgr->userq_xa); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > index f0abc16d02cc..2a496e74ec6a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > @@ -116,7 +116,7 @@ struct amdgpu_db_info { > }; > > struct amdgpu_usermode_queue *amdgpu_userq_get(struct amdgpu_userq_mgr > *uq_mgr, u32 qid); > -void amdgpu_userq_put(struct amdgpu_usermode_queue *queue); > +void amdgpu_userq_put(struct amdgpu_usermode_queue *queue, bool locked); > > int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file > *filp); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c > index 18390d37a7e0..10e08cb6bd13 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c > 
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c > @@ -612,7 +612,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, > void *data, > kfree(syncobj_handles); > > if (queue) > - amdgpu_userq_put(queue); > + amdgpu_userq_put(queue, false); > > return r; > } > @@ -914,7 +914,7 @@ amdgpu_userq_wait_return_fence_info(struct drm_file *filp, > r = 0; > > put_waitq: > - amdgpu_userq_put(waitq); > + amdgpu_userq_put(waitq, false); > > free_fences: > while (num_fences--)
