On 20-03-2026 03:43 pm, Christian König wrote:

On 3/20/26 11:02, Khatri, Sunil wrote:
On 20-03-2026 03:16 pm, Christian König wrote:
On 3/20/26 10:41, Sunil Khatri wrote:
There is a possibility of a deadlock when the last reference to a queue is
put in certain situations where the mutex is already held when calling
amdgpu_userq_destroy.
  From functions
As far as I can see that is illegal to begin with. Why are we doing that?
This is to fix the deadlock that Prike shared, and many other places where a 
deadlock could still be caused.
There is a possibility of amdgpu_userq_put being called for last reference from 
amdgpu_userq_restore_worker or amdgpu_eviction_fence_suspend_worker via 
amdgpu_evf_mgr_shutdown
and all these functions already hold the userq_mutex and on last reference when 
destroy is called it again takes userq_mutex and causing deadlock.

That's why, when we are dropping the reference, we pass along the information that 
the handler could be called with the lock already taken, and handle it accordingly.
Well that sounds like completely broken handling.

Why are dropping an userqueu reference while holding the lock in the first 
place?
We are doing it within the locked state in most of the places in the code.
few examples:
static void amdgpu_userq_restore_worker(struct work_struct *work)
{
    struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work);
    struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
    struct dma_fence *ev_fence;
    int ret;

    mutex_lock(&uq_mgr->userq_mutex);
    ev_fence = amdgpu_evf_mgr_get_fence(&fpriv->evf_mgr);
    if (!dma_fence_is_signaled(ev_fence))
        goto unlock;

    ret = amdgpu_userq_vm_validate(uq_mgr);
    if (ret) {
        drm_file_err(uq_mgr->file, "Failed to validate BOs to restore\n");
        goto unlock;
    }

    // Here the restore all is going through all the queues one bye one by doing get and put and doing the restore of each queue. Now during put if its last reference due to race with another thread in putting we will
call the destroy with locks taken which causes deadlock.
    ret = amdgpu_userq_restore_all(uq_mgr);
    if (ret) {
        drm_file_err(uq_mgr->file, "Failed to restore all queues\n");
        goto unlock;
    }

unlock:
    mutex_unlock(&uq_mgr->userq_mutex);
    dma_fence_put(ev_fence);
}

Another example:
static void
amdgpu_eviction_fence_suspend_worker(struct work_struct *work)
{
    struct amdgpu_eviction_fence_mgr *evf_mgr =
        container_of(work, struct amdgpu_eviction_fence_mgr,
                 suspend_work);
    struct amdgpu_fpriv *fpriv =
        container_of(evf_mgr, struct amdgpu_fpriv, evf_mgr);
    struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr;
    struct dma_fence *ev_fence;
    bool cookie;

    mutex_lock(&uq_mgr->userq_mutex);

    /*
     * This is intentionally after taking the userq_mutex since we do
     * allocate memory while holding this lock, but only after ensuring that
     * the eviction fence is signaled.
     */
    cookie = dma_fence_begin_signalling();

    ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr);


/* Here in userq evict we do a ref get and put a various places while the mutex is already taken */

    amdgpu_userq_evict(uq_mgr, !evf_mgr->shutdown);

    /*
     * Signaling the eviction fence must be done while holding the
     * userq_mutex. Otherwise we won't resume the queues before issuing the
     * next fence.
     */
    dma_fence_signal(ev_fence);
    dma_fence_end_signalling(cookie);
    dma_fence_put(ev_fence);
    mutex_unlock(&uq_mgr->userq_mutex);

}

Regards
Sunil.



Regards,
Christian.

Regards
Sunil khatri

Regards,
Christian.

So, based on whether the calling thread could already hold the
lock, we pass that lock state into the destroy functionality
to avoid taking the lock again.

Cc: Liang, Prike <[email protected]>
Suggested-by: Liang, Prike <[email protected]>
Signed-off-by: Sunil Khatri <[email protected]>
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c     | 52 +++++++++++++------
   drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h     |  2 +-
   .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |  4 +-
   3 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index ced9ade44be4..9482664e9c2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -617,13 +617,17 @@ amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr 
*uq_mgr,
   }
     static int
-amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct 
amdgpu_usermode_queue *queue)
+amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct 
amdgpu_usermode_queue *queue,
+             bool locked)
   {
       struct amdgpu_device *adev = uq_mgr->adev;
       int r = 0;
   -    cancel_delayed_work_sync(&uq_mgr->resume_work);
+    /* It safe to unlock since we are in destroy and the queue ref is only 
this */
+    if (locked)
+        mutex_unlock(&uq_mgr->userq_mutex);
   +    cancel_delayed_work_sync(&uq_mgr->resume_work);
       /* Cancel any pending hang detection work and cleanup */
       cancel_delayed_work_sync(&queue->hang_detect_work);
   @@ -657,13 +661,27 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, 
struct amdgpu_usermode_que
           queue->state = AMDGPU_USERQ_STATE_HUNG;
       }
       amdgpu_userq_cleanup(queue);
-    mutex_unlock(&uq_mgr->userq_mutex);
+
+    if (!locked)
+        mutex_unlock(&uq_mgr->userq_mutex);
         pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
         return r;
   }
   +static void amdgpu_userq_kref_destroy_locked(struct kref *kref)
+{
+    int r;
+    struct amdgpu_usermode_queue *queue =
+        container_of(kref, struct amdgpu_usermode_queue, refcount);
+    struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
+
+    r = amdgpu_userq_destroy(uq_mgr, queue, true);
+    if (r)
+        drm_file_err(uq_mgr->file, "Failed to destroy usermode queue %d\n", r);
+}
+
   static void amdgpu_userq_kref_destroy(struct kref *kref)
   {
       int r;
@@ -671,7 +689,7 @@ static void amdgpu_userq_kref_destroy(struct kref *kref)
           container_of(kref, struct amdgpu_usermode_queue, refcount);
       struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
   -    r = amdgpu_userq_destroy(uq_mgr, queue);
+    r = amdgpu_userq_destroy(uq_mgr, queue, false);
       if (r)
           drm_file_err(uq_mgr->file, "Failed to destroy usermode queue %d\n", 
r);
   }
@@ -689,10 +707,14 @@ struct amdgpu_usermode_queue *amdgpu_userq_get(struct 
amdgpu_userq_mgr *uq_mgr,
       return queue;
   }
   -void amdgpu_userq_put(struct amdgpu_usermode_queue *queue)
+void amdgpu_userq_put(struct amdgpu_usermode_queue *queue, bool locked)
   {
-    if (queue)
-        kref_put(&queue->refcount, amdgpu_userq_kref_destroy);
+    if (queue) {
+        if (locked)
+            kref_put(&queue->refcount, amdgpu_userq_kref_destroy_locked);
+        else
+            kref_put(&queue->refcount, amdgpu_userq_kref_destroy);
+    }
   }
     static int amdgpu_userq_priority_permit(struct drm_file *filp,
@@ -978,7 +1000,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
           if (!queue)
               return -ENOENT;
   -        amdgpu_userq_put(queue);
+        amdgpu_userq_put(queue, false);
           break;
       }
   @@ -1007,7 +1029,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr 
*uq_mgr)
               drm_file_err(uq_mgr->file,
                        "trying restore queue without va mapping\n");
               queue->state = AMDGPU_USERQ_STATE_INVALID_VA;
-            amdgpu_userq_put(queue);
+            amdgpu_userq_put(queue, true);
               continue;
           }
   @@ -1015,7 +1037,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr 
*uq_mgr)
           if (r)
               ret = r;
   -        amdgpu_userq_put(queue);
+        amdgpu_userq_put(queue, true);
       }
         if (ret)
@@ -1258,7 +1280,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
           r = amdgpu_userq_preempt_helper(queue);
           if (r)
               ret = r;
-        amdgpu_userq_put(queue);
+        amdgpu_userq_put(queue, true);
       }
         if (ret)
@@ -1298,17 +1320,17 @@ amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr 
*uq_mgr)
           struct dma_fence *f = queue->last_fence;
             if (!f || dma_fence_is_signaled(f)) {
-            amdgpu_userq_put(queue);
+            amdgpu_userq_put(queue, true);
               continue;
           }
           ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
           if (ret <= 0) {
               drm_file_err(uq_mgr->file, "Timed out waiting for 
fence=%llu:%llu\n",
                        f->context, f->seqno);
-            amdgpu_userq_put(queue);
+            amdgpu_userq_put(queue, true);
               return -ETIMEDOUT;
           }
-        amdgpu_userq_put(queue);
+        amdgpu_userq_put(queue, true);
       }
         return 0;
@@ -1366,7 +1388,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr 
*userq_mgr)
           if (!queue)
               break;
   -        amdgpu_userq_put(queue);
+        amdgpu_userq_put(queue, false);
       }
         xa_destroy(&userq_mgr->userq_xa);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index f0abc16d02cc..2a496e74ec6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -116,7 +116,7 @@ struct amdgpu_db_info {
   };
     struct amdgpu_usermode_queue *amdgpu_userq_get(struct amdgpu_userq_mgr 
*uq_mgr, u32 qid);
-void amdgpu_userq_put(struct amdgpu_usermode_queue *queue);
+void amdgpu_userq_put(struct amdgpu_usermode_queue *queue, bool locked);
     int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file 
*filp);
   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index 18390d37a7e0..10e08cb6bd13 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -612,7 +612,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void 
*data,
       kfree(syncobj_handles);
         if (queue)
-        amdgpu_userq_put(queue);
+        amdgpu_userq_put(queue, false);
         return r;
   }
@@ -914,7 +914,7 @@ amdgpu_userq_wait_return_fence_info(struct drm_file *filp,
           r = 0;
     put_waitq:
-    amdgpu_userq_put(waitq);
+    amdgpu_userq_put(waitq, false);
     free_fences:
       while (num_fences--)

Reply via email to