On 3/20/26 10:41, Sunil Khatri wrote:
> There is a possibility of deadlock when the last reference to a queue is
> put in certain situations where the mutex is already held when calling
> amdgpu_userq_destroy.

As far as I can see that is illegal to begin with. Why are we doing that?

Regards,
Christian.

> So, based on the calling context in which the mutex may already be
> held, we pass that lock-state information into the destroy path
> to avoid taking the lock again.
> 
> Cc: Liang, Prike <[email protected]>
> Suggested-by: Liang, Prike <[email protected]>
> Signed-off-by: Sunil Khatri <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c     | 52 +++++++++++++------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h     |  2 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |  4 +-
>  3 files changed, 40 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index ced9ade44be4..9482664e9c2c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -617,13 +617,17 @@ amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr 
> *uq_mgr,
>  }
>  
>  static int
> -amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct 
> amdgpu_usermode_queue *queue)
> +amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct 
> amdgpu_usermode_queue *queue,
> +                  bool locked)
>  {
>       struct amdgpu_device *adev = uq_mgr->adev;
>       int r = 0;
>  
> -     cancel_delayed_work_sync(&uq_mgr->resume_work);
> +     /* It is safe to unlock since we are in destroy and this is the only 
> remaining queue ref */
> +     if (locked)
> +             mutex_unlock(&uq_mgr->userq_mutex);
>  
> +     cancel_delayed_work_sync(&uq_mgr->resume_work);
>       /* Cancel any pending hang detection work and cleanup */
>       cancel_delayed_work_sync(&queue->hang_detect_work);
>  
> @@ -657,13 +661,27 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, 
> struct amdgpu_usermode_que
>               queue->state = AMDGPU_USERQ_STATE_HUNG;
>       }
>       amdgpu_userq_cleanup(queue);
> -     mutex_unlock(&uq_mgr->userq_mutex);
> +
> +     if (!locked)
> +             mutex_unlock(&uq_mgr->userq_mutex);
>  
>       pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
>  
>       return r;
>  }
>  
> +static void amdgpu_userq_kref_destroy_locked(struct kref *kref)
> +{
> +     int r;
> +     struct amdgpu_usermode_queue *queue =
> +             container_of(kref, struct amdgpu_usermode_queue, refcount);
> +     struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
> +
> +     r = amdgpu_userq_destroy(uq_mgr, queue, true);
> +     if (r)
> +             drm_file_err(uq_mgr->file, "Failed to destroy usermode queue 
> %d\n", r);
> +}
> +
>  static void amdgpu_userq_kref_destroy(struct kref *kref)
>  {
>       int r;
> @@ -671,7 +689,7 @@ static void amdgpu_userq_kref_destroy(struct kref *kref)
>               container_of(kref, struct amdgpu_usermode_queue, refcount);
>       struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
>  
> -     r = amdgpu_userq_destroy(uq_mgr, queue);
> +     r = amdgpu_userq_destroy(uq_mgr, queue, false);
>       if (r)
>               drm_file_err(uq_mgr->file, "Failed to destroy usermode queue 
> %d\n", r);
>  }
> @@ -689,10 +707,14 @@ struct amdgpu_usermode_queue *amdgpu_userq_get(struct 
> amdgpu_userq_mgr *uq_mgr,
>       return queue;
>  }
>  
> -void amdgpu_userq_put(struct amdgpu_usermode_queue *queue)
> +void amdgpu_userq_put(struct amdgpu_usermode_queue *queue, bool locked)
>  {
> -     if (queue)
> -             kref_put(&queue->refcount, amdgpu_userq_kref_destroy);
> +     if (queue) {
> +             if (locked)
> +                     kref_put(&queue->refcount, 
> amdgpu_userq_kref_destroy_locked);
> +             else
> +                     kref_put(&queue->refcount, amdgpu_userq_kref_destroy);
> +     }
>  }
>  
>  static int amdgpu_userq_priority_permit(struct drm_file *filp,
> @@ -978,7 +1000,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void 
> *data,
>               if (!queue)
>                       return -ENOENT;
>  
> -             amdgpu_userq_put(queue);
> +             amdgpu_userq_put(queue, false);
>               break;
>       }
>  
> @@ -1007,7 +1029,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr 
> *uq_mgr)
>                       drm_file_err(uq_mgr->file,
>                                    "trying restore queue without va 
> mapping\n");
>                       queue->state = AMDGPU_USERQ_STATE_INVALID_VA;
> -                     amdgpu_userq_put(queue);
> +                     amdgpu_userq_put(queue, true);
>                       continue;
>               }
>  
> @@ -1015,7 +1037,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr 
> *uq_mgr)
>               if (r)
>                       ret = r;
>  
> -             amdgpu_userq_put(queue);
> +             amdgpu_userq_put(queue, true);
>       }
>  
>       if (ret)
> @@ -1258,7 +1280,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>               r = amdgpu_userq_preempt_helper(queue);
>               if (r)
>                       ret = r;
> -             amdgpu_userq_put(queue);
> +             amdgpu_userq_put(queue, true);
>       }
>  
>       if (ret)
> @@ -1298,17 +1320,17 @@ amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr 
> *uq_mgr)
>               struct dma_fence *f = queue->last_fence;
>  
>               if (!f || dma_fence_is_signaled(f)) {
> -                     amdgpu_userq_put(queue);
> +                     amdgpu_userq_put(queue, true);
>                       continue;
>               }
>               ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
>               if (ret <= 0) {
>                       drm_file_err(uq_mgr->file, "Timed out waiting for 
> fence=%llu:%llu\n",
>                                    f->context, f->seqno);
> -                     amdgpu_userq_put(queue);
> +                     amdgpu_userq_put(queue, true);
>                       return -ETIMEDOUT;
>               }
> -             amdgpu_userq_put(queue);
> +             amdgpu_userq_put(queue, true);
>       }
>  
>       return 0;
> @@ -1366,7 +1388,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr 
> *userq_mgr)
>               if (!queue)
>                       break;
>  
> -             amdgpu_userq_put(queue);
> +             amdgpu_userq_put(queue, false);
>       }
>  
>       xa_destroy(&userq_mgr->userq_xa);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index f0abc16d02cc..2a496e74ec6a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -116,7 +116,7 @@ struct amdgpu_db_info {
>  };
>  
>  struct amdgpu_usermode_queue *amdgpu_userq_get(struct amdgpu_userq_mgr 
> *uq_mgr, u32 qid);
> -void amdgpu_userq_put(struct amdgpu_usermode_queue *queue);
> +void amdgpu_userq_put(struct amdgpu_usermode_queue *queue, bool locked);
>  
>  int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file 
> *filp);
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> index 18390d37a7e0..10e08cb6bd13 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> @@ -612,7 +612,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, 
> void *data,
>       kfree(syncobj_handles);
>  
>       if (queue)
> -             amdgpu_userq_put(queue);
> +             amdgpu_userq_put(queue, false);
>  
>       return r;
>  }
> @@ -914,7 +914,7 @@ amdgpu_userq_wait_return_fence_info(struct drm_file *filp,
>               r = 0;
>  
>  put_waitq:
> -     amdgpu_userq_put(waitq);
> +     amdgpu_userq_put(waitq, false);
>  
>  free_fences:
>       while (num_fences--)

Reply via email to