On Fri, Sep 12, 2025 at 10:06 AM Christian König <christian.koe...@amd.com> wrote:
>
> On 12.09.25 15:02, Alex Deucher wrote:
> > On Fri, Sep 12, 2025 at 7:17 AM Christian König <christian.koe...@amd.com> wrote:
> >>
> >> On 12.09.25 11:31, Jesse.Zhang wrote:
> >>> The current user queue implementation uses a dual-mutex scheme with
> >>> both per-device (adev->userq_mutex) and per-process (uq_mgr->userq_mutex)
> >>> locking. This overcomplicated design creates potential deadlock scenarios
> >>> and makes the code harder to maintain.
> >>>
> >>> Simplify the locking by switching entirely to the global adev->userq_mutex
> >>> for all user queue operations. This approach:
> >>> - Eliminates potential deadlocks between the two mutexes
> >>> - Serializes all user queue handling at the kernel level
> >>> - Provides adequate protection since user queue operations aren't
> >>>   performance-critical paths
> >>> - Makes the code more maintainable with a single locking scheme
> >>>
> >>> Key changes:
> >>> 1. Remove uq_mgr->userq_mutex entirely from amdgpu_userq_mgr
> >>> 2. Consistently use adev->userq_mutex across all user queue operations
> >>> 3. Add proper work flushing in amdgpu_userq_ensure_ev_fence() to
> >>>    prevent races when ensuring eviction fence availability
> >>> 4. Maintain proper locking order throughout all user queue functions
> >>>
> >>> The serialization of all user queue operations is acceptable since
> >>> these are management operations that don't need high parallelism.
> >>>
> >>
> >> Absolutely clear NAK to that.
> >>
> >> This also serializes eviction handling, making it completely useless.
> >>
> >> The global mutex should only be used when new clients open up their
> >> connection for the first time or close the last reference, and never
> >> otherwise.
> >
> > The problem is that the firmware reset code can affect multiple queues,
> > so we need some way to lock all queues during resets so we can
> > properly update their status.
>
> That shouldn't be necessary.
>
> When all queues are affected we should use a sequence number to indicate
> the queue generation.
>
> When only a subset of queues are affected then we would need to identify
> those queues and set exactly those to an error state.
>
> So what information exactly do we get from the FW?
FW gives us a list of doorbells for queues that were hung and reset.

Alex

> Christian.
>
> >
> > Alex
> >
> >>
> >> Regards,
> >> Christian.
> >>
> >>> Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
> >>> Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
> >>> ---
> >>> .../drm/amd/amdgpu/amdgpu_eviction_fence.c |  7 ++--
> >>> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 32 ++++++-------------
> >>> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |  1 -
> >>> .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c |  5 +--
> >>> 4 files changed, 16 insertions(+), 29 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> >>> index 23d7d0b0d625..d6a07fac7df2 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> >>> @@ -105,9 +105,10 @@ amdgpu_eviction_fence_suspend_worker(struct work_struct *work)
> >>> struct amdgpu_eviction_fence_mgr *evf_mgr = work_to_evf_mgr(work, suspend_work.work);
> >>> struct amdgpu_fpriv *fpriv = evf_mgr_to_fpriv(evf_mgr);
> >>> struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr;
> >>> + struct amdgpu_device *adev = uq_mgr->adev;
> >>> struct amdgpu_eviction_fence *ev_fence;
> >>>
> >>> - mutex_lock(&uq_mgr->userq_mutex);
> >>> + mutex_lock(&adev->userq_mutex);
> >>> spin_lock(&evf_mgr->ev_fence_lock);
> >>> ev_fence = evf_mgr->ev_fence;
> >>> if (ev_fence)
> >>> @@ -118,13 +119,13 @@ amdgpu_eviction_fence_suspend_worker(struct work_struct *work)
> >>>
> >>> amdgpu_userq_evict(uq_mgr, ev_fence);
> >>>
> >>> - mutex_unlock(&uq_mgr->userq_mutex);
> >>> + mutex_unlock(&adev->userq_mutex);
> >>> dma_fence_put(&ev_fence->base);
> >>> return;
> >>>
> >>> unlock:
> >>> spin_unlock(&evf_mgr->ev_fence_lock);
> >>> - mutex_unlock(&uq_mgr->userq_mutex);
> >>> + mutex_unlock(&adev->userq_mutex);
> >>> }
> >>>
> >>> static bool amdgpu_eviction_fence_enable_signaling(struct dma_fence *f)
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> >>> index b649ac0cedff..af92450ea6eb 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> >>> @@ -198,17 +198,18 @@ amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *uq_mgr,
> >>> struct amdgpu_eviction_fence_mgr *evf_mgr)
> >>> {
> >>> struct amdgpu_eviction_fence *ev_fence;
> >>> + struct amdgpu_device *adev = uq_mgr->adev;
> >>>
> >>> retry:
> >>> /* Flush any pending resume work to create ev_fence */
> >>> flush_delayed_work(&uq_mgr->resume_work);
> >>>
> >>> - mutex_lock(&uq_mgr->userq_mutex);
> >>> + mutex_lock(&adev->userq_mutex);
> >>> spin_lock(&evf_mgr->ev_fence_lock);
> >>> ev_fence = evf_mgr->ev_fence;
> >>> spin_unlock(&evf_mgr->ev_fence_lock);
> >>> if (!ev_fence || dma_fence_is_signaled(&ev_fence->base)) {
> >>> - mutex_unlock(&uq_mgr->userq_mutex);
> >>> + mutex_unlock(&adev->userq_mutex);
> >>> /*
> >>>  * Looks like there was no pending resume work,
> >>>  * add one now to create a valid eviction fence
> >>> @@ -362,12 +363,12 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
> >>> int r = 0;
> >>>
> >>> cancel_delayed_work_sync(&uq_mgr->resume_work);
> >>> - mutex_lock(&uq_mgr->userq_mutex);
> >>> + mutex_lock(&adev->userq_mutex);
> >>>
> >>> queue = amdgpu_userq_find(uq_mgr, queue_id);
> >>> if (!queue) {
> >>> drm_dbg_driver(adev_to_drm(uq_mgr->adev), "Invalid queue id to destroy\n");
> >>> - mutex_unlock(&uq_mgr->userq_mutex);
> >>> + mutex_unlock(&adev->userq_mutex);
> >>> return -EINVAL;
> >>> }
> >>> amdgpu_userq_wait_for_last_fence(uq_mgr, queue);
> >>> @@ -388,7 +389,7 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
> >>> queue->state = AMDGPU_USERQ_STATE_HUNG;
> >>> }
> >>> amdgpu_userq_cleanup(uq_mgr, queue, queue_id);
> >>> - mutex_unlock(&uq_mgr->userq_mutex);
> >>> + mutex_unlock(&adev->userq_mutex);
> >>>
> >>> pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
> >>> pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> >>> @@ -478,7 +479,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> >>> pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> >>> return r;
> >>> }
> >>> -
> >>> /*
> >>>  * There could be a situation that we are creating a new queue while
> >>>  * the other queues under this UQ_mgr are suspended. So if there is any
> >>> @@ -486,7 +486,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> >>>  *
> >>>  * This will also make sure we have a valid eviction fence ready to be used.
> >>>  */
> >>> - mutex_lock(&adev->userq_mutex);
> >>> amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
> >>>
> >>> uq_funcs = adev->userq_funcs[args->in.ip_type];
> >>> @@ -588,9 +587,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> >>> kfree(queue_name);
> >>>
> >>> args->out.queue_id = qid;
> >>> -
> >>> unlock:
> >>> - mutex_unlock(&uq_mgr->userq_mutex);
> >>> mutex_unlock(&adev->userq_mutex);
> >>>
> >>> return r;
> >>> @@ -820,11 +817,12 @@ static void amdgpu_userq_restore_worker(struct work_struct *work)
> >>> {
> >>> struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work);
> >>> struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
> >>> + struct amdgpu_device *adev = uq_mgr->adev;
> >>> int ret;
> >>>
> >>> flush_delayed_work(&fpriv->evf_mgr.suspend_work);
> >>>
> >>> - mutex_lock(&uq_mgr->userq_mutex);
> >>> + mutex_lock(&adev->userq_mutex);
> >>>
> >>> ret = amdgpu_userq_validate_bos(uq_mgr);
> >>> if (ret) {
> >>> @@ -839,7 +837,7 @@ static void amdgpu_userq_restore_worker(struct work_struct *work)
> >>> }
> >>>
> >>> unlock:
> >>> - mutex_unlock(&uq_mgr->userq_mutex);
> >>> + mutex_unlock(&adev->userq_mutex);
> >>> }
> >>>
> >>> static int
> >>> @@ -919,7 +917,6 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
> >>> int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
> >>> struct amdgpu_device *adev)
> >>> {
> >>> - mutex_init(&userq_mgr->userq_mutex);
> >>> idr_init_base(&userq_mgr->userq_idr, 1);
> >>> userq_mgr->adev = adev;
> >>> userq_mgr->file = file_priv;
> >>> @@ -942,7 +939,6 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
> >>> cancel_delayed_work_sync(&userq_mgr->resume_work);
> >>>
> >>> mutex_lock(&adev->userq_mutex);
> >>> - mutex_lock(&userq_mgr->userq_mutex);
> >>> idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
> >>> amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
> >>> amdgpu_userq_unmap_helper(userq_mgr, queue);
> >>> @@ -956,9 +952,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
> >>> }
> >>> }
> >>> idr_destroy(&userq_mgr->userq_idr);
> >>> - mutex_unlock(&userq_mgr->userq_mutex);
> >>> mutex_unlock(&adev->userq_mutex);
> >>> - mutex_destroy(&userq_mgr->userq_mutex);
> >>> }
> >>>
> >>> int amdgpu_userq_suspend(struct amdgpu_device *adev)
> >>> @@ -975,13 +969,11 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
> >>> mutex_lock(&adev->userq_mutex);
> >>> list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> >>> cancel_delayed_work_sync(&uqm->resume_work);
> >>> - mutex_lock(&uqm->userq_mutex);
> >>> idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> >>> r = amdgpu_userq_unmap_helper(uqm, queue);
> >>> if (r)
> >>> ret = r;
> >>> }
> >>> - mutex_unlock(&uqm->userq_mutex);
> >>> }
> >>> mutex_unlock(&adev->userq_mutex);
> >>> return ret;
> >>> @@ -1000,13 +992,11 @@ int amdgpu_userq_resume(struct amdgpu_device *adev)
> >>>
> >>> mutex_lock(&adev->userq_mutex);
> >>> list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> >>> - mutex_lock(&uqm->userq_mutex);
> >>> idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> >>> r = amdgpu_userq_map_helper(uqm, queue);
> >>> if (r)
> >>> ret = r;
> >>> }
> >>> - mutex_unlock(&uqm->userq_mutex);
> >>> }
> >>> mutex_unlock(&adev->userq_mutex);
> >>> return ret;
> >>> @@ -1031,7 +1021,6 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
> >>> adev->userq_halt_for_enforce_isolation = true;
> >>> list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> >>> cancel_delayed_work_sync(&uqm->resume_work);
> >>> - mutex_lock(&uqm->userq_mutex);
> >>> idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> >>> if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> >>> (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
> >>> @@ -1041,7 +1030,6 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
> >>> ret = r;
> >>> }
> >>> }
> >>> - mutex_unlock(&uqm->userq_mutex);
> >>> }
> >>> mutex_unlock(&adev->userq_mutex);
> >>> return ret;
> >>> @@ -1065,7 +1053,6 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
> >>> dev_warn(adev->dev, "userq scheduling already started!\n");
> >>> adev->userq_halt_for_enforce_isolation = false;
> >>> list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> >>> - mutex_lock(&uqm->userq_mutex);
> >>> idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> >>> if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> >>> (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
> >>> @@ -1075,7 +1062,6 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
> >>> ret = r;
> >>> }
> >>> }
> >>> - mutex_unlock(&uqm->userq_mutex);
> >>> }
> >>> mutex_unlock(&adev->userq_mutex);
> >>> return ret;
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> >>> index c027dd916672..2d63308d55c3 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> >>> @@ -89,7 +89,6 @@ struct amdgpu_userq_funcs {
> >>> /* Usermode queues for gfx */
> >>> struct amdgpu_userq_mgr {
> >>> struct idr userq_idr;
> >>> - struct mutex userq_mutex;
> >>> struct amdgpu_device *adev;
> >>> struct delayed_work resume_work;
> >>> struct list_head list;
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> >>> index 95e91d1dc58a..daf3be92a39c 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> >>> @@ -464,6 +464,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
> >>> u32 *bo_handles_write, num_write_bo_handles;
> >>> u32 *syncobj_handles, num_syncobj_handles;
> >>> u32 *bo_handles_read, num_read_bo_handles;
> >>> + struct amdgpu_device *adev = userq_mgr->adev;
> >>> int r, i, entry, rentry, wentry;
> >>> struct dma_fence *fence;
> >>> struct drm_exec exec;
> >>> @@ -557,14 +558,14 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
> >>> /* Create a new fence */
> >>> r = amdgpu_userq_fence_create(queue, userq_fence, wptr, &fence);
> >>> if (r) {
> >>> - mutex_unlock(&userq_mgr->userq_mutex);
> >>> + mutex_unlock(&adev->userq_mutex);
> >>> kmem_cache_free(amdgpu_userq_fence_slab, userq_fence);
> >>> goto put_gobj_write;
> >>> }
> >>>
> >>> dma_fence_put(queue->last_fence);
> >>> queue->last_fence = dma_fence_get(fence);
> >>> - mutex_unlock(&userq_mgr->userq_mutex);
> >>> + mutex_unlock(&adev->userq_mutex);
> >>>
> >>> drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
> >>> (num_read_bo_handles + num_write_bo_handles));
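
For reference, a rough sketch of the two directions Christian describes above: bump a per-device generation counter when the FW reports that every queue was reset, and flag only the affected queues when the FW hands back a doorbell list. The structures and helpers below are purely illustrative, not the actual amdgpu types or API.

/* Illustrative sketch only -- hypothetical types, not the amdgpu code. */
#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/types.h>

struct uq_entry {
	struct list_head node;
	u32 doorbell_index;
	bool errored;
};

/* All queues affected: bump a per-device generation counter so queues
 * created under an older generation can be detected lazily, without a
 * lock that serializes every user queue operation.
 */
static void userq_mark_all_reset(atomic64_t *dev_generation)
{
	atomic64_inc(dev_generation);
}

/* Subset affected: the FW reports the doorbells of the queues it reset,
 * so flag exactly those queues as errored.  Only the doorbell-to-queue
 * list needs protection here, not every user queue operation.
 */
static void userq_mark_hung(struct list_head *queues, struct mutex *list_lock,
			    const u32 *doorbells, unsigned int count)
{
	struct uq_entry *q;
	unsigned int i;

	mutex_lock(list_lock);
	list_for_each_entry(q, queues, node) {
		for (i = 0; i < count; i++) {
			if (q->doorbell_index == doorbells[i]) {
				q->errored = true;
				break;
			}
		}
	}
	mutex_unlock(list_lock);
}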