AMD General

Reviewed-by: Prike Liang <[email protected]>

Regards,
      Prike

> -----Original Message-----
> From: Christian König <[email protected]>
> Sent: Tuesday, April 28, 2026 2:27 AM
> To: Deucher, Alexander <[email protected]>; Liang, Prike
> <[email protected]>; Khatri, Sunil <[email protected]>; Zhang,
> Jesse(Jie) <[email protected]>
> Cc: [email protected]
> Subject: [PATCH 7/8] drm/amdgpu: fix handling in amdgpu_userq_create
>
> Well mostly the same issues the other code had as well:
>
> 1. Memory allocation while holding the userq_mutex lock is forbidden!
> 2. Things were created/started/published in the wrong order.
> 3. Error messages on invalid input parameters can spam the logs.
> 4. Error messages on memory allocation failures are usually superflous
>    as well.
>
> Signed-off-by: Christian König <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 118 ++++++++++------------
>  1 file changed, 52 insertions(+), 66 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 0374a70b631c..28e0695b53e7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -718,14 +718,14 @@ amdgpu_userq_create(struct drm_file *filp, union
> drm_amdgpu_userq *args)
>       const struct amdgpu_userq_funcs *uq_funcs;
>       struct amdgpu_usermode_queue *queue;
>       struct amdgpu_db_info db_info;
> -     bool skip_map_queue;
> -     u32 qid;
>       uint64_t index;
> -     int r = 0;
> -     int priority =
> -             (args->in.flags &
> AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK) >>
> -             AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT;
> +     int priority;
> +     u32 qid;
> +     int r;
>
> +     priority =
> +             (args->in.flags &
> AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK)
> +             >>
> AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT;
>       r = amdgpu_userq_priority_permit(filp, priority);
>       if (r)
>               return r;
> @@ -738,40 +738,43 @@ amdgpu_userq_create(struct drm_file *filp, union
> drm_amdgpu_userq *args)
>
>       uq_funcs = adev->userq_funcs[args->in.ip_type];
>       if (!uq_funcs) {
> -             drm_file_err(uq_mgr->file, "Usermode queue is not supported for 
> this
> IP (%u)\n",
> -                          args->in.ip_type);
>               r = -EINVAL;
>               goto err_pm_runtime;
>       }
>
>       queue = kzalloc(sizeof(struct amdgpu_usermode_queue), GFP_KERNEL);
>       if (!queue) {
> -             drm_file_err(uq_mgr->file, "Failed to allocate memory for 
> queue\n");
>               r = -ENOMEM;
>               goto err_pm_runtime;
>       }
>
> +     kref_init(&queue->refcount);
>       INIT_LIST_HEAD(&queue->userq_va_list);
>       queue->doorbell_handle = args->in.doorbell_handle;
>       queue->queue_type = args->in.ip_type;
>       queue->vm = &fpriv->vm;
>       queue->priority = priority;
> -
> -     db_info.queue_type = queue->queue_type;
> -     db_info.doorbell_handle = queue->doorbell_handle;
> -     db_info.db_obj = &queue->db_obj;
> -     db_info.doorbell_offset = args->in.doorbell_offset;
> -
>       queue->userq_mgr = uq_mgr;
> +     INIT_DELAYED_WORK(&queue->hang_detect_work,
> +                       amdgpu_userq_hang_detect_work);
>
> -     /* Validate the userq virtual address.*/
> -     r = amdgpu_bo_reserve(fpriv->vm.root.bo, false);
> +     mutex_init(&queue->fence_drv_lock);
> +     xa_init_flags(&queue->fence_drv_xa, XA_FLAGS_ALLOC);
> +     r = amdgpu_userq_fence_driver_alloc(adev, &queue->fence_drv);
>       if (r)
>               goto free_queue;
>
> -     if (amdgpu_userq_input_va_validate(adev, queue, args->in.queue_va, args-
> >in.queue_size) ||
> -         amdgpu_userq_input_va_validate(adev, queue, args->in.rptr_va,
> AMDGPU_GPU_PAGE_SIZE) ||
> -         amdgpu_userq_input_va_validate(adev, queue, args->in.wptr_va,
> AMDGPU_GPU_PAGE_SIZE)) {
> +     /* Make sure the queue can actually run with those virtual addresses. */
> +     r = amdgpu_bo_reserve(fpriv->vm.root.bo, false);
> +     if (r)
> +             goto free_fence_drv;
> +
> +     if (amdgpu_userq_input_va_validate(adev, queue, args->in.queue_va,
> +                                        args->in.queue_size) ||
> +         amdgpu_userq_input_va_validate(adev, queue, args->in.rptr_va,
> +                                        AMDGPU_GPU_PAGE_SIZE) ||
> +         amdgpu_userq_input_va_validate(adev, queue, args->in.wptr_va,
> +                                        AMDGPU_GPU_PAGE_SIZE)) {
>               r = -EINVAL;
>               amdgpu_bo_unreserve(fpriv->vm.root.bo);
>               goto clean_mapping;
> @@ -779,6 +782,10 @@ amdgpu_userq_create(struct drm_file *filp, union
> drm_amdgpu_userq *args)
>       amdgpu_bo_unreserve(fpriv->vm.root.bo);
>
>       /* Convert relative doorbell offset into absolute doorbell index */
> +     db_info.queue_type = queue->queue_type;
> +     db_info.doorbell_handle = queue->doorbell_handle;
> +     db_info.db_obj = &queue->db_obj;
> +     db_info.doorbell_offset = args->in.doorbell_offset;
>       index = amdgpu_userq_get_doorbell_index(uq_mgr, &db_info, filp);
>       if (index == (uint64_t)-EINVAL) {
>               drm_file_err(uq_mgr->file, "Failed to get doorbell for 
> queue\n"); @@ -
> 787,82 +794,61 @@ amdgpu_userq_create(struct drm_file *filp, union
> drm_amdgpu_userq *args)
>       }
>
>       queue->doorbell_index = index;
> -     mutex_init(&queue->fence_drv_lock);
> -     xa_init_flags(&queue->fence_drv_xa, XA_FLAGS_ALLOC);
> -     r = amdgpu_userq_fence_driver_alloc(adev, &queue->fence_drv);
> -     if (r) {
> -             drm_file_err(uq_mgr->file, "Failed to alloc fence driver\n");
> -             goto clean_mapping;
> -     }
> -
>       r = uq_funcs->mqd_create(queue, &args->in);
>       if (r) {
>               drm_file_err(uq_mgr->file, "Failed to create Queue\n");
> -             goto clean_fence_driver;
> +             goto clean_mapping;
>       }
>
> +     r = xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue,
> +                             GFP_KERNEL));
> +     if (r)
> +             goto clean_mqd;
> +
>       amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
>
>       /* don't map the queue if scheduling is halted */
> -     if (adev->userq_halt_for_enforce_isolation &&
> -         ((queue->queue_type == AMDGPU_HW_IP_GFX) ||
> -          (queue->queue_type == AMDGPU_HW_IP_COMPUTE)))
> -             skip_map_queue = true;
> -     else
> -             skip_map_queue = false;
> -     if (!skip_map_queue) {
> +     if (!adev->userq_halt_for_enforce_isolation ||
> +         ((queue->queue_type != AMDGPU_HW_IP_GFX) &&
> +          (queue->queue_type != AMDGPU_HW_IP_COMPUTE))) {
>               r = amdgpu_userq_map_helper(queue);
>               if (r) {
>                       drm_file_err(uq_mgr->file, "Failed to map Queue\n");
> -                     goto clean_mqd;
> +                     mutex_unlock(&uq_mgr->userq_mutex);
> +                     goto clean_doorbell;
>               }
>       }
>
> -     /* drop this refcount during queue destroy */
> -     kref_init(&queue->refcount);
> -
> -     /* Wait for mode-1 reset to complete */
> -     down_read(&adev->reset_domain->sem);
> +     atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
> +     mutex_unlock(&uq_mgr->userq_mutex);
>
>       r = xa_alloc(&uq_mgr->userq_xa, &qid, queue,
> -                  XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT),
> GFP_KERNEL);
> -     if (r) {
> -             if (!skip_map_queue)
> -                     amdgpu_userq_unmap_helper(queue);
> -             r = -ENOMEM;
> -             goto clean_reset_domain;
> -     }
> -
> -     r = xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue,
> GFP_KERNEL));
> +                  XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT),
> +                  GFP_KERNEL);
>       if (r) {
> -             xa_erase(&uq_mgr->userq_xa, qid);
> -             if (!skip_map_queue)
> -                     amdgpu_userq_unmap_helper(queue);
> -             goto clean_reset_domain;
> +             /*
> +              * This drops the last reference which should take care of
> +              * all cleanup.
> +              */
> +             amdgpu_userq_put(queue);
> +             return r;
>       }
> -     up_read(&adev->reset_domain->sem);
>
>       amdgpu_debugfs_userq_init(filp, queue, qid);
> -     INIT_DELAYED_WORK(&queue->hang_detect_work,
> -                       amdgpu_userq_hang_detect_work);
> -
>       args->out.queue_id = qid;
> -     atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
> -     mutex_unlock(&uq_mgr->userq_mutex);
>       return 0;
>
> -clean_reset_domain:
> -     up_read(&adev->reset_domain->sem);
> +clean_doorbell:
> +     xa_erase_irq(&adev->userq_doorbell_xa, index);
>  clean_mqd:
> -     mutex_unlock(&uq_mgr->userq_mutex);
>       uq_funcs->mqd_destroy(queue);
> -clean_fence_driver:
> -     amdgpu_userq_fence_driver_free(queue);
>  clean_mapping:
>       amdgpu_bo_reserve(fpriv->vm.root.bo, true);
>       amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
>       amdgpu_bo_unreserve(fpriv->vm.root.bo);
>       mutex_destroy(&queue->fence_drv_lock);
> +free_fence_drv:
> +     amdgpu_userq_fence_driver_free(queue);
>  free_queue:
>       kfree(queue);
>  err_pm_runtime:
> --
> 2.43.0

Reply via email to