On 3/6/26 09:04, Sunil Khatri wrote: > The userq create path publishes queues to global xarrays such as > userq_doorbell_xa and userq_xa before creation was fully complete. > Later on if create queue fails, teardown could free an already > visible queue, opening a UAF race with concurrent queue walkers. > Also calling amdgpu_userq_put in such cases complicates the cleanup. > > Solution is to defer queue publication until create succeeds and no > partially initialized queue is exposed.
Good work. > > Signed-off-by: Sunil Khatri <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 67 ++++++++++++----------- > 1 file changed, 34 insertions(+), 33 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > index 7d2f78899c47..937403beacdc 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > @@ -758,7 +758,6 @@ amdgpu_userq_create(struct drm_file *filp, union > drm_amdgpu_userq *args) > const struct amdgpu_userq_funcs *uq_funcs; > struct amdgpu_usermode_queue *queue; > struct amdgpu_db_info db_info; > - char *queue_name; > bool skip_map_queue; > u32 qid; > uint64_t index; > @@ -848,32 +847,6 @@ amdgpu_userq_create(struct drm_file *filp, union > drm_amdgpu_userq *args) > goto unlock; > } > > - /* drop this refcount during queue destroy */ > - kref_init(&queue->refcount); > - > - /* Wait for mode-1 reset to complete */ > - down_read(&adev->reset_domain->sem); > - r = xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue, > GFP_KERNEL)); > - if (r) { > - kfree(queue); > - up_read(&adev->reset_domain->sem); > - goto unlock; > - } > - > - r = xa_alloc(&uq_mgr->userq_xa, &qid, queue, > - XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL); > - if (r) { > - drm_file_err(uq_mgr->file, "Failed to allocate a queue id\n"); > - amdgpu_userq_fence_driver_free(queue); > - xa_erase_irq(&adev->userq_doorbell_xa, index); > - uq_funcs->mqd_destroy(queue); > - kfree(queue); > - r = -ENOMEM; > - up_read(&adev->reset_domain->sem); > - goto unlock; > - } > - up_read(&adev->reset_domain->sem); > - > /* don't map the queue if scheduling is halted */ > if (adev->userq_halt_for_enforce_isolation && > ((queue->queue_type == AMDGPU_HW_IP_GFX) || > @@ -885,28 +858,56 @@ amdgpu_userq_create(struct drm_file *filp, union > drm_amdgpu_userq *args) > r = amdgpu_userq_map_helper(queue); > if (r) { > drm_file_err(uq_mgr->file, "Failed to map Queue\n"); > - xa_erase_irq(&adev->userq_doorbell_xa, index); > - xa_erase(&uq_mgr->userq_xa, qid); > - amdgpu_userq_fence_driver_free(queue); > uq_funcs->mqd_destroy(queue); > + amdgpu_userq_fence_driver_free(queue); > kfree(queue); > goto unlock; > } > } > > - queue_name = kasprintf(GFP_KERNEL, "queue-%d", qid); > - if (!queue_name) { > + /* drop this refcount during queue destroy */ > + kref_init(&queue->refcount); > + > + /* Wait for mode-1 reset to complete */ > + down_read(&adev->reset_domain->sem); > + r = xa_alloc(&uq_mgr->userq_xa, &qid, queue, > + XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL); > + if (r) { > + drm_file_err(uq_mgr->file, "Failed to allocate a queue id\n"); This should actually not be printed in the logs. At maximum it is a debug message. > + if (!skip_map_queue) > + amdgpu_userq_unmap_helper(queue); > + > + uq_funcs->mqd_destroy(queue); > + amdgpu_userq_fence_driver_free(queue); > + kfree(queue); > r = -ENOMEM; > + up_read(&adev->reset_domain->sem); I was just about to write that this screams for goto error handling and then saw that you do exactly that in patch #2. > goto unlock; > } > > + r = xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue, > GFP_KERNEL)); > + if (r) { > + xa_erase(&uq_mgr->userq_xa, qid); > + if (!skip_map_queue) > + amdgpu_userq_unmap_helper(queue); > + > + uq_funcs->mqd_destroy(queue); > + amdgpu_userq_fence_driver_free(queue); > + kfree(queue); > + up_read(&adev->reset_domain->sem); > + goto unlock; > + } > + up_read(&adev->reset_domain->sem); > + > #if defined(CONFIG_DEBUG_FS) > + char queue_name[32]; > + > + scnprintf(queue_name, sizeof(queue_name), "queue_%d", qid); > /* Queue dentry per client to hold MQD information */ > queue->debugfs_queue = debugfs_create_dir(queue_name, > filp->debugfs_client); > debugfs_create_file("mqd_info", 0444, queue->debugfs_queue, queue, > &amdgpu_mqd_info_fops); > #endif A separate function for this would be nice to have. E.g. something like this: #if defined(CONFIG_DEBUG_FS) void amdgpu_userq_debugfs_create_queue(..) { ... } #else void amdgpu_userq_debugfs_create_queue(..) {} #endif Apart from those nit picks looks good to me. Christian. > amdgpu_userq_init_hang_detect_work(queue); > - kfree(queue_name); > > args->out.queue_id = qid; > atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
