On 3/27/26 11:36, Prike Liang wrote:
> The correct fix is to tie the global xa entry lifetime to the
> queue lifetime: insert in amdgpu_userq_create() and erase in
> amdgpu_userq_cleanup(), both at the well-defined doorbell_index key,
> making the operation O(1) and resolve the fence driver UAF problem
> by binding the userq driver fence to per queue.
> 
> Signed-off-by: Prike Liang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  5 -----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  4 +---
>  .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   | 20 +------------------
>  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c        | 10 +++++-----
>  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c        | 10 +++++-----
>  drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c        | 11 +++++-----
>  drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c        | 10 +++++-----
>  drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c        | 10 +++++-----
>  8 files changed, 28 insertions(+), 52 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 49e7881750fa..8bc591deb546 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1045,11 +1045,6 @@ struct amdgpu_device {
>       struct amdgpu_mqd               mqds[AMDGPU_HW_IP_NUM];
>       const struct amdgpu_userq_funcs *userq_funcs[AMDGPU_HW_IP_NUM];
>  
> -     /* xarray used to retrieve the user queue fence driver reference
> -      * in the EOP interrupt handler to signal the particular user
> -      * queue fence.
> -      */
> -     struct xarray                   userq_xa;
>       /**
>        * @userq_doorbell_xa: Global user queue map (doorbell index → queue)
>        * Key: doorbell_index (unique global identifier for the queue)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 0c0489395edf..a7b519f670a5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3757,15 +3757,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>       spin_lock_init(&adev->virt.rlcg_reg_lock);
>       spin_lock_init(&adev->wb.lock);
>  
> -     xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);
> -
>       INIT_LIST_HEAD(&adev->reset_list);
>  
>       INIT_LIST_HEAD(&adev->ras_list);
>  
>       INIT_LIST_HEAD(&adev->pm.od_kobj_list);
>  
> -     xa_init(&adev->userq_doorbell_xa);
> +     xa_init_flags(&adev->userq_doorbell_xa, XA_FLAGS_LOCK_IRQ);
>  
>       INIT_DELAYED_WORK(&adev->delayed_init_work,
>                         amdgpu_device_delayed_init_work_handler);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> index 6b33c2428b2d..87560c1251d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> @@ -81,7 +81,6 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device 
> *adev,
>                                   struct amdgpu_usermode_queue *userq)
>  {
>       struct amdgpu_userq_fence_driver *fence_drv;
> -     unsigned long flags;
>       int r;
>  
>       fence_drv = kzalloc(sizeof(*fence_drv), GFP_KERNEL);
> @@ -104,19 +103,10 @@ int amdgpu_userq_fence_driver_alloc(struct 
> amdgpu_device *adev,
>       fence_drv->context = dma_fence_context_alloc(1);
>       get_task_comm(fence_drv->timeline_name, current);
>  
> -     xa_lock_irqsave(&adev->userq_xa, flags);
> -     r = xa_err(__xa_store(&adev->userq_xa, userq->doorbell_index,
> -                           fence_drv, GFP_KERNEL));
> -     xa_unlock_irqrestore(&adev->userq_xa, flags);
> -     if (r)
> -             goto free_seq64;
> -
>       userq->fence_drv = fence_drv;
>  
>       return 0;
>  
> -free_seq64:
> -     amdgpu_seq64_free(adev, fence_drv->va);
>  free_fence_drv:
>       kfree(fence_drv);
>  
> @@ -187,11 +177,9 @@ void amdgpu_userq_fence_driver_destroy(struct kref *ref)
>       struct amdgpu_userq_fence_driver *fence_drv = container_of(ref,
>                                        struct amdgpu_userq_fence_driver,
>                                        refcount);
> -     struct amdgpu_userq_fence_driver *xa_fence_drv;
>       struct amdgpu_device *adev = fence_drv->adev;
>       struct amdgpu_userq_fence *fence, *tmp;
> -     struct xarray *xa = &adev->userq_xa;
> -     unsigned long index, flags;
> +     unsigned long flags;
>       struct dma_fence *f;
>  
>       spin_lock_irqsave(&fence_drv->fence_list_lock, flags);
> @@ -208,12 +196,6 @@ void amdgpu_userq_fence_driver_destroy(struct kref *ref)
>       }
>       spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags);
>  
> -     xa_lock_irqsave(xa, flags);
> -     xa_for_each(xa, index, xa_fence_drv)
> -             if (xa_fence_drv == fence_drv)
> -                     __xa_erase(xa, index);
> -     xa_unlock_irqrestore(xa, flags);
> -
>       /* Free seq64 memory */
>       amdgpu_seq64_free(adev, fence_drv->va);
>       kfree(fence_drv);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 78d1f3eb522e..c0921977b853 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -6488,14 +6488,14 @@ static int gfx_v11_0_eop_irq(struct amdgpu_device 
> *adev,
>       DRM_DEBUG("IH: CP EOP\n");
>  
>       if (adev->enable_mes && doorbell_offset) {
> -             struct amdgpu_userq_fence_driver *fence_drv = NULL;
> -             struct xarray *xa = &adev->userq_xa;
> +             struct amdgpu_usermode_queue *queue = NULL;

Please drop initializing the local variables to NULL, we have automated 
checkers which complain about that.



> +             struct xarray *xa = &adev->userq_doorbell_xa;
>               unsigned long flags;
>  
>               xa_lock_irqsave(xa, flags);
> -             fence_drv = xa_load(xa, doorbell_offset);
> -             if (fence_drv)
> -                     amdgpu_userq_fence_driver_process(fence_drv);
> +             queue = xa_load(xa, doorbell_offset);
> +             if (queue)
> +                     amdgpu_userq_fence_driver_process(queue->fence_drv);
>               xa_unlock_irqrestore(xa, flags);
>       } else {
>               me_id = (entry->ring_id & 0x0c) >> 2;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index a418ae609c36..d4faceab8f88 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -4854,14 +4854,14 @@ static int gfx_v12_0_eop_irq(struct amdgpu_device 
> *adev,
>       DRM_DEBUG("IH: CP EOP\n");
>  
>       if (adev->enable_mes && doorbell_offset) {
> -             struct amdgpu_userq_fence_driver *fence_drv = NULL;
> -             struct xarray *xa = &adev->userq_xa;
> +             struct xarray *xa = &adev->userq_doorbell_xa;
> +             struct amdgpu_usermode_queue *queue = NULL;

Same here and maybe other places I might have missed.

Apart from that looks really good to me.

Regards,
Christian.

>               unsigned long flags;
>  
>               xa_lock_irqsave(xa, flags);
> -             fence_drv = xa_load(xa, doorbell_offset);
> -             if (fence_drv)
> -                     amdgpu_userq_fence_driver_process(fence_drv);
> +             queue = xa_load(xa, doorbell_offset);
> +             if (queue)
> +                     amdgpu_userq_fence_driver_process(queue->fence_drv);
>               xa_unlock_irqrestore(xa, flags);
>       } else {
>               me_id = (entry->ring_id & 0x0c) >> 2;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> index db49582a211f..642ddd9473cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
> @@ -3643,14 +3643,15 @@ static int gfx_v12_1_eop_irq(struct amdgpu_device 
> *adev,
>       DRM_DEBUG("IH: CP EOP\n");
>  
>       if (adev->enable_mes && doorbell_offset) {
> -             struct amdgpu_userq_fence_driver *fence_drv = NULL;
> -             struct xarray *xa = &adev->userq_xa;
> +             struct xarray *xa = &adev->userq_doorbell_xa;
> +             struct amdgpu_usermode_queue *queue = NULL;
>               unsigned long flags;
>  
>               xa_lock_irqsave(xa, flags);
> -             fence_drv = xa_load(xa, doorbell_offset);
> -             if (fence_drv)
> -                     amdgpu_userq_fence_driver_process(fence_drv);
> +             queue = xa_load(xa, doorbell_offset);
> +             if (queue)
> +                     amdgpu_userq_fence_driver_process(queue->fence_drv);
> +
>               xa_unlock_irqrestore(xa, flags);
>       } else {
>               me_id = (entry->ring_id & 0x0c) >> 2;
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> index b005672f2f96..301e9364aff1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> @@ -1662,16 +1662,16 @@ static int sdma_v6_0_process_fence_irq(struct 
> amdgpu_device *adev,
>       u32 doorbell_offset = entry->src_data[0];
>  
>       if (adev->enable_mes && doorbell_offset) {
> -             struct amdgpu_userq_fence_driver *fence_drv = NULL;
> -             struct xarray *xa = &adev->userq_xa;
> +             struct amdgpu_usermode_queue *queue = NULL;
> +             struct xarray *xa = &adev->userq_doorbell_xa;
>               unsigned long flags;
>  
>               doorbell_offset >>= SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
>  
>               xa_lock_irqsave(xa, flags);
> -             fence_drv = xa_load(xa, doorbell_offset);
> -             if (fence_drv)
> -                     amdgpu_userq_fence_driver_process(fence_drv);
> +             queue = xa_load(xa, doorbell_offset);
> +             if (queue)
> +                     amdgpu_userq_fence_driver_process(queue->fence_drv);
>               xa_unlock_irqrestore(xa, flags);
>       }
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> index 5679a94d0815..2660e8f08daa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> @@ -1594,16 +1594,16 @@ static int sdma_v7_0_process_fence_irq(struct 
> amdgpu_device *adev,
>       u32 doorbell_offset = entry->src_data[0];
>  
>       if (adev->enable_mes && doorbell_offset) {
> -             struct amdgpu_userq_fence_driver *fence_drv = NULL;
> -             struct xarray *xa = &adev->userq_xa;
> +             struct xarray *xa = &adev->userq_doorbell_xa;
> +             struct amdgpu_usermode_queue *queue = NULL;
>               unsigned long flags;
>  
>               doorbell_offset >>= SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
>  
>               xa_lock_irqsave(xa, flags);
> -             fence_drv = xa_load(xa, doorbell_offset);
> -             if (fence_drv)
> -                     amdgpu_userq_fence_driver_process(fence_drv);
> +             queue = xa_load(xa, doorbell_offset);
> +             if (queue)
> +                     amdgpu_userq_fence_driver_process(queue->fence_drv);
>               xa_unlock_irqrestore(xa, flags);
>       }
>  

Reply via email to