userq: fix access to stale wptr mapping

Khatri, Sunil Tue, 05 May 2026 00:29:52 -0700


On 04-05-2026 06:35 pm, Christian König wrote:

On 5/4/26 14:56, Sunil Khatri wrote:

Use drm_exec to take both locks i.e vm root bo and
wptr_obj bo to access the mapping data properly.

This fixes the security issue of unmap the wptr_obj while
a queue creation is in progress and passing other
bo at same address.

Signed-off-by: Sunil Khatri <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 122 ++++++++++-----------
  1 file changed, 57 insertions(+), 65 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c 
b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index 501e2e10b4a6..3d4f83015488 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -30,34 +30,6 @@
  #define AMDGPU_USERQ_PROC_CTX_SZ PAGE_SIZE
  #define AMDGPU_USERQ_GANG_CTX_SZ PAGE_SIZE

-static int

-mes_userq_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
-{
-       int ret;
-
-       ret = amdgpu_bo_reserve(bo, true);
-       if (ret) {
-               DRM_ERROR("Failed to reserve bo. ret %d\n", ret);
-               goto err_reserve_bo_failed;
-       }
-
-       ret = amdgpu_ttm_alloc_gart(&bo->tbo);
-       if (ret) {
-               DRM_ERROR("Failed to bind bo to GART. ret %d\n", ret);
-               goto err_map_bo_gart_failed;
-       }
-
-       amdgpu_bo_unreserve(bo);
-       bo = amdgpu_bo_ref(bo);
-
-       return 0;
-
-err_map_bo_gart_failed:
-       amdgpu_bo_unreserve(bo);
-err_reserve_bo_failed:
-       return ret;
-}
-
  static int
  mes_userq_create_wptr_mapping(struct amdgpu_device *adev,
                              struct amdgpu_userq_mgr *uq_mgr,
@@ -65,55 +37,75 @@ mes_userq_create_wptr_mapping(struct amdgpu_device *adev,
                              uint64_t wptr)
  {
        struct amdgpu_bo_va_mapping *wptr_mapping;
-       struct amdgpu_vm *wptr_vm;
        struct amdgpu_userq_obj *wptr_obj = &queue->wptr_obj;
+       struct amdgpu_bo *obj;
+       struct amdgpu_vm *vm = queue->vm;
+       struct drm_exec exec;
        int ret;

- wptr_vm = queue->vm;

-       ret = amdgpu_bo_reserve(wptr_vm->root.bo, false);
-       if (ret)
-               return ret;
-
        wptr &= AMDGPU_GMC_HOLE_MASK;
-       wptr_mapping = amdgpu_vm_bo_lookup_mapping(wptr_vm, wptr >> PAGE_SHIFT);
-       amdgpu_bo_unreserve(wptr_vm->root.bo);
-       if (!wptr_mapping) {
-               DRM_ERROR("Failed to lookup wptr bo\n");
-               return -EINVAL;
-       }

- wptr_obj->obj = wptr_mapping->bo_va->base.bo;

-       if (wptr_obj->obj->tbo.base.size > PAGE_SIZE) {
-               DRM_ERROR("Requested GART mapping for wptr bo larger than one 
page\n");
-               return -EINVAL;
-       }
+       drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);

This should probably be 2 instead of 0.

Noted

+       drm_exec_until_all_locked(&exec) {
+               ret = amdgpu_vm_lock_pd(vm, &exec, 1);
+               drm_exec_retry_on_contention(&exec);
+               if (unlikely(ret))
+                       goto fail_lock;
+
+               wptr_mapping = amdgpu_vm_bo_lookup_mapping(vm, wptr >> 
PAGE_SHIFT);
+               if (!wptr_mapping) {
+                       DRM_ERROR("Failed to lock up wptr bo\n");

Please drop that error message. It can spam the logs when userspace 
intentionally gives incorrect values.

Noted

+                       ret = -EINVAL;
+                       goto fail_lock;
+               }

- ret = mes_userq_map_gtt_bo_to_gart(wptr_obj->obj);

-       if (ret) {
-               DRM_ERROR("Failed to map wptr bo to GART\n");
-               return ret;
-       }
+               obj = wptr_mapping->bo_va->base.bo;
+               ret = drm_exec_prepare_obj(&exec, &obj->tbo.base, 1);

Using drm_exec_lock_obj() should be sufficient.

We don't need a fence slot for this use case.

Sure

+               drm_exec_retry_on_contention(&exec);
+               if (unlikely(ret)) {
+                       DRM_ERROR("Failed to prepare wptr bo\n");

Same here, that this can fail is normal handling. The worst case is OOM and 
that is already printed in the logs.

Noted

+                       goto fail_lock;
+               }

We got all the locks now, so the drm_exec_until_all_locked() loop can be closed 
here.

True, Noted

- ret = amdgpu_bo_reserve(wptr_obj->obj, true);

-       if (ret) {
-               DRM_ERROR("Failed to reserve wptr bo\n");
-               return ret;
-       }

+               /* mapping now should be stable since both the locks are held */
+               wptr_mapping = amdgpu_vm_bo_lookup_mapping(vm, wptr >> 
PAGE_SHIFT);
+               if (!wptr_mapping) {
+                       DRM_ERROR("Failed to lock up wptr bo\n");
+                       ret = -EINVAL;
+                       goto fail_lock;
+               }

Doing that again is unecessary. We are holding the VM lock above while doing 
the lockup.

First time we had mapping at time we did not had wptr object lock. Isntit possible that either mapping is changed/updated or the bo is freed.At this moment we have both the locks and we


should be having correct mapping.

- /* TODO use eviction fence instead of pinning. */

-       ret = amdgpu_bo_pin(wptr_obj->obj, AMDGPU_GEM_DOMAIN_GTT);
-       if (ret) {
-               drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin wptr 
bo\n");
-               goto unresv_bo;
-       }
+               wptr_obj->obj = amdgpu_bo_ref(wptr_mapping->bo_va->base.bo);

That is now unecessary as well.

We are holding the reference in original code and freeing it in destroy.Dont we want to hold any reference now, how do we make sure that its notbeing freed in between?

If we dont need a reference then we need to remove from the destroy pathtoo. In amdgpu_userq_destroy we are doing unpin and unref both db_objand wptr_obj.

+
+               if (wptr_obj->obj->tbo.base.size > PAGE_SIZE) {
+                       DRM_ERROR("Requested wptr bo size is larger than one 
page\n");

Same, please drop that error message.

Noted

+                       ret = -EINVAL;
+                       goto fail_map;
+               }
+
+               ret = amdgpu_ttm_alloc_gart(&wptr_obj->obj->tbo);
+               if (ret) {
+                       DRM_ERROR("Failed to bind bo to GART. ret %d\n", ret);

That error message is useful.

Got it.

+                       goto fail_map;
+               }
+
+               /* TODO use eviction fence instead of pinning. */
+               ret = amdgpu_bo_pin(wptr_obj->obj, AMDGPU_GEM_DOMAIN_GTT);

Oh! The piun needs to come before the alloc_gart! That's wrong in the existing 
code as well.

Got it.

+               if (ret) {
+                       DRM_ERROR("Failed to pin wptr bo. ret %d\n", ret);

That error message makes sense.

regards
Sunil Khatri


Regards,
Christian.

+                       goto fail_map;
+               }

- queue->wptr_obj.gpu_addr = amdgpu_bo_gpu_offset(wptr_obj->obj);

-       amdgpu_bo_unreserve(wptr_obj->obj);
+               queue->wptr_obj.gpu_addr = amdgpu_bo_gpu_offset(wptr_obj->obj);
+       }

+ drm_exec_fini(&exec);

        return 0;

-unresv_bo:

-       amdgpu_bo_unreserve(wptr_obj->obj);
+fail_map:
+       amdgpu_bo_unref(&wptr_obj->obj);
+fail_lock:
+       drm_exec_fini(&exec);
        return ret;

}

Re: [PATCH] drm/amdgpu/userq: fix access to stale wptr mapping

Reply via email to