Re: [PATCH v4] drm/amdgpu/userq: refcount userqueues to avoid any race conditions

2026-03-03 Thread Khatri, Sunil



On 03-03-2026 07:36 pm, Christian König wrote:

On 3/3/26 14:49, Khatri, Sunil wrote:

@@ -1000,11 +1018,14 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void 
*data,
   drm_file_err(filp, "Failed to create usermode queue\n");
   break;
   -    case AMDGPU_USERQ_OP_FREE:
-    r = amdgpu_userq_destroy(filp, args->in.queue_id);
-    if (r)
-    drm_file_err(filp, "Failed to destroy usermode queue\n");
+    case AMDGPU_USERQ_OP_FREE: {
+    queue = xa_erase(&fpriv->userq_mgr.userq_xa, args->in.queue_id);

You need to have xa_lock around that or otherwise xa_load/kref_get can race.

__xa_erase is unlocked version, but xa_erase internally does have xa_lock, 
since there is nothing else but just the erase here, so it made more sense to have 
clear xa_erase.

No, that doesn't work like that.

The problem is that xa_erase() takes the lock only optionally when it can't 
exchange the entry with NULL atomically (e.g. because a full page needs to be 
freed up or similar).

But in our use case taking the lock is mandatory because kref_get otherwise 
doesn't work.
Ok, sure, I will add explicit locks in that case. Thanks for that 
information.


Regards
Sunil Khatri


Regards,
Christian.



Re: [PATCH v4] drm/amdgpu/userq: refcount userqueues to avoid any race conditions

2026-03-03 Thread Christian König
On 3/3/26 14:49, Khatri, Sunil wrote:
>>> @@ -1000,11 +1018,14 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void 
>>> *data,
>>>   drm_file_err(filp, "Failed to create usermode queue\n");
>>>   break;
>>>   -    case AMDGPU_USERQ_OP_FREE:
>>> -    r = amdgpu_userq_destroy(filp, args->in.queue_id);
>>> -    if (r)
>>> -    drm_file_err(filp, "Failed to destroy usermode queue\n");
>>> +    case AMDGPU_USERQ_OP_FREE: {
>>> +    queue = xa_erase(&fpriv->userq_mgr.userq_xa, args->in.queue_id);
>> You need to have xa_lock around that or otherwise xa_load/kref_get can race.
> __xa_erase is unlocked version, but xa_erase internally does have xa_lock, 
> since there is nothing else but just the erase here, so it made more sense to have 
> clear xa_erase.

No, that doesn't work like that.

The problem is that xa_erase() takes the lock only optionally when it can't 
exchange the entry with NULL atomically (e.g. because a full page needs to be 
freed up or similar).

But in our use case taking the lock is mandatory because kref_get otherwise 
doesn't work.

Regards,
Christian.



Re: [PATCH v4] drm/amdgpu/userq: refcount userqueues to avoid any race conditions

2026-03-03 Thread Khatri, Sunil



On 03-03-2026 07:16 pm, Christian König wrote:

On 3/3/26 13:06, Sunil Khatri wrote:

To avoid race condition and avoid UAF cases, implement kref
based queues and protect the below operations using xa lock
a. Getting a queue from xarray
b. Increment/Decrement it's refcount

Every time someone wants to access a queue, always get it via
amdgpu_userq_get to make sure we have locks in place and get
the object if active.

A userqueue is destroyed when the last refcount is dropped, which
typically would be via IOCTL or during fini.

v2: Add the missing drop in one of the conditions in the signal ioctl [Alex]

v3: remove the queue from the xarray first in the free queue ioctl path
 [Christian]

- Pass queue to the amdgpu_userq_put directly.
- make amdgpu_userq_put xa_lock free since we are doing put for each get
   only and final put is done via destroy and we remove the queue from xa
   with lock.
- use userq_put in fini too so cleanup is done fully.

v4: Use xa_erase directly rather than doing load and erase in free
 ioctl. Also remove some of the error logs which could be exploited
 by the user to flood the logs [Christian]

Signed-off-by: Sunil Khatri 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 110 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h |   4 +
  .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |  14 ++-
  3 files changed, 91 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index e07b2082cf25..ed6a9d779d1b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -446,8 +446,7 @@ static int amdgpu_userq_wait_for_last_fence(struct 
amdgpu_usermode_queue *queue)
return ret;
  }
  
-static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue,

-u32 queue_id)
+static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue)
  {
struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
struct amdgpu_device *adev = uq_mgr->adev;
@@ -461,7 +460,6 @@ static void amdgpu_userq_cleanup(struct 
amdgpu_usermode_queue *queue,
uq_funcs->mqd_destroy(queue);
amdgpu_userq_fence_driver_free(queue);
/* Use interrupt-safe locking since IRQ handlers may access these 
XArrays */
-   xa_erase_irq(&uq_mgr->userq_xa, queue_id);
xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
queue->userq_mgr = NULL;
list_del(&queue->userq_va_list);
@@ -470,12 +468,6 @@ static void amdgpu_userq_cleanup(struct 
amdgpu_usermode_queue *queue,
up_read(&adev->reset_domain->sem);
  }
  
-static struct amdgpu_usermode_queue *

-amdgpu_userq_find(struct amdgpu_userq_mgr *uq_mgr, u32 qid)
-{
-   return xa_load(&uq_mgr->userq_xa, qid);
-}
-
  void
  amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *uq_mgr,
 struct amdgpu_eviction_fence_mgr *evf_mgr)
@@ -625,22 +617,13 @@ amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr 
*uq_mgr,
  }
  
  static int

-amdgpu_userq_destroy(struct drm_file *filp, u32 queue_id)
+amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct 
amdgpu_usermode_queue *queue)
  {
-   struct amdgpu_fpriv *fpriv = filp->driver_priv;
-   struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr;
struct amdgpu_device *adev = uq_mgr->adev;
-   struct amdgpu_usermode_queue *queue;
int r = 0;
  
  	cancel_delayed_work_sync(&uq_mgr->resume_work);

mutex_lock(&uq_mgr->userq_mutex);
-   queue = amdgpu_userq_find(uq_mgr, queue_id);
-   if (!queue) {
-   drm_dbg_driver(adev_to_drm(uq_mgr->adev), "Invalid queue id to 
destroy\n");
-   mutex_unlock(&uq_mgr->userq_mutex);
-   return -EINVAL;
-   }
amdgpu_userq_wait_for_last_fence(queue);
/* Cancel any pending hang detection work and cleanup */
if (queue->hang_detect_fence) {
@@ -672,13 +655,44 @@ amdgpu_userq_destroy(struct drm_file *filp, u32 queue_id)
drm_warn(adev_to_drm(uq_mgr->adev), "trying to destroy a HW mapping 
userq\n");
queue->state = AMDGPU_USERQ_STATE_HUNG;
}
-   amdgpu_userq_cleanup(queue, queue_id);
+   amdgpu_userq_cleanup(queue);
mutex_unlock(&uq_mgr->userq_mutex);
  
  	pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
  
  	return r;

  }
+static void amdgpu_userq_kref_destroy(struct kref *kref)
+{
+   int r;
+   struct amdgpu_usermode_queue *queue =
+   container_of(kref, struct amdgpu_usermode_queue, refcount);
+
+   struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
+
+   r = amdgpu_userq_destroy(uq_mgr, queue);
+   if (r)
+   drm_file_err(uq_mgr->file, "Failed to destroy usermode 
queue\n");
+}
+
+struct amdgpu_usermode_queue *amdgpu_userq_get(struct amdgpu_userq_mgr 
*uq_mgr, u32 qid)
+{
+   struct amdgpu_usermode_queue *queue;

Re: [PATCH v4] drm/amdgpu/userq: refcount userqueues to avoid any race conditions

2026-03-03 Thread Christian König
On 3/3/26 13:06, Sunil Khatri wrote:
> To avoid race condition and avoid UAF cases, implement kref
> based queues and protect the below operations using xa lock
> a. Getting a queue from xarray
> b. Increment/Decrement it's refcount
> 
> Every time someone wants to access a queue, always get it via
> amdgpu_userq_get to make sure we have locks in place and get
> the object if active.
> 
> A userqueue is destroyed when the last refcount is dropped, which
> typically would be via IOCTL or during fini.
> 
> v2: Add the missing drop in one of the conditions in the signal ioctl [Alex]
> 
> v3: remove the queue from the xarray first in the free queue ioctl path
> [Christian]
> 
> - Pass queue to the amdgpu_userq_put directly.
> - make amdgpu_userq_put xa_lock free since we are doing put for each get
>   only and final put is done via destroy and we remove the queue from xa
>   with lock.
> - use userq_put in fini too so cleanup is done fully.
> 
> v4: Use xa_erase directly rather than doing load and erase in free
> ioctl. Also remove some of the error logs which could be exploited
> by the user to flood the logs [Christian]
> 
> Signed-off-by: Sunil Khatri 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 110 --
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h |   4 +
>  .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |  14 ++-
>  3 files changed, 91 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index e07b2082cf25..ed6a9d779d1b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -446,8 +446,7 @@ static int amdgpu_userq_wait_for_last_fence(struct 
> amdgpu_usermode_queue *queue)
>   return ret;
>  }
>  
> -static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue,
> -  u32 queue_id)
> +static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue)
>  {
>   struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
>   struct amdgpu_device *adev = uq_mgr->adev;
> @@ -461,7 +460,6 @@ static void amdgpu_userq_cleanup(struct 
> amdgpu_usermode_queue *queue,
>   uq_funcs->mqd_destroy(queue);
>   amdgpu_userq_fence_driver_free(queue);
>   /* Use interrupt-safe locking since IRQ handlers may access these 
> XArrays */
> - xa_erase_irq(&uq_mgr->userq_xa, queue_id);
>   xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
>   queue->userq_mgr = NULL;
>   list_del(&queue->userq_va_list);
> @@ -470,12 +468,6 @@ static void amdgpu_userq_cleanup(struct 
> amdgpu_usermode_queue *queue,
>   up_read(&adev->reset_domain->sem);
>  }
>  
> -static struct amdgpu_usermode_queue *
> -amdgpu_userq_find(struct amdgpu_userq_mgr *uq_mgr, u32 qid)
> -{
> - return xa_load(&uq_mgr->userq_xa, qid);
> -}
> -
>  void
>  amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *uq_mgr,
>struct amdgpu_eviction_fence_mgr *evf_mgr)
> @@ -625,22 +617,13 @@ amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr 
> *uq_mgr,
>  }
>  
>  static int
> -amdgpu_userq_destroy(struct drm_file *filp, u32 queue_id)
> +amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct 
> amdgpu_usermode_queue *queue)
>  {
> - struct amdgpu_fpriv *fpriv = filp->driver_priv;
> - struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr;
>   struct amdgpu_device *adev = uq_mgr->adev;
> - struct amdgpu_usermode_queue *queue;
>   int r = 0;
>  
>   cancel_delayed_work_sync(&uq_mgr->resume_work);
>   mutex_lock(&uq_mgr->userq_mutex);
> - queue = amdgpu_userq_find(uq_mgr, queue_id);
> - if (!queue) {
> - drm_dbg_driver(adev_to_drm(uq_mgr->adev), "Invalid queue id to 
> destroy\n");
> - mutex_unlock(&uq_mgr->userq_mutex);
> - return -EINVAL;
> - }
>   amdgpu_userq_wait_for_last_fence(queue);
>   /* Cancel any pending hang detection work and cleanup */
>   if (queue->hang_detect_fence) {
> @@ -672,13 +655,44 @@ amdgpu_userq_destroy(struct drm_file *filp, u32 
> queue_id)
>   drm_warn(adev_to_drm(uq_mgr->adev), "trying to destroy a HW 
> mapping userq\n");
>   queue->state = AMDGPU_USERQ_STATE_HUNG;
>   }
> - amdgpu_userq_cleanup(queue, queue_id);
> + amdgpu_userq_cleanup(queue);
>   mutex_unlock(&uq_mgr->userq_mutex);
>  
>   pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
>  
>   return r;
>  }
> +static void amdgpu_userq_kref_destroy(struct kref *kref)
> +{
> + int r;
> + struct amdgpu_usermode_queue *queue =
> + container_of(kref, struct amdgpu_usermode_queue, refcount);
> +
> + struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
> +
> + r = amdgpu_userq_destroy(uq_mgr, queue);
> + if (r)
> + drm_file_err(uq_mgr->file, "Failed to destroy usermode 
> queue\n");
> +}
> +
> +struct amdgpu_usermode_queue *am

Re: [PATCH v4] drm/amdgpu/userq: refcount userqueues to avoid any race conditions

2026-03-03 Thread Khatri, Sunil



On 03-03-2026 05:38 pm, Sunil Khatri wrote:

To avoid race condition and avoid UAF cases, implement kref
based queues and protect the below operations using xa lock
a. Getting a queue from xarray
b. Increment/Decrement it's refcount

Every time someone wants to access a queue, always get it via
amdgpu_userq_get to make sure we have locks in place and get
the object if active.

A userqueue is destroyed when the last refcount is dropped, which
typically would be via IOCTL or during fini.

v2: Add the missing drop in one of the conditions in the signal ioctl [Alex]

v3: remove the queue from the xarray first in the free queue ioctl path
 [Christian]

- Pass queue to the amdgpu_userq_put directly.
- make amdgpu_userq_put xa_lock free since we are doing put for each get
   only and final put is done via destroy and we remove the queue from xa
   with lock.
- use userq_put in fini too so cleanup is done fully.

v4: Use xa_erase directly rather than doing load and erase in free
 ioctl. Also remove some of the error logs which could be exploited
 by the user to flood the logs [Christian]

Signed-off-by: Sunil Khatri 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 110 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h |   4 +
  .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |  14 ++-
  3 files changed, 91 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index e07b2082cf25..ed6a9d779d1b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -446,8 +446,7 @@ static int amdgpu_userq_wait_for_last_fence(struct 
amdgpu_usermode_queue *queue)
return ret;
  }
  
-static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue,

-u32 queue_id)
+static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue)
  {
struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
struct amdgpu_device *adev = uq_mgr->adev;
@@ -461,7 +460,6 @@ static void amdgpu_userq_cleanup(struct 
amdgpu_usermode_queue *queue,
uq_funcs->mqd_destroy(queue);
amdgpu_userq_fence_driver_free(queue);
/* Use interrupt-safe locking since IRQ handlers may access these 
XArrays */
-   xa_erase_irq(&uq_mgr->userq_xa, queue_id);
xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
queue->userq_mgr = NULL;
list_del(&queue->userq_va_list);
@@ -470,12 +468,6 @@ static void amdgpu_userq_cleanup(struct 
amdgpu_usermode_queue *queue,
up_read(&adev->reset_domain->sem);
  }
  
-static struct amdgpu_usermode_queue *

-amdgpu_userq_find(struct amdgpu_userq_mgr *uq_mgr, u32 qid)
-{
-   return xa_load(&uq_mgr->userq_xa, qid);
-}
-
  void
  amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *uq_mgr,
 struct amdgpu_eviction_fence_mgr *evf_mgr)
@@ -625,22 +617,13 @@ amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr 
*uq_mgr,
  }
  
  static int

-amdgpu_userq_destroy(struct drm_file *filp, u32 queue_id)
+amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct 
amdgpu_usermode_queue *queue)
  {
-   struct amdgpu_fpriv *fpriv = filp->driver_priv;
-   struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr;
struct amdgpu_device *adev = uq_mgr->adev;
-   struct amdgpu_usermode_queue *queue;
int r = 0;
  
  	cancel_delayed_work_sync(&uq_mgr->resume_work);

mutex_lock(&uq_mgr->userq_mutex);
-   queue = amdgpu_userq_find(uq_mgr, queue_id);
-   if (!queue) {
-   drm_dbg_driver(adev_to_drm(uq_mgr->adev), "Invalid queue id to 
destroy\n");
-   mutex_unlock(&uq_mgr->userq_mutex);
-   return -EINVAL;
-   }
amdgpu_userq_wait_for_last_fence(queue);
/* Cancel any pending hang detection work and cleanup */
if (queue->hang_detect_fence) {
@@ -672,13 +655,44 @@ amdgpu_userq_destroy(struct drm_file *filp, u32 queue_id)
drm_warn(adev_to_drm(uq_mgr->adev), "trying to destroy a HW mapping 
userq\n");
queue->state = AMDGPU_USERQ_STATE_HUNG;
}
-   amdgpu_userq_cleanup(queue, queue_id);
+   amdgpu_userq_cleanup(queue);
mutex_unlock(&uq_mgr->userq_mutex);
  
  	pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
  
  	return r;

  }
+static void amdgpu_userq_kref_destroy(struct kref *kref)
+{
+   int r;
+   struct amdgpu_usermode_queue *queue =
+   container_of(kref, struct amdgpu_usermode_queue, refcount);
+
+   struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
+
+   r = amdgpu_userq_destroy(uq_mgr, queue);
+   if (r)
+   drm_file_err(uq_mgr->file, "Failed to destroy usermode 
queue\n");


Sorry, I missed printing r; I will add it in the next patch, or if there are no 
further comments I will add this before pushing the code to amd-staging.


Regards

Sunil khatri


+}
+
+struct