On Tue, Jan 13, 2026 at 3:42 AM Jesse.Zhang <[email protected]> wrote:
>
> In error scenarios (e.g., malformed commands), user queue fences may never
> be signaled, leaving processes waiting indefinitely. To address this while
> still supporting infinite fence waits, implement an independent timeout
> detection mechanism:
>
> 1. Initialize a hang-detect delayed work when creating a user queue
>    (one-time setup).
> 2. Arm the work with a queue-type-specific timeout (gfx/compute/sdma)
>    when the last fence is created via amdgpu_userq_signal_ioctl
>    (per-fence timing).
> 3. Trigger the queue reset logic if the timeout expires before the
>    fence is signaled (see the sketch below).
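>
> A condensed sketch of these three steps, using hypothetical simplified
> names (struct uq and uq_reset() stand in for the real driver types and
> reset path; the actual code is in the diff below):
>
>     #include <linux/workqueue.h>
>     #include <linux/jiffies.h>
>     #include <linux/dma-fence.h>
>
>     struct uq {
>             struct delayed_work hang_detect_work;
>             struct dma_fence *hang_detect_fence;
>     };
>
>     static void uq_reset(struct uq *q)
>     {
>             /* placeholder for the queue reset logic (step 3) */
>     }
>
>     static void uq_hang_detect_fn(struct work_struct *work)
>     {
>             struct uq *q = container_of(work, struct uq,
>                                         hang_detect_work.work);
>             struct dma_fence *f = READ_ONCE(q->hang_detect_fence);
>
>             /* timeout expired with the fence still unsignaled */
>             if (f && !dma_fence_is_signaled(f))
>                     uq_reset(q);
>     }
>
>     /* step 1: one-time setup when the queue is created */
>     static void uq_init_hang_detect(struct uq *q)
>     {
>             INIT_DELAYED_WORK(&q->hang_detect_work, uq_hang_detect_fn);
>     }
>
>     /* step 2: arm per fence, from the signal ioctl */
>     static void uq_arm_hang_detect(struct uq *q, struct dma_fence *fence,
>                                    unsigned long timeout_ms)
>     {
>             WRITE_ONCE(q->hang_detect_fence, fence);
>             schedule_delayed_work(&q->hang_detect_work,
>                                   msecs_to_jiffies(timeout_ms));
>     }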
>
> v2: make the timeout per queue type (adev->gfx_timeout vs
>     adev->compute_timeout vs adev->sdma_timeout) for consistency with
>     kernel queues. (Alex)
> v3: make the timeout detection independent of the fence, i.e. don't
>     wait on the fence with a timeout, but rather start the timeout as
>     soon as the fence is initialized. (Christian)
> v4: replace the timer with the `hang_detect_work` delayed work (see
>     the note below).
>
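> The v4 switch from a timer to a delayed work matters because the
> handler takes uq_mgr->userq_mutex: a delayed work handler runs in
> process context, where sleeping locks are allowed, while a timer
> callback runs in softirq (atomic) context, where mutex_lock() must not
> be used. A minimal illustration, with hypothetical names:
>
>     #include <linux/timer.h>
>     #include <linux/workqueue.h>
>     #include <linux/mutex.h>
>
>     static DEFINE_MUTEX(example_mutex);
>
>     static void example_timer_cb(struct timer_list *t)
>     {
>             /* softirq context: mutex_lock(&example_mutex) here could
>              * sleep in atomic context and is therefore forbidden */
>     }
>
>     static void example_work_fn(struct work_struct *work)
>     {
>             /* process context: sleeping is fine */
>             mutex_lock(&example_mutex);
>             /* ... inspect state, kick off a reset ... */
>             mutex_unlock(&example_mutex);
>     }
>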
> Signed-off-by: Jesse Zhang <[email protected]>

Reviewed-by: Alex Deucher <[email protected]>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c     | 70 ++++++++++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h     |  3 +
>  .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |  1 +
>  3 files changed, 73 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 98110f543307..664a15278c1d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -148,6 +148,69 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>         return r;
>  }
>
> +static void amdgpu_userq_hang_detect_work(struct work_struct *work)
> +{
> +       struct amdgpu_usermode_queue *queue = container_of(work,
> +                                                          struct amdgpu_usermode_queue,
> +                                                          hang_detect_work.work);
> +       struct dma_fence *fence;
> +       struct amdgpu_userq_mgr *uq_mgr;
> +
> +       if (!queue || !queue->userq_mgr)
> +               return;
> +
> +       uq_mgr = queue->userq_mgr;
> +       fence = READ_ONCE(queue->hang_detect_fence);
> +       /* Fence already signaled – no action needed */
> +       if (!fence || dma_fence_is_signaled(fence))
> +               return;
> +
> +       mutex_lock(&uq_mgr->userq_mutex);
> +       amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +       mutex_unlock(&uq_mgr->userq_mutex);
> +}
> +
> +/*
> + * Start hang detection for a user queue fence. A delayed work will be
> + * scheduled to check if the fence is still pending after the timeout
> + * period.
> + */
> +void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
> +{
> +       struct amdgpu_device *adev;
> +       unsigned long timeout_ms;
> +
> +       if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev)
> +               return;
> +
> +       adev = queue->userq_mgr->adev;
> +       /* Determine timeout based on queue type */
> +       switch (queue->queue_type) {
> +       case AMDGPU_RING_TYPE_GFX:
> +               timeout_ms = adev->gfx_timeout;
> +               break;
> +       case AMDGPU_RING_TYPE_COMPUTE:
> +               timeout_ms = adev->compute_timeout;
> +               break;
> +       case AMDGPU_RING_TYPE_SDMA:
> +               timeout_ms = adev->sdma_timeout;
> +               break;
> +       default:
> +               timeout_ms = adev->gfx_timeout;
> +               break;
> +       }
> +
> +       /* Store the fence to monitor and schedule hang detection */
> +       WRITE_ONCE(queue->hang_detect_fence, queue->last_fence);
> +       schedule_delayed_work(&queue->hang_detect_work,
> +                             msecs_to_jiffies(timeout_ms));
> +}
> +
> +static void amdgpu_userq_init_hang_detect_work(struct amdgpu_usermode_queue *queue)
> +{
> +       INIT_DELAYED_WORK(&queue->hang_detect_work, amdgpu_userq_hang_detect_work);
> +       queue->hang_detect_fence = NULL;
> +}
> +
>  static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
>                                             struct amdgpu_bo_va_mapping *va_map, u64 addr)
>  {
> @@ -572,7 +635,6 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
>
>         cancel_delayed_work_sync(&uq_mgr->resume_work);
>         mutex_lock(&uq_mgr->userq_mutex);
> -
>         queue = amdgpu_userq_find(uq_mgr, queue_id);
>         if (!queue) {
>                 drm_dbg_driver(adev_to_drm(uq_mgr->adev), "Invalid queue id to destroy\n");
> @@ -580,6 +642,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
>                 return -EINVAL;
>         }
>         amdgpu_userq_wait_for_last_fence(queue);
> +       /* Cancel any pending hang detection work and cleanup */
> +       if (queue->hang_detect_fence) {
> +               cancel_delayed_work_sync(&queue->hang_detect_work);
> +               queue->hang_detect_fence = NULL;
> +       }
>         r = amdgpu_bo_reserve(queue->db_obj.obj, true);
>         if (!r) {
>                 amdgpu_bo_unpin(queue->db_obj.obj);
> @@ -818,6 +885,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>         queue->debugfs_queue = debugfs_create_dir(queue_name, filp->debugfs_client);
>         debugfs_create_file("mqd_info", 0444, queue->debugfs_queue, queue, &amdgpu_mqd_info_fops);
>  #endif
> +       amdgpu_userq_init_hang_detect_work(queue);
>         kfree(queue_name);
>
>         args->out.queue_id = qid;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 1eaa94f8a291..06a06272b41a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -72,6 +72,8 @@ struct amdgpu_usermode_queue {
>         u32                     xcp_id;
>         int                     priority;
>         struct dentry           *debugfs_queue;
> +       struct delayed_work     hang_detect_work;
> +       struct dma_fence        *hang_detect_fence;
>
>         struct list_head        userq_va_list;
>  };
> @@ -146,6 +148,7 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>  void amdgpu_userq_reset_work(struct work_struct *work);
>  void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
>  int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
> +void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue);
>
>  int amdgpu_userq_input_va_validate(struct amdgpu_device *adev,
>                                    struct amdgpu_usermode_queue *queue,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> index 25f178536469..374fbd0e859a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> @@ -569,6 +569,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
>
>         dma_fence_put(queue->last_fence);
>         queue->last_fence = dma_fence_get(fence);
> +       amdgpu_userq_start_hang_detect_work(queue);
>         mutex_unlock(&userq_mgr->userq_mutex);
>
>         drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
> --
> 2.49.0
>
