On Fri, Jun 5, 2026 at 6:39 AM Jesse Zhang <[email protected]> wrote:
>
> The CP priv-fault IRQ carries the offending queue's per-process
> doorbell_id in src_data[0][9:0] (the same encoding KFD extracts via
> KFD_CTXID0_DOORBELL_ID_MASK), with the pasid in entry->pasid. The
> existing lookup against the BAR-absolute doorbell_index never matches
> this payload, so the targeted-reset path is never taken.
>
> Cache args->in.doorbell_offset on the queue and look it up by the
> (vm->pasid, doorbell_offset) pair. Add a queue->guilty flag set by
> the IRQ and consumed via xchg in hang_detect_work; on a hit, route
> compute reset through amdgpu_gfx_reset_mes_compute(..., queue, ...)
> and fire hang_detect immediately via mod_delayed_work(..., 0)
> (queue_delayed_work() is a no-op when the work is already armed at
> submit time).

I would reorder/squash these patches so that the series first adds
support for handling the IH token format for this interrupt, and then
implements the interrupt handling for the user queues.

Alex

>
> Signed-off-by: Jesse Zhang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 37 ++++++++++++++++++-----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h |  5 ++-
>  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c    | 10 ++++--
>  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c    | 10 ++++--
>  4 files changed, 49 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 99c5adf3cb24..231ffb29fe5e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -140,9 +140,14 @@ static void amdgpu_userq_hang_detect_work(struct 
> work_struct *work)
>                                                  
> AMDGPU_RESET_TYPE_PER_QUEUE)) {
>                 int r;
>
> -               if (queue->queue_type == AMDGPU_HW_IP_COMPUTE)
> -                       r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, 
> NULL, NULL, NULL);
> -               else
> +               if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) {
> +                       struct amdgpu_usermode_queue *guilty_uq;
> +
> +                       /* IRQ-side WRITE_ONCE(guilty,true) hand-off via xchg 
> */
> +                       guilty_uq = xchg(&queue->guilty, false) ? queue : 
> NULL;
> +                       r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL,
> +                                                        guilty_uq, NULL, 
> NULL);
> +               } else
>                         r = userq_funcs->reset(queue);
>                 if (r)
>                         gpu_reset = true;
> @@ -671,6 +676,7 @@ amdgpu_userq_create(struct drm_file *filp, union 
> drm_amdgpu_userq *args)
>         }
>
>         queue->doorbell_index = index;
> +       queue->doorbell_offset = (u32)args->in.doorbell_offset;
>         r = uq_funcs->mqd_create(queue, &args->in);
>         if (r) {
>                 drm_file_err(uq_mgr->file, "Failed to create Queue\n");
> @@ -1111,16 +1117,31 @@ static void amdgpu_userq_restore_worker(struct 
> work_struct *work)
>         dma_fence_put(ev_fence);
>  }
>
> -void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, u32 doorbell)
> +void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev,
> +                                   u32 pasid, u32 doorbell_offset)
>  {
>         struct xarray *xa = &adev->userq_doorbell_xa;
>         struct amdgpu_usermode_queue *queue;
> -       unsigned long flags;
> +       unsigned long flags, idx;
>
> +       /*
> +        * CP priv-fault payload is (pasid, src_data[0] & 0x3ff) — the same
> +        * per-process doorbell encoding KFD extracts via
> +        * KFD_CTXID0_DOORBELL_ID_MASK. Find the offending queue by the
> +        * (vm->pasid, doorbell_offset) pair, mark it guilty and fire
> +        * hang_detect immediately (queue_delayed_work() would no-op if the
> +        * work is already armed at submit time).
> +        */
>         xa_lock_irqsave(xa, flags);
> -       queue = xa_load(xa, doorbell);
> -       if (queue)
> -               amdgpu_userq_start_hang_detect_work(queue);
> +       xa_for_each(xa, idx, queue) {
> +               if (queue->vm && queue->vm->pasid == pasid &&
> +                   queue->doorbell_offset == doorbell_offset) {
> +                       WRITE_ONCE(queue->guilty, true);
> +                       mod_delayed_work(adev->reset_domain->wq,
> +                                        &queue->hang_detect_work, 0);
> +                       break;
> +               }
> +       }
>         xa_unlock_irqrestore(xa, flags);
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 5a2ae33135da..1e765cd765ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -53,6 +53,7 @@ struct amdgpu_usermode_queue {
>         enum amdgpu_userq_state state;
>         uint64_t                doorbell_handle;
>         uint64_t                doorbell_index;
> +       u32                     doorbell_offset;
>         uint64_t                flags;
>         struct amdgpu_mqd_prop  *userq_prop;
>         struct amdgpu_userq_mgr *userq_mgr;
> @@ -86,6 +87,7 @@ struct amdgpu_usermode_queue {
>          * Delayed work which runs when userq_fences time out.
>          */
>         struct delayed_work     hang_detect_work;
> +       bool                    guilty;
>         struct kref             refcount;
>
>         union {
> @@ -175,7 +177,8 @@ void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
>  int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
>  void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue 
> *queue);
>  void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 
> doorbell);
> -void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, u32 
> doorbell);
> +void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev,
> +                                   u32 pasid, u32 doorbell_offset);
>
>  int amdgpu_userq_input_va_validate(struct amdgpu_device *adev,
>                                    struct amdgpu_usermode_queue *queue,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index ea323d241324..27d0a3dbfce8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -6681,10 +6681,16 @@ static int gfx_v11_0_set_priv_inst_fault_state(struct 
> amdgpu_device *adev,
>  static void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev,
>                                         struct amdgpu_iv_entry *entry)
>  {
> -       u32 doorbell_offset = entry->src_data[0];
> +       /*
> +        * CP packs the per-process doorbell_id in src_data[0][9:0]; upper
> +        * bits hold other CTXID0 fields. Same mask KFD uses
> +        * (KFD_CTXID0_DOORBELL_ID_MASK).
> +        */
> +       u32 doorbell_offset = entry->src_data[0] & 0x3ff;
>
>         if (adev->enable_mes && doorbell_offset) {
> -               amdgpu_userq_process_reset_irq(adev, doorbell_offset);
> +               amdgpu_userq_process_reset_irq(adev, entry->pasid,
> +                                              doorbell_offset);
>         } else if (!adev->gfx.disable_kq) {
>                 u8 me_id = (entry->ring_id & 0x0c) >> 2;
>                 u8 pipe_id = (entry->ring_id & 0x03) >> 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index b3f1bbf3fc13..7c269cb75e07 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -5009,10 +5009,16 @@ static int gfx_v12_0_set_priv_inst_fault_state(struct 
> amdgpu_device *adev,
>  static void gfx_v12_0_handle_priv_fault(struct amdgpu_device *adev,
>                                         struct amdgpu_iv_entry *entry)
>  {
> -       u32 doorbell_offset = entry->src_data[0];
> +       /*
> +        * CP packs the per-process doorbell_id in src_data[0][9:0]; upper
> +        * bits hold other CTXID0 fields. Same mask KFD uses
> +        * (KFD_CTXID0_DOORBELL_ID_MASK).
> +        */
> +       u32 doorbell_offset = entry->src_data[0] & 0x3ff;
>
>         if (adev->enable_mes && doorbell_offset) {
> -               amdgpu_userq_process_reset_irq(adev, doorbell_offset);
> +               amdgpu_userq_process_reset_irq(adev, entry->pasid,
> +                                              doorbell_offset);
>         } else if (!adev->gfx.disable_kq) {
>                 u8 me_id, pipe_id, queue_id;
>                 struct amdgpu_ring *ring;
> --
> 2.49.0
>

Reply via email to