AMD General

> -----Original Message-----
> From: Alex Deucher <[email protected]>
> Sent: Saturday, June 6, 2026 4:14 AM
> To: Zhang, Jesse(Jie) <[email protected]>
> Cc: [email protected]; Deucher, Alexander
> <[email protected]>; Koenig, Christian <[email protected]>
> Subject: Re: [PATCH 4/6] drm/amdgpu/userq: identify guilty user queue from
> priv-fault IRQ
>
> On Fri, Jun 5, 2026 at 6:39 AM Jesse Zhang <[email protected]> wrote:
> >
> > CP priv-fault IRQ carries the offending queue's per-process
> > doorbell_id in src_data[0][9:0] (same encoding KFD extracts via
> > KFD_CTXID0_DOORBELL_ID_MASK), with pasid in entry->pasid. The existing
> > lookup against BAR-absolute doorbell_index never matches this payload,
> > so the targeted-reset path is never taken.
> >
> > Cache args->in.doorbell_offset on the queue and look it up by the
> > (vm->pasid, doorbell_offset) pair. Add a queue->guilty flag set by the
> > IRQ and consumed via xchg in hang_detect_work; on a hit, route compute
> > reset through amdgpu_gfx_reset_mes_compute(..., queue, ...) and fire
> > hang_detect immediately via mod_delayed_work(..., 0)
> > (queue_delayed_work() is a no-op when the work is already armed at
> > submit time).
>
> I would reorder/squash these patches to add support for handling the IH token
> format for this interrupt and then implement the interrupt handling for the
> user queues.

I will do that in v2.

Thanks
Jesse
>
> Alex
>
> >
> > Signed-off-by: Jesse Zhang <[email protected]>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 37
> > ++++++++++++++++++-----  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h |  5
> ++-
> >  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c    | 10 ++++--
> >  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c    | 10 ++++--
> >  4 files changed, 49 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > index 99c5adf3cb24..231ffb29fe5e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > @@ -140,9 +140,14 @@ static void amdgpu_userq_hang_detect_work(struct
> work_struct *work)
> >                                                  
> > AMDGPU_RESET_TYPE_PER_QUEUE)) {
> >                 int r;
> >
> > -               if (queue->queue_type == AMDGPU_HW_IP_COMPUTE)
> > -                       r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, 
> > NULL,
> NULL, NULL);
> > -               else
> > +               if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) {
> > +                       struct amdgpu_usermode_queue *guilty_uq;
> > +
> > +                       /* IRQ-side WRITE_ONCE(guilty,true) hand-off via 
> > xchg */
> > +                       guilty_uq = xchg(&queue->guilty, false) ? queue : 
> > NULL;
> > +                       r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL,
> > +                                                        guilty_uq, NULL, 
> > NULL);
> > +               } else
> >                         r = userq_funcs->reset(queue);
> >                 if (r)
> >                         gpu_reset = true; @@ -671,6 +676,7 @@
> > amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
> >         }
> >
> >         queue->doorbell_index = index;
> > +       queue->doorbell_offset = (u32)args->in.doorbell_offset;
> >         r = uq_funcs->mqd_create(queue, &args->in);
> >         if (r) {
> >                 drm_file_err(uq_mgr->file, "Failed to create
> > Queue\n"); @@ -1111,16 +1117,31 @@ static void
> amdgpu_userq_restore_worker(struct work_struct *work)
> >         dma_fence_put(ev_fence);
> >  }
> >
> > -void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, u32
> > doorbell)
> > +void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev,
> > +                                   u32 pasid, u32 doorbell_offset)
> >  {
> >         struct xarray *xa = &adev->userq_doorbell_xa;
> >         struct amdgpu_usermode_queue *queue;
> > -       unsigned long flags;
> > +       unsigned long flags, idx;
> >
> > +       /*
> > +        * CP priv-fault payload is (pasid, src_data[0] & 0x3ff) — the same
> > +        * per-process doorbell encoding KFD extracts via
> > +        * KFD_CTXID0_DOORBELL_ID_MASK. Find the offending queue by the
> > +        * (vm->pasid, doorbell_offset) pair, mark it guilty and fire
> > +        * hang_detect immediately (queue_delayed_work() would no-op if the
> > +        * work is already armed at submit time).
> > +        */
> >         xa_lock_irqsave(xa, flags);
> > -       queue = xa_load(xa, doorbell);
> > -       if (queue)
> > -               amdgpu_userq_start_hang_detect_work(queue);
> > +       xa_for_each(xa, idx, queue) {
> > +               if (queue->vm && queue->vm->pasid == pasid &&
> > +                   queue->doorbell_offset == doorbell_offset) {
> > +                       WRITE_ONCE(queue->guilty, true);
> > +                       mod_delayed_work(adev->reset_domain->wq,
> > +                                        &queue->hang_detect_work, 0);
> > +                       break;
> > +               }
> > +       }
> >         xa_unlock_irqrestore(xa, flags);  }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> > index 5a2ae33135da..1e765cd765ab 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> > @@ -53,6 +53,7 @@ struct amdgpu_usermode_queue {
> >         enum amdgpu_userq_state state;
> >         uint64_t                doorbell_handle;
> >         uint64_t                doorbell_index;
> > +       u32                     doorbell_offset;
> >         uint64_t                flags;
> >         struct amdgpu_mqd_prop  *userq_prop;
> >         struct amdgpu_userq_mgr *userq_mgr; @@ -86,6 +87,7 @@ struct
> > amdgpu_usermode_queue {
> >          * Delayed work which runs when userq_fences time out.
> >          */
> >         struct delayed_work     hang_detect_work;
> > +       bool                    guilty;
> >         struct kref             refcount;
> >
> >         union {
> > @@ -175,7 +177,8 @@ void amdgpu_userq_pre_reset(struct amdgpu_device
> > *adev);  int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool
> > vram_lost);  void amdgpu_userq_start_hang_detect_work(struct
> > amdgpu_usermode_queue *queue);  void
> > amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32
> > doorbell); -void amdgpu_userq_process_reset_irq(struct amdgpu_device
> > *adev, u32 doorbell);
> > +void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev,
> > +                                   u32 pasid, u32 doorbell_offset);
> >
> >  int amdgpu_userq_input_va_validate(struct amdgpu_device *adev,
> >                                    struct amdgpu_usermode_queue
> > *queue, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > index ea323d241324..27d0a3dbfce8 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > @@ -6681,10 +6681,16 @@ static int
> > gfx_v11_0_set_priv_inst_fault_state(struct amdgpu_device *adev,  static void
> gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev,
> >                                         struct amdgpu_iv_entry *entry)
> > {
> > -       u32 doorbell_offset = entry->src_data[0];
> > +       /*
> > +        * CP packs the per-process doorbell_id in src_data[0][9:0]; upper
> > +        * bits hold other CTXID0 fields. Same mask KFD uses
> > +        * (KFD_CTXID0_DOORBELL_ID_MASK).
> > +        */
> > +       u32 doorbell_offset = entry->src_data[0] & 0x3ff;
> >
> >         if (adev->enable_mes && doorbell_offset) {
> > -               amdgpu_userq_process_reset_irq(adev, doorbell_offset);
> > +               amdgpu_userq_process_reset_irq(adev, entry->pasid,
> > +                                              doorbell_offset);
> >         } else if (!adev->gfx.disable_kq) {
> >                 u8 me_id = (entry->ring_id & 0x0c) >> 2;
> >                 u8 pipe_id = (entry->ring_id & 0x03) >> 0; diff --git
> > a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > index b3f1bbf3fc13..7c269cb75e07 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > @@ -5009,10 +5009,16 @@ static int
> > gfx_v12_0_set_priv_inst_fault_state(struct amdgpu_device *adev,  static void
> gfx_v12_0_handle_priv_fault(struct amdgpu_device *adev,
> >                                         struct amdgpu_iv_entry *entry)
> > {
> > -       u32 doorbell_offset = entry->src_data[0];
> > +       /*
> > +        * CP packs the per-process doorbell_id in src_data[0][9:0]; upper
> > +        * bits hold other CTXID0 fields. Same mask KFD uses
> > +        * (KFD_CTXID0_DOORBELL_ID_MASK).
> > +        */
> > +       u32 doorbell_offset = entry->src_data[0] & 0x3ff;
> >
> >         if (adev->enable_mes && doorbell_offset) {
> > -               amdgpu_userq_process_reset_irq(adev, doorbell_offset);
> > +               amdgpu_userq_process_reset_irq(adev, entry->pasid,
> > +                                              doorbell_offset);
> >         } else if (!adev->gfx.disable_kq) {
> >                 u8 me_id, pipe_id, queue_id;
> >                 struct amdgpu_ring *ring;
> > --
> > 2.49.0
> >

Reply via email to