AMD General > -----Original Message----- > From: Alex Deucher <[email protected]> > Sent: Saturday, June 6, 2026 4:14 AM > To: Zhang, Jesse(Jie) <[email protected]> > Cc: [email protected]; Deucher, Alexander > <[email protected]>; Koenig, Christian <[email protected]> > Subject: Re: [PATCH 4/6] drm/amdgpu/userq: identify guilty user queue from > priv- > fault IRQ > > On Fri, Jun 5, 2026 at 6:39 AM Jesse Zhang <[email protected]> wrote: > > > > CP priv-fault IRQ carries the offending queue's per-process > > doorbell_id in src_data[0][9:0] (same encoding KFD extracts via > > KFD_CTXID0_DOORBELL_ID_MASK), with pasid in entry->pasid. The existing > > lookup against BAR-absolute doorbell_index never matches this payload, > > so the targeted-reset path is never taken. > > > > Cache args->in.doorbell_offset on the queue and look it up by the > > (vm->pasid, doorbell_offset) pair. Add a queue->guilty flag set by the > > IRQ and consumed via xchg in hang_detect_work; on a hit, route compute > > reset through amdgpu_gfx_reset_mes_compute(..., queue, ...) and fire > > hang_detect immediately via mod_delayed_work(..., 0) > > (queue_delayed_work() is a no-op when the work is already armed at > > submit time). > > I would reorder/squash these patches to add support for handling the IH token > format for this interrupt and then implement the interrupt handling for the > user > queues.
I will do that in v2. Thanks, Jesse > > Alex > > > > Signed-off-by: Jesse Zhang <[email protected]> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 37 > > ++++++++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 5 > ++- > > drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 10 ++++-- > > drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 10 ++++-- > > 4 files changed, 49 insertions(+), 13 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > > index 99c5adf3cb24..231ffb29fe5e 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > > @@ -140,9 +140,14 @@ static void amdgpu_userq_hang_detect_work(struct > work_struct *work) > > > > AMDGPU_RESET_TYPE_PER_QUEUE)) { > > int r; > > > > - if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) > > - r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, > > NULL, > NULL, NULL); > > - else > > + if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) { > > + struct amdgpu_usermode_queue *guilty_uq; > > + > > + /* IRQ-side WRITE_ONCE(guilty,true) hand-off via > > xchg */ > > + guilty_uq = xchg(&queue->guilty, false) ? 
queue : > > NULL; > > + r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, > > + guilty_uq, NULL, > > NULL); > > + } else > > r = userq_funcs->reset(queue); > > if (r) > > gpu_reset = true; @@ -671,6 +676,7 @@ > > amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) > > } > > > > queue->doorbell_index = index; > > + queue->doorbell_offset = (u32)args->in.doorbell_offset; > > r = uq_funcs->mqd_create(queue, &args->in); > > if (r) { > > drm_file_err(uq_mgr->file, "Failed to create > > Queue\n"); @@ -1111,16 +1117,31 @@ static void > amdgpu_userq_restore_worker(struct work_struct *work) > > dma_fence_put(ev_fence); > > } > > > > -void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, u32 > > doorbell) > > +void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, > > + u32 pasid, u32 doorbell_offset) > > { > > struct xarray *xa = &adev->userq_doorbell_xa; > > struct amdgpu_usermode_queue *queue; > > - unsigned long flags; > > + unsigned long flags, idx; > > > > + /* > > + * CP priv-fault payload is (pasid, src_data[0] & 0x3ff) — the same > > + * per-process doorbell encoding KFD extracts via > > + * KFD_CTXID0_DOORBELL_ID_MASK. Find the offending queue by the > > + * (vm->pasid, doorbell_offset) pair, mark it guilty and fire > > + * hang_detect immediately (queue_delayed_work() would no-op if the > > + * work is already armed at submit time). 
> > + */ > > xa_lock_irqsave(xa, flags); > > - queue = xa_load(xa, doorbell); > > - if (queue) > > - amdgpu_userq_start_hang_detect_work(queue); > > + xa_for_each(xa, idx, queue) { > > + if (queue->vm && queue->vm->pasid == pasid && > > + queue->doorbell_offset == doorbell_offset) { > > + WRITE_ONCE(queue->guilty, true); > > + mod_delayed_work(adev->reset_domain->wq, > > + &queue->hang_detect_work, 0); > > + break; > > + } > > + } > > xa_unlock_irqrestore(xa, flags); } > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > > index 5a2ae33135da..1e765cd765ab 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > > @@ -53,6 +53,7 @@ struct amdgpu_usermode_queue { > > enum amdgpu_userq_state state; > > uint64_t doorbell_handle; > > uint64_t doorbell_index; > > + u32 doorbell_offset; > > uint64_t flags; > > struct amdgpu_mqd_prop *userq_prop; > > struct amdgpu_userq_mgr *userq_mgr; @@ -86,6 +87,7 @@ struct > > amdgpu_usermode_queue { > > * Delayed work which runs when userq_fences time out. 
> > */ > > struct delayed_work hang_detect_work; > > + bool guilty; > > struct kref refcount; > > > > union { > > @@ -175,7 +177,8 @@ void amdgpu_userq_pre_reset(struct amdgpu_device > > *adev); int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool > > vram_lost); void amdgpu_userq_start_hang_detect_work(struct > > amdgpu_usermode_queue *queue); void > > amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 > > doorbell); -void amdgpu_userq_process_reset_irq(struct amdgpu_device > > *adev, u32 doorbell); > > +void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, > > + u32 pasid, u32 doorbell_offset); > > > > int amdgpu_userq_input_va_validate(struct amdgpu_device *adev, > > struct amdgpu_usermode_queue > > *queue, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > > index ea323d241324..27d0a3dbfce8 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > > @@ -6681,10 +6681,16 @@ static int > > gfx_v11_0_set_priv_inst_fault_state(struct amdgpu_device *adev, static void > gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev, > > struct amdgpu_iv_entry *entry) > > { > > - u32 doorbell_offset = entry->src_data[0]; > > + /* > > + * CP packs the per-process doorbell_id in src_data[0][9:0]; upper > > + * bits hold other CTXID0 fields. Same mask KFD uses > > + * (KFD_CTXID0_DOORBELL_ID_MASK). 
> > + */ > > + u32 doorbell_offset = entry->src_data[0] & 0x3ff; > > > > if (adev->enable_mes && doorbell_offset) { > > - amdgpu_userq_process_reset_irq(adev, doorbell_offset); > > + amdgpu_userq_process_reset_irq(adev, entry->pasid, > > + doorbell_offset); > > } else if (!adev->gfx.disable_kq) { > > u8 me_id = (entry->ring_id & 0x0c) >> 2; > > u8 pipe_id = (entry->ring_id & 0x03) >> 0; diff --git > > a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > > b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > > index b3f1bbf3fc13..7c269cb75e07 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > > @@ -5009,10 +5009,16 @@ static int > > gfx_v12_0_set_priv_inst_fault_state(struct amdgpu_device *adev, static void > gfx_v12_0_handle_priv_fault(struct amdgpu_device *adev, > > struct amdgpu_iv_entry *entry) > > { > > - u32 doorbell_offset = entry->src_data[0]; > > + /* > > + * CP packs the per-process doorbell_id in src_data[0][9:0]; upper > > + * bits hold other CTXID0 fields. Same mask KFD uses > > + * (KFD_CTXID0_DOORBELL_ID_MASK). > > + */ > > + u32 doorbell_offset = entry->src_data[0] & 0x3ff; > > > > if (adev->enable_mes && doorbell_offset) { > > - amdgpu_userq_process_reset_irq(adev, doorbell_offset); > > + amdgpu_userq_process_reset_irq(adev, entry->pasid, > > + doorbell_offset); > > } else if (!adev->gfx.disable_kq) { > > u8 me_id, pipe_id, queue_id; > > struct amdgpu_ring *ring; > > -- > > 2.49.0 > >
