On Fri, Jun 5, 2026 at 6:39 AM Jesse Zhang <[email protected]> wrote:
>
> CP priv-fault IRQ carries the offending queue's per-process
> doorbell_id in src_data[0][9:0] (same encoding KFD extracts via
> KFD_CTXID0_DOORBELL_ID_MASK), with pasid in entry->pasid. The
> existing lookup against BAR-absolute doorbell_index never matches
> this payload, so the targeted-reset path is never taken.
>
> Cache args->in.doorbell_offset on the queue and look it up by the
> (vm->pasid, doorbell_offset) pair. Add a queue->guilty flag set by
> the IRQ and consumed via xchg in hang_detect_work; on a hit, route
> compute reset through amdgpu_gfx_reset_mes_compute(..., queue, ...)
> and fire hang_detect immediately via mod_delayed_work(..., 0)
> (queue_delayed_work() is a no-op when the work is already armed at
> submit time).
I would reorder/squash these patches to add support for handling the IH token format for this interrupt and then implement the interrupt handling for the user queues. Alex > > Signed-off-by: Jesse Zhang <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 37 ++++++++++++++++++----- > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 5 ++- > drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 10 ++++-- > drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 10 ++++-- > 4 files changed, 49 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > index 99c5adf3cb24..231ffb29fe5e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > @@ -140,9 +140,14 @@ static void amdgpu_userq_hang_detect_work(struct > work_struct *work) > > AMDGPU_RESET_TYPE_PER_QUEUE)) { > int r; > > - if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) > - r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, > NULL, NULL, NULL); > - else > + if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) { > + struct amdgpu_usermode_queue *guilty_uq; > + > + /* IRQ-side WRITE_ONCE(guilty,true) hand-off via xchg > */ > + guilty_uq = xchg(&queue->guilty, false) ? 
queue : > NULL; > + r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, > + guilty_uq, NULL, > NULL); > + } else > r = userq_funcs->reset(queue); > if (r) > gpu_reset = true; > @@ -671,6 +676,7 @@ amdgpu_userq_create(struct drm_file *filp, union > drm_amdgpu_userq *args) > } > > queue->doorbell_index = index; > + queue->doorbell_offset = (u32)args->in.doorbell_offset; > r = uq_funcs->mqd_create(queue, &args->in); > if (r) { > drm_file_err(uq_mgr->file, "Failed to create Queue\n"); > @@ -1111,16 +1117,31 @@ static void amdgpu_userq_restore_worker(struct > work_struct *work) > dma_fence_put(ev_fence); > } > > -void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, u32 doorbell) > +void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, > + u32 pasid, u32 doorbell_offset) > { > struct xarray *xa = &adev->userq_doorbell_xa; > struct amdgpu_usermode_queue *queue; > - unsigned long flags; > + unsigned long flags, idx; > > + /* > + * CP priv-fault payload is (pasid, src_data[0] & 0x3ff) — the same > + * per-process doorbell encoding KFD extracts via > + * KFD_CTXID0_DOORBELL_ID_MASK. Find the offending queue by the > + * (vm->pasid, doorbell_offset) pair, mark it guilty and fire > + * hang_detect immediately (queue_delayed_work() would no-op if the > + * work is already armed at submit time). 
> + */ > xa_lock_irqsave(xa, flags); > - queue = xa_load(xa, doorbell); > - if (queue) > - amdgpu_userq_start_hang_detect_work(queue); > + xa_for_each(xa, idx, queue) { > + if (queue->vm && queue->vm->pasid == pasid && > + queue->doorbell_offset == doorbell_offset) { > + WRITE_ONCE(queue->guilty, true); > + mod_delayed_work(adev->reset_domain->wq, > + &queue->hang_detect_work, 0); > + break; > + } > + } > xa_unlock_irqrestore(xa, flags); > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > index 5a2ae33135da..1e765cd765ab 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > @@ -53,6 +53,7 @@ struct amdgpu_usermode_queue { > enum amdgpu_userq_state state; > uint64_t doorbell_handle; > uint64_t doorbell_index; > + u32 doorbell_offset; > uint64_t flags; > struct amdgpu_mqd_prop *userq_prop; > struct amdgpu_userq_mgr *userq_mgr; > @@ -86,6 +87,7 @@ struct amdgpu_usermode_queue { > * Delayed work which runs when userq_fences time out. 
> */ > struct delayed_work hang_detect_work; > + bool guilty; > struct kref refcount; > > union { > @@ -175,7 +177,8 @@ void amdgpu_userq_pre_reset(struct amdgpu_device *adev); > int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost); > void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue > *queue); > void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 > doorbell); > -void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, u32 > doorbell); > +void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, > + u32 pasid, u32 doorbell_offset); > > int amdgpu_userq_input_va_validate(struct amdgpu_device *adev, > struct amdgpu_usermode_queue *queue, > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > index ea323d241324..27d0a3dbfce8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > @@ -6681,10 +6681,16 @@ static int gfx_v11_0_set_priv_inst_fault_state(struct > amdgpu_device *adev, > static void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev, > struct amdgpu_iv_entry *entry) > { > - u32 doorbell_offset = entry->src_data[0]; > + /* > + * CP packs the per-process doorbell_id in src_data[0][9:0]; upper > + * bits hold other CTXID0 fields. Same mask KFD uses > + * (KFD_CTXID0_DOORBELL_ID_MASK). 
> + */ > + u32 doorbell_offset = entry->src_data[0] & 0x3ff; > > if (adev->enable_mes && doorbell_offset) { > - amdgpu_userq_process_reset_irq(adev, doorbell_offset); > + amdgpu_userq_process_reset_irq(adev, entry->pasid, > + doorbell_offset); > } else if (!adev->gfx.disable_kq) { > u8 me_id = (entry->ring_id & 0x0c) >> 2; > u8 pipe_id = (entry->ring_id & 0x03) >> 0; > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > index b3f1bbf3fc13..7c269cb75e07 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > @@ -5009,10 +5009,16 @@ static int gfx_v12_0_set_priv_inst_fault_state(struct > amdgpu_device *adev, > static void gfx_v12_0_handle_priv_fault(struct amdgpu_device *adev, > struct amdgpu_iv_entry *entry) > { > - u32 doorbell_offset = entry->src_data[0]; > + /* > + * CP packs the per-process doorbell_id in src_data[0][9:0]; upper > + * bits hold other CTXID0 fields. Same mask KFD uses > + * (KFD_CTXID0_DOORBELL_ID_MASK). > + */ > + u32 doorbell_offset = entry->src_data[0] & 0x3ff; > > if (adev->enable_mes && doorbell_offset) { > - amdgpu_userq_process_reset_irq(adev, doorbell_offset); > + amdgpu_userq_process_reset_irq(adev, entry->pasid, > + doorbell_offset); > } else if (!adev->gfx.disable_kq) { > u8 me_id, pipe_id, queue_id; > struct amdgpu_ring *ring; > -- > 2.49.0 >
