The CP priv-fault IRQ carries the offending queue's per-process doorbell_id in src_data[0][9:0] (the same encoding KFD extracts via KFD_CTXID0_DOORBELL_ID_MASK), with the pasid in entry->pasid. The existing lookup against the BAR-absolute doorbell_index never matches this payload, so the targeted-reset path is never taken.
Cache args->in.doorbell_offset on the queue and look it up by the (vm->pasid, doorbell_offset) pair. Add a queue->guilty flag set by the IRQ and consumed via xchg in hang_detect_work; on a hit, route compute reset through amdgpu_gfx_reset_mes_compute(..., queue, ...) and fire hang_detect immediately via mod_delayed_work(..., 0) (queue_delayed_work() is a no-op when the work is already armed at submit time). Signed-off-by: Jesse Zhang <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 37 ++++++++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 5 ++- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 10 ++++-- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 10 ++++-- 4 files changed, 49 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 99c5adf3cb24..231ffb29fe5e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -140,9 +140,14 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work) AMDGPU_RESET_TYPE_PER_QUEUE)) { int r; - if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) - r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, NULL, NULL, NULL); - else + if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) { + struct amdgpu_usermode_queue *guilty_uq; + + /* IRQ-side WRITE_ONCE(guilty,true) hand-off via xchg */ + guilty_uq = xchg(&queue->guilty, false) ? 
queue : NULL; + r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, + guilty_uq, NULL, NULL); + } else r = userq_funcs->reset(queue); if (r) gpu_reset = true; @@ -671,6 +676,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) } queue->doorbell_index = index; + queue->doorbell_offset = (u32)args->in.doorbell_offset; r = uq_funcs->mqd_create(queue, &args->in); if (r) { drm_file_err(uq_mgr->file, "Failed to create Queue\n"); @@ -1111,16 +1117,31 @@ static void amdgpu_userq_restore_worker(struct work_struct *work) dma_fence_put(ev_fence); } -void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, u32 doorbell) +void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, + u32 pasid, u32 doorbell_offset) { struct xarray *xa = &adev->userq_doorbell_xa; struct amdgpu_usermode_queue *queue; - unsigned long flags; + unsigned long flags, idx; + /* + * CP priv-fault payload is (pasid, src_data[0] & 0x3ff) — the same + * per-process doorbell encoding KFD extracts via + * KFD_CTXID0_DOORBELL_ID_MASK. Find the offending queue by the + * (vm->pasid, doorbell_offset) pair, mark it guilty and fire + * hang_detect immediately (queue_delayed_work() would no-op if the + * work is already armed at submit time). 
+ */ xa_lock_irqsave(xa, flags); - queue = xa_load(xa, doorbell); - if (queue) - amdgpu_userq_start_hang_detect_work(queue); + xa_for_each(xa, idx, queue) { + if (queue->vm && queue->vm->pasid == pasid && + queue->doorbell_offset == doorbell_offset) { + WRITE_ONCE(queue->guilty, true); + mod_delayed_work(adev->reset_domain->wq, + &queue->hang_detect_work, 0); + break; + } + } xa_unlock_irqrestore(xa, flags); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index 5a2ae33135da..1e765cd765ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -53,6 +53,7 @@ struct amdgpu_usermode_queue { enum amdgpu_userq_state state; uint64_t doorbell_handle; uint64_t doorbell_index; + u32 doorbell_offset; uint64_t flags; struct amdgpu_mqd_prop *userq_prop; struct amdgpu_userq_mgr *userq_mgr; @@ -86,6 +87,7 @@ struct amdgpu_usermode_queue { * Delayed work which runs when userq_fences time out. */ struct delayed_work hang_detect_work; + bool guilty; struct kref refcount; union { @@ -175,7 +177,8 @@ void amdgpu_userq_pre_reset(struct amdgpu_device *adev); int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost); void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue); void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell); -void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, u32 doorbell); +void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, + u32 pasid, u32 doorbell_offset); int amdgpu_userq_input_va_validate(struct amdgpu_device *adev, struct amdgpu_usermode_queue *queue, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index ea323d241324..27d0a3dbfce8 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -6681,10 +6681,16 @@ static int gfx_v11_0_set_priv_inst_fault_state(struct amdgpu_device *adev, static 
void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { - u32 doorbell_offset = entry->src_data[0]; + /* + * CP packs the per-process doorbell_id in src_data[0][9:0]; upper + * bits hold other CTXID0 fields. Same mask KFD uses + * (KFD_CTXID0_DOORBELL_ID_MASK). + */ + u32 doorbell_offset = entry->src_data[0] & 0x3ff; if (adev->enable_mes && doorbell_offset) { - amdgpu_userq_process_reset_irq(adev, doorbell_offset); + amdgpu_userq_process_reset_irq(adev, entry->pasid, + doorbell_offset); } else if (!adev->gfx.disable_kq) { u8 me_id = (entry->ring_id & 0x0c) >> 2; u8 pipe_id = (entry->ring_id & 0x03) >> 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index b3f1bbf3fc13..7c269cb75e07 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -5009,10 +5009,16 @@ static int gfx_v12_0_set_priv_inst_fault_state(struct amdgpu_device *adev, static void gfx_v12_0_handle_priv_fault(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { - u32 doorbell_offset = entry->src_data[0]; + /* + * CP packs the per-process doorbell_id in src_data[0][9:0]; upper + * bits hold other CTXID0 fields. Same mask KFD uses + * (KFD_CTXID0_DOORBELL_ID_MASK). + */ + u32 doorbell_offset = entry->src_data[0] & 0x3ff; if (adev->enable_mes && doorbell_offset) { - amdgpu_userq_process_reset_irq(adev, doorbell_offset); + amdgpu_userq_process_reset_irq(adev, entry->pasid, + doorbell_offset); } else if (!adev->gfx.disable_kq) { u8 me_id, pipe_id, queue_id; struct amdgpu_ring *ring; -- 2.49.0
