On Fri, Jun 5, 2026 at 6:39 AM Jesse Zhang <[email protected]> wrote: > > KCQ and user queues never share a HW slot, so a KQ ring_id hit is > authoritative. Match KQ first; only call into the userq path when > no KQ owns the faulting slot. > > Signed-off-by: Jesse Zhang <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 23 +++++++++++++++++------ > 1 file changed, 17 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > index 27d0a3dbfce8..1f159c9333a5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > @@ -6688,10 +6688,12 @@ static void gfx_v11_0_handle_priv_fault(struct > amdgpu_device *adev, > */ > u32 doorbell_offset = entry->src_data[0] & 0x3ff; > > - if (adev->enable_mes && doorbell_offset) { > - amdgpu_userq_process_reset_irq(adev, entry->pasid, > - doorbell_offset); > - } else if (!adev->gfx.disable_kq) { > + /* > + * Try KQ first by ring_id (HW slot is authoritative). The MES <-> > + * KMD compute_hqd_mask contract guarantees KCQ and user queues > + * never share a HW slot. 
> + */ > + if (!adev->gfx.disable_kq) { > u8 me_id = (entry->ring_id & 0x0c) >> 2; > u8 pipe_id = (entry->ring_id & 0x03) >> 0; > u8 queue_id = (entry->ring_id & 0x70) >> 4; > @@ -6703,8 +6705,10 @@ static void gfx_v11_0_handle_priv_fault(struct > amdgpu_device *adev, > for (i = 0; i < adev->gfx.num_gfx_rings; i++) { > ring = &adev->gfx.gfx_ring[i]; > if (ring->me == me_id && ring->pipe == > pipe_id && > - ring->queue == queue_id) > + ring->queue == queue_id) { > drm_sched_fault(&ring->sched); > + return; > + } > } > break; > case 1: > @@ -6712,8 +6716,10 @@ static void gfx_v11_0_handle_priv_fault(struct > amdgpu_device *adev, > for (i = 0; i < adev->gfx.num_compute_rings; i++) { > ring = &adev->gfx.compute_ring[i]; > if (ring->me == me_id && ring->pipe == > pipe_id && > - ring->queue == queue_id) > + ring->queue == queue_id) { > drm_sched_fault(&ring->sched); > + return; > + } > } > break; > default: > @@ -6721,6 +6727,11 @@ static void gfx_v11_0_handle_priv_fault(struct > amdgpu_device *adev, > break; > } > } > + > + /* No KQ matched: HW slot is a MES-scheduled user queue. */ > + if (adev->enable_mes && doorbell_offset) > + amdgpu_userq_process_reset_irq(adev, entry->pasid, > + doorbell_offset);
While you are at it, can you check whether gfx_v11_0_eop_irq() and gfx_v12_0_eop_irq() have similar issues? Maybe those should be reworked similarly.

Alex

> }
>
> static int gfx_v11_0_priv_reg_irq(struct amdgpu_device *adev,
> --
> 2.49.0
>
