On Fri, Jun 5, 2026 at 6:39 AM Jesse Zhang <[email protected]> wrote:
>
> KCQ and user queues never share a HW slot, so a KQ ring_id hit is
> authoritative. Match KQ first; only call into the userq path when
> no KQ owns the faulting slot.
>
> Signed-off-by: Jesse Zhang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 23 +++++++++++++++++------
>  1 file changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 27d0a3dbfce8..1f159c9333a5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -6688,10 +6688,12 @@ static void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev,
>          */
>         u32 doorbell_offset = entry->src_data[0] & 0x3ff;
>
> -       if (adev->enable_mes && doorbell_offset) {
> -               amdgpu_userq_process_reset_irq(adev, entry->pasid,
> -                                              doorbell_offset);
> -       } else if (!adev->gfx.disable_kq) {
> +       /*
> +        * Try KQ first by ring_id (HW slot is authoritative). The MES <->
> +        * KMD compute_hqd_mask contract guarantees KCQ and user queues
> +        * never share a HW slot.
> +        */
> +       if (!adev->gfx.disable_kq) {
>                 u8 me_id = (entry->ring_id & 0x0c) >> 2;
>                 u8 pipe_id = (entry->ring_id & 0x03) >> 0;
>                 u8 queue_id = (entry->ring_id & 0x70) >> 4;
> @@ -6703,8 +6705,10 @@ static void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev,
>                         for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
>                                 ring = &adev->gfx.gfx_ring[i];
>                                 if (ring->me == me_id && ring->pipe == pipe_id &&
> -                                   ring->queue == queue_id)
> +                                   ring->queue == queue_id) {
>                                         drm_sched_fault(&ring->sched);
> +                                       return;
> +                               }
>                         }
>                         break;
>                 case 1:
> @@ -6712,8 +6716,10 @@ static void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev,
>                         for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>                                 ring = &adev->gfx.compute_ring[i];
>                                 if (ring->me == me_id && ring->pipe == pipe_id &&
> -                                   ring->queue == queue_id)
> +                                   ring->queue == queue_id) {
>                                         drm_sched_fault(&ring->sched);
> +                                       return;
> +                               }
>                         }
>                         break;
>                 default:
> @@ -6721,6 +6727,11 @@ static void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev,
>                         break;
>                 }
>         }
> +
> +       /* No KQ matched: HW slot is a MES-scheduled user queue. */
> +       if (adev->enable_mes && doorbell_offset)
> +               amdgpu_userq_process_reset_irq(adev, entry->pasid,
> +                                              doorbell_offset);

While you are at it, can you check if gfx_v11_0_eop_irq() and
gfx_v12_0_eop_irq() have similar issues?  Maybe those should be
reworked similarly.

Alex

>  }
>
>  static int gfx_v11_0_priv_reg_irq(struct amdgpu_device *adev,
> --
> 2.49.0
>

Reply via email to