On Fri, Mar 20, 2026 at 4:02 PM Amber Lin <[email protected]> wrote:
>
> Identify hung queues by comparing doorbells shown in hqd_info from MES
> with doorbells stored in the driver to find matching queues.
>
> Suggested-by: Jonathan Kim <[email protected]>
> Signed-off-by: Amber Lin <[email protected]>

Reviewed-by: Alex Deucher <[email protected]>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 38 ++++++++++++++++---------
>  1 file changed, 25 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index b68bf4a9cb40..bea509f6b3ff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -465,23 +465,35 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>
>         r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
>                                                           &input);
> -       if (r) {
> -               dev_err(adev->dev, "failed to detect and reset\n");
> -       } else {
> -               *hung_db_num = 0;
> -               for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
> -                       if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
> -                               hung_db_array[i] = db_array[i];
> -                               *hung_db_num += 1;
> -                       }
> +
> +       if (r && detect_only) {
> +               dev_err(adev->dev, "Failed to detect hung queues\n");
> +               return r;
> +       }
> +
> +       *hung_db_num = 0;
> +       /* MES passes hung queues' doorbell to driver */
> +       for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
> +               /* Finding hung queues where db_array[i] is a valid doorbell */
> +               if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
> +                       hung_db_array[i] = db_array[i];
> +                       *hung_db_num += 1;
>                 }
> +       }
>
> -               /*
> -                * TODO: return HQD info for MES scheduled user compute queue reset cases
> -                * stored in hung_db_array hqd info offset to full array size
> -                */
> +       if (r && !hung_db_num) {
> +               dev_err(adev->dev, "Failed to detect and reset hung queues\n");
> +               return r;
>         }
>
> +       /*
> +        * TODO: return HQD info for MES scheduled user compute queue reset cases
> +        * stored in hung_db_array hqd info offset to full array size
> +        */
> +
> +       if (r)
> +               dev_err(adev->dev, "failed to reset\n");
> +
>         return r;
>  }
>
> --
> 2.43.0
>

Reply via email to