On Fri, Mar 27, 2026 at 4:33 PM Amber Lin <[email protected]> wrote:
>
> Create hung_queue_hqd_info structure and fill in hung queues' information
> passed by MES, including queue type, pipe id, and queue id.
>
> Suggested-by: Jonathan Kim <[email protected]>
> Signed-off-by: Amber Lin <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 17 +++++++++--------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 13 +++++++++++++
>  2 files changed, 22 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index f1f8bbfc31e0..436a46ba1dfa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -447,7 +447,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>  {
>         struct mes_detect_and_reset_queue_input input;
>         u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr[xcc_id];
> -       int r, i;
> +       int hqd_info_offset = adev->mes.hung_queue_hqd_info_offset, r, i;
>
>         if (!hung_db_num || !hung_db_array)
>                 return -EINVAL;
> @@ -471,6 +471,12 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>                 return r;
>         }
>
> +       if (r && (queue_type != AMDGPU_RING_TYPE_COMPUTE)) {
> +               dev_err(adev->dev, "MES resetting queue type %d is not supported\n",
> +                               queue_type);
> +               return r;
> +       }

I think the message here is a bit confusing.  The MES can reset other
queue types; this is just the fallback case for when the MES queue
reset has failed.  Also, does the MES populate the doorbell array for
all queue types regardless of whether the reset was successful?  If
so, shouldn't we bail out for non-compute queues after the doorbells
are populated?

Alex

> +
>         *hung_db_num = 0;
>         /* MES passes hung queues' doorbell to driver */
>         for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
> @@ -486,13 +492,8 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
>                 return r;
>         }
>
> -       /*
> -        * TODO: return HQD info for MES scheduled user compute queue reset cases
> -        * stored in hung_db_array hqd info offset to full array size
> -        */
> -
> -       if (r)
> -               dev_err(adev->dev, "failed to reset\n");
> +       for (i = hqd_info_offset; i < hqd_info_offset + *hung_db_num; i++)
> +               hung_db_array[i] = db_array[i];
>
>         return r;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index f80e3aca9c78..2e6ae9f84db0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -170,6 +170,19 @@ struct amdgpu_mes {
>         uint64_t            shared_cmd_buf_gpu_addr[AMDGPU_MAX_MES_INST_PIPES];
>  };
>
> +struct amdgpu_mes_hung_queue_hqd_info {
> +       union {
> +               struct {
> +                       uint32_t queue_type: 3; // queue type
> +                       uint32_t pipe_index: 4; // pipe index
> +                       uint32_t queue_index: 8; // queue index
> +                       uint32_t reserved: 17;
> +               };
> +
> +               uint32_t bit0_31;
> +       };
> +};
> +
>  struct amdgpu_mes_gang {
>         int                             gang_id;
>         int                             priority;
> --
> 2.43.0
>

Reply via email to