On Tue, Mar 24, 2026 at 1:57 PM Amber Lin <[email protected]> wrote:
>
> When allocating the hung queue memory, we need to take the number of
> queues into account for the worst-case hang.
>
> Suggested-by: Jonathan Kim <[email protected]>
> Signed-off-by: Amber Lin <[email protected]>

Reviewed-by: Alex Deucher <[email protected]>


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 33 +++++++++++++++++++------
>  1 file changed, 26 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 0d4c77c1b4b5..e639d6c329e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -103,7 +103,7 @@ static inline u32 amdgpu_mes_get_hqd_mask(u32 num_pipe,
>
>  int amdgpu_mes_init(struct amdgpu_device *adev)
>  {
> -       int i, r, num_pipes;
> +       int i, r, num_pipes, num_queues = 0;
>         u32 total_vmid_mask, reserved_vmid_mask;
>         int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
>         u32 gfx_hqd_mask = 
> amdgpu_mes_get_hqd_mask(adev->gfx.me.num_pipe_per_me,
> @@ -159,7 +159,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>                 adev->mes.compute_hqd_mask[i] = compute_hqd_mask;
>         }
>
> -       num_pipes = adev->sdma.num_instances;
> +       num_pipes = adev->sdma.num_inst_per_xcc;
>         if (num_pipes > AMDGPU_MES_MAX_SDMA_PIPES)
>                 dev_warn(adev->dev, "more SDMA pipes than supported by MES! 
> (%d vs %d)\n",
>                          num_pipes, AMDGPU_MES_MAX_SDMA_PIPES);
> @@ -216,8 +216,27 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
>         if (r)
>                 goto error_doorbell;
>
> +       if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 1, 0)) {
> +               /* When queue/pipe reset is done in MES instead of in the
> +                * driver, MES passes hung queues information to the driver in
> +                * hung_queue_hqd_info. Calculate required space to store this
> +                * information.
> +                */
> +               for (i = 0; i < AMDGPU_MES_MAX_GFX_PIPES; i++)
> +                       num_queues += hweight32(adev->mes.gfx_hqd_mask[i]);
> +
> +               for (i = 0; i < AMDGPU_MES_MAX_COMPUTE_PIPES; i++)
> +                       num_queues += 
> hweight32(adev->mes.compute_hqd_mask[i]);
> +
> +               for (i = 0; i < AMDGPU_MES_MAX_SDMA_PIPES; i++)
> +                       num_queues += hweight32(adev->mes.sdma_hqd_mask[i]) * 
> num_xcc;
> +
> +               adev->mes.hung_queue_hqd_info_offset = num_queues;
> +               adev->mes.hung_queue_db_array_size = num_queues * 2;
> +       }
> +
>         if (adev->mes.hung_queue_db_array_size) {
> -               for (i = 0; i < AMDGPU_MAX_MES_PIPES * num_xcc; i++) {
> +               for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) {
>                         r = amdgpu_bo_create_kernel(adev,
>                                                     
> adev->mes.hung_queue_db_array_size * sizeof(u32),
>                                                     PAGE_SIZE,
> @@ -264,10 +283,10 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
>                               &adev->mes.event_log_cpu_addr);
>
>         for (i = 0; i < AMDGPU_MAX_MES_PIPES * num_xcc; i++) {
> -               
> amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj[i],
> -                                     
> &adev->mes.hung_queue_db_array_gpu_addr[i],
> -                                     
> &adev->mes.hung_queue_db_array_cpu_addr[i]);
> -
> +               if (adev->mes.hung_queue_db_array_gpu_obj[i])
> +                        
> amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj[i],
> +                                        
> &adev->mes.hung_queue_db_array_gpu_addr[i],
> +                                        
> &adev->mes.hung_queue_db_array_cpu_addr[i]);
>                 if (adev->mes.sch_ctx_ptr[i])
>                         amdgpu_device_wb_free(adev, 
> adev->mes.sch_ctx_offs[i]);
>                 if (adev->mes.query_status_fence_ptr[i])
> --
> 2.43.0
>

Reply via email to