ras: Reduce stack usage in amdgpu_virt_ras_get_cper_records()

Zhou1, Tao Mon, 08 Dec 2025 00:03:20 -0800

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Tao Zhou <[email protected]>


> -----Original Message-----
> From: SHANMUGAM, SRINIVASAN <[email protected]>
> Sent: Friday, December 5, 2025 8:15 PM
> To: Koenig, Christian <[email protected]>; Deucher, Alexander
> <[email protected]>
> Cc: [email protected]; SHANMUGAM, SRINIVASAN
> <[email protected]>; Zhou1, Tao
> <[email protected]>; Zhang, Hawking <[email protected]>
> Subject: [PATCH] drm/amd/ras: Reduce stack usage in
> amdgpu_virt_ras_get_cper_records()
>
> amdgpu_virt_ras_get_cper_records() was using a large stack array of
> ras_log_info pointers. This contributed to the frame size warning on this
> function.
>
> Replace the fixed-size stack array:
>
>     struct ras_log_info *trace[MAX_RECORD_PER_BATCH];
>
> with a heap-allocated array using kcalloc().
>
> We free the trace buffer together with out_buf on all exit paths.
> If allocation of trace or out_buf fails, we return a generic RAS error code.
>
> This reduces stack usage and keeps the runtime behaviour unchanged.
>
> Fixes the following frame size warning:
> stack frame size: 1112 bytes (limit: 1024)
>
> Cc: Tao Zhou <[email protected]>
> Cc: Hawking Zhang <[email protected]>
> Cc: Christian König <[email protected]>
> Cc: Alex Deucher <[email protected]>
> Signed-off-by: Srinivasan Shanmugam <[email protected]>
> ---
>  .../drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c   | 17 +++++++++++++----
>  1 file changed, 13 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
> b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
> index 5e90a187155b..a75479593864 100644
> --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
> +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
> @@ -183,7 +183,7 @@ static int amdgpu_virt_ras_get_cper_records(struct
> ras_core_context *ras_core,
>               (struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw;
>       struct ras_log_batch_overview *overview = &virt_ras->batch_mgr.batch_overview;
>       struct ras_cmd_batch_trace_record_rsp *rsp_cache = &virt_ras->batch_mgr.batch_trace;
> -     struct ras_log_info *trace[MAX_RECORD_PER_BATCH] = {0};
> +     struct ras_log_info **trace;
>       uint32_t offset = 0, real_data_len = 0;
>       uint64_t batch_id;
>       uint8_t *out_buf;
> @@ -195,9 +195,15 @@ static int amdgpu_virt_ras_get_cper_records(struct
> ras_core_context *ras_core,
>       if (!req->buf_size || !req->buf_ptr || !req->cper_num)
>               return RAS_CMD__ERROR_INVALID_INPUT_DATA;
>
> +     trace = kcalloc(MAX_RECORD_PER_BATCH, sizeof(*trace),
> GFP_KERNEL);
> +     if (!trace)
> +             return RAS_CMD__ERROR_GENERIC;
> +
>       out_buf = kzalloc(req->buf_size, GFP_KERNEL);
> -     if (!out_buf)
> +     if (!out_buf) {
> +             kfree(trace);
>               return RAS_CMD__ERROR_GENERIC;
> +     }
>
>       memset(out_buf, 0, req->buf_size);
>
> @@ -205,8 +211,9 @@ static int amdgpu_virt_ras_get_cper_records(struct
> ras_core_context *ras_core,
>               batch_id = req->cper_start_id + i;
>               if (batch_id >= overview->last_batch_id)
>                       break;
> -             count = amdgpu_virt_ras_get_batch_records(ras_core,
> batch_id, trace,
> -                                     ARRAY_SIZE(trace), rsp_cache);
> +             count = amdgpu_virt_ras_get_batch_records(ras_core,
> batch_id,
> +                                                       trace,
> MAX_RECORD_PER_BATCH,
> +                                                       rsp_cache);
>               if (count > 0) {
>                       ret = ras_cper_generate_cper(ras_core, trace, count,
>                                       &out_buf[offset], req->buf_size -
> offset, &real_data_len);
> @@ -220,6 +227,7 @@ static int
> amdgpu_virt_ras_get_cper_records(struct ras_core_context *ras_core,
>       if ((ret && (ret != -ENOMEM)) ||
>           copy_to_user(u64_to_user_ptr(req->buf_ptr), out_buf, offset)) {
>               kfree(out_buf);
> +             kfree(trace);
>               return RAS_CMD__ERROR_GENERIC;
>       }
>
> @@ -231,6 +239,7 @@ static int amdgpu_virt_ras_get_cper_records(struct
> ras_core_context *ras_core,
>       cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp);
>
>       kfree(out_buf);
> +     kfree(trace);
>
>       return RAS_CMD__SUCCESS;
>  }
> --
> 2.34.1

RE: [PATCH] drm/amd/ras: Reduce stack usage in amdgpu_virt_ras_get_cper_records()

Reply via email to