AMD General

The current patch is to fix a potential invalid pointer access issue.  The 
issues of simplification and naming consistency can be addressed in subsequent 
patches.


Best Regards,
Thomas
-----Original Message-----
From: Zhou1, Tao <[email protected]>
Sent: Tuesday, May 19, 2026 11:11 AM
To: Chai, Thomas <[email protected]>; [email protected]
Cc: Zhang, Hawking <[email protected]>; Yang, Stanley <[email protected]>
Subject: RE: [PATCH 6/7] drm/amd/ras: copy ras log data instead of referencing 
pointers

AMD General

[Tao] The buffer has three different names in this patch: trace, trace_arr, 
trace_arry, can we simplify it?

> -----Original Message-----
> From: Chai, Thomas <[email protected]>
> Sent: Monday, May 18, 2026 3:22 PM
> To: [email protected]
> Cc: Chai, Thomas <[email protected]>; Zhang, Hawking
> <[email protected]>; Zhou1, Tao <[email protected]>; Yang, Stanley
> <[email protected]>; Chai, Thomas <[email protected]>
> Subject: [PATCH 6/7] drm/amd/ras: copy ras log data instead of
> referencing pointers
>
> When generating ras cper file, the original data nodes in the ras log
> ring buffer may be deleted, leading to invalid pointer access. Copy
> the data from the ras log ring instead of directly referencing the pointers 
> to avoid this issue.
>
> Signed-off-by: YiPeng Chai <[email protected]>
> ---
>  .../drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c | 12 +++---
>  drivers/gpu/drm/amd/ras/rascore/ras_cmd.c     | 42 +++++++++++++------
>  drivers/gpu/drm/amd/ras/rascore/ras_cper.c    | 20 ++++-----
>  drivers/gpu/drm/amd/ras/rascore/ras_cper.h    |  2 +-
>  .../gpu/drm/amd/ras/rascore/ras_log_ring.c    | 23 +++++-----
>  .../gpu/drm/amd/ras/rascore/ras_log_ring.h    |  2 +-
>  6 files changed, 58 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
> b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
> index b8e9442b2ca5..537f709d8570 100644
> --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
> +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
> @@ -219,7 +219,7 @@ static bool
> amdgpu_virt_ras_check_batch_cached(struct ras_cmd_batch_trace_record
> }
>
>  static int amdgpu_virt_ras_get_batch_records(struct ras_core_context
> *ras_core, uint64_t batch_id,
> -                     struct ras_log_info **trace_arr, uint32_t arr_num,
> +                     struct ras_log_info *trace_arr, uint32_t
> + arr_num,
>                       struct ras_cmd_batch_trace_record_rsp *rsp_cache)  {
>       struct ras_cmd_batch_trace_record_req req = { @@ -255,7 +255,8
> @@ static int amdgpu_virt_ras_get_batch_records(struct
> ras_core_context *ras_core,
>       }
>
>       for (i = 0; i < batch->trace_num && i < arr_num; i++)
> -             trace_arr[i] = &rsp->records[batch->offset + i];
> +             memcpy(&trace_arr[i],
> +                     &rsp->records[batch->offset + i],
> + sizeof(*trace_arr));
>
>       return i;
>  }
> @@ -272,7 +273,8 @@ static int amdgpu_virt_ras_get_cper_records(struct
> ras_core_context *ras_core,
>               (struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw;
>       struct ras_log_batch_overview *overview = &virt_ras-
> >batch_mgr.batch_overview;
>       struct ras_cmd_batch_trace_record_rsp *rsp_cache = &virt_ras-
> >batch_mgr.batch_trace;
> -     struct ras_log_info **trace;
> +     struct ras_log_info *trace;
> +     uint32_t trace_count = MAX_RECORD_PER_BATCH;
>       uint32_t offset = 0, real_data_len = 0;
>       uint64_t batch_id;
>       uint8_t *out_buf;
> @@ -289,7 +291,7 @@ static int amdgpu_virt_ras_get_cper_records(struct
> ras_core_context *ras_core,
>           req->cper_num > RAS_CMD_MAX_CPER_FETCH_NUM)
>               return RAS_CMD__ERROR_INVALID_INPUT_DATA;
>
> -     trace = kcalloc(MAX_RECORD_PER_BATCH, sizeof(*trace),
> GFP_KERNEL);
> +     trace = kcalloc(trace_count, sizeof(*trace), GFP_KERNEL);
>       if (!trace)
>               return RAS_CMD__ERROR_GENERIC;
>
> @@ -306,7 +308,7 @@ static int amdgpu_virt_ras_get_cper_records(struct
> ras_core_context *ras_core,
>               if (batch_id >= overview->last_batch_id)
>                       break;
>               count = amdgpu_virt_ras_get_batch_records(ras_core,
> batch_id,
> -                                                       trace,
> MAX_RECORD_PER_BATCH,
> +                                                       trace,
> + trace_count,
>                                                         rsp_cache);
>               if (count > 0) {
>                       ret = ras_cper_generate_cper(ras_core, trace,
> count, diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
> b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
> index 5b7a36596b02..088b9b153f7f 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
> @@ -202,11 +202,12 @@ static int ras_cmd_get_cper_records(struct
> ras_core_context *ras_core,
>                       (struct ras_cmd_cper_record_req *)cmd-
> >input_buff_raw;
>       struct ras_cmd_cper_record_rsp *rsp =
>                       (struct ras_cmd_cper_record_rsp *)cmd-
> >output_buff_raw;
> -     struct ras_log_info *trace[MAX_RECORD_PER_BATCH] = {0};
> +     struct ras_log_info *trace = NULL;
> +     uint32_t trace_count = MAX_RECORD_PER_BATCH;
>       struct ras_log_batch_overview overview;
>       uint32_t offset = 0, real_data_len = 0;
>       uint64_t batch_id;
> -     uint8_t *buffer;
> +     uint8_t *buffer = NULL;
>       int ret = 0, i, count;
>
>       if ((cmd->input_size != sizeof(struct ras_cmd_cper_record_req))
> || @@ -224,6 +225,12 @@ static int ras_cmd_get_cper_records(struct
> ras_core_context *ras_core,
>       if (!buffer)
>               return RAS_CMD__ERROR_GENERIC;
>
> +     trace = kcalloc(trace_count, sizeof(*trace), GFP_KERNEL);
> +     if (!trace) {
> +             ret = RAS_CMD__ERROR_GENERIC;
> +             goto out;
> +     }
> +
>       ras_log_ring_get_batch_overview(ras_core, &overview);
>       for (i = 0; i < req->cper_num; i++) {
>               batch_id = req->cper_start_id + i; @@ -231,7 +238,7 @@
> static int ras_cmd_get_cper_records(struct ras_core_context *ras_core,
>                       break;
>
>               count = ras_log_ring_get_batch_records(ras_core,
> batch_id, trace,
> -                                     ARRAY_SIZE(trace));
> +                                     trace_count);
>               if (count > 0) {
>                       ret = ras_cper_generate_cper(ras_core, trace, count,
>                                       &buffer[offset], req->buf_size -
> offset, &real_data_len); @@ -244,8 +251,8 @@ static int
> ras_cmd_get_cper_records(struct ras_core_context *ras_core,
>
>       if ((ret && (ret != -ENOMEM)) ||
>               copy_to_user(u64_to_user_ptr(req->buf_ptr), buffer,
> offset)) {
> -             kfree(buffer);
> -             return RAS_CMD__ERROR_GENERIC;
> +             ret = RAS_CMD__ERROR_GENERIC;
> +             goto out;
>       }
>
>       rsp->real_data_size = offset;
> @@ -254,10 +261,12 @@ static int ras_cmd_get_cper_records(struct
> ras_core_context *ras_core,
>       rsp->version = 0;
>
>       cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp);
> +     ret = RAS_CMD__SUCCESS;
>
> +out:
> +     kfree(trace);
>       kfree(buffer);
> -
> -     return RAS_CMD__SUCCESS;
> +     return ret;
>  }
>
>  static int ras_cmd_get_batch_trace_snapshot(struct ras_core_context
> *ras_core, @@ -291,7 +300,8 @@ static int
> ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core,
>       struct ras_cmd_batch_trace_record_rsp *output_data =
>                       (struct ras_cmd_batch_trace_record_rsp *)cmd-
> >output_buff_raw;
>       struct ras_log_batch_overview overview;
> -     struct ras_log_info *trace_arry[MAX_RECORD_PER_BATCH] = {0};
> +     struct ras_log_info *trace_arry = NULL;
> +     uint32_t trace_count = MAX_RECORD_PER_BATCH;
>       struct ras_log_info *record;
>       int i, j, count = 0, offset = 0;
>       uint64_t id;
> @@ -309,6 +319,10 @@ static int ras_cmd_get_batch_trace_records(struct
> ras_core_context *ras_core,
>           (input_data->start_batch_id >= overview.last_batch_id))
>               return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
>
> +     trace_arry = kcalloc(trace_count, sizeof(*trace_arry), GFP_KERNEL);
> +     if (!trace_arry)
> +             return RAS_CMD__ERROR_GENERIC;
> +
>       for (i = 0; i < input_data->batch_num; i++) {
>               id = input_data->start_batch_id + i;
>               if (id >= overview.last_batch_id) { @@ -317,17 +331,17
> @@ static int ras_cmd_get_batch_trace_records(struct
> ras_core_context *ras_core,
>               }
>
>               count = ras_log_ring_get_batch_records(ras_core,
> -                                     id, trace_arry,
> ARRAY_SIZE(trace_arry));
> +                                     id, trace_arry, trace_count);
>               if (count > 0) {
>                       if ((offset + count) > RAS_CMD_MAX_TRACE_NUM)
>                               break;
>                       for (j = 0; j < count; j++) {
>                               record = &output_data->records[offset + j];
> -                             record->seqno = trace_arry[j]->seqno;
> -                             record->timestamp = trace_arry[j]-
> >timestamp;
> -                             record->event = trace_arry[j]->event;
> +                             record->seqno = trace_arry[j].seqno;
> +                             record->timestamp = trace_arry[j].timestamp;
> +                             record->event = trace_arry[j].event;
>                               memcpy(&record->aca_reg,
> -                                     &trace_arry[j]->aca_reg,
> sizeof(trace_arry[j]->aca_reg));
> +                                     &trace_arry[j].aca_reg,
> sizeof(trace_arry[j].aca_reg));
>                       }
>               } else {
>                       count = 0;
> @@ -346,6 +360,8 @@ static int ras_cmd_get_batch_trace_records(struct
> ras_core_context *ras_core,
>
>       cmd->output_size = sizeof(struct
> ras_cmd_batch_trace_record_rsp);
>
> +     kfree(trace_arry);
> +
>       return RAS_CMD__SUCCESS;
>  }
>
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c
> b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c
> index 0fc7522b7ab6..6e93a13bbc4c 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c
> @@ -175,14 +175,14 @@ static int fill_section_runtime(struct
> ras_core_context *ras_core,  }
>
>  static int cper_generate_runtime_record(struct ras_core_context *ras_core,
> -     struct cper_section_hdr *hdr, struct ras_log_info **trace_arr, uint32_t
> arr_num,
> +     struct cper_section_hdr *hdr, struct ras_log_info *trace_arr,
> +uint32_t arr_num,
>               enum ras_cper_severity sev)  {
>       struct cper_section_descriptor *descriptor;
>       struct cper_section_runtime *runtime;
>       int i;
>
> -     fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev,
> trace_arr[0]);
> +     fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev,
> +&trace_arr[0]);
>       hdr->record_length =  RAS_HDR_LEN + ((RAS_SEC_DESC_LEN +
> RAS_NONSTD_SEC_LEN) * arr_num);
>       hdr->sec_cnt = arr_num;
>       for (i = 0; i < arr_num; i++) {
> @@ -194,21 +194,21 @@ static int cper_generate_runtime_record(struct
> ras_core_context *ras_core,
>               fill_section_descriptor(ras_core, descriptor, sev, RUNTIME,
>                       RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i),
>                       sizeof(struct cper_section_runtime));
> -             fill_section_runtime(ras_core, runtime, trace_arr[i], sev);
> +             fill_section_runtime(ras_core, runtime, &trace_arr[i],
> + sev);
>       }
>
>       return 0;
>  }
>
>  static int cper_generate_fatal_record(struct ras_core_context *ras_core,
> -     uint8_t *buffer, struct ras_log_info **trace_arr, uint32_t arr_num)
> +     uint8_t *buffer, struct ras_log_info *trace_arr, uint32_t
> + arr_num)
>  {
>       struct ras_cper_fatal_record record = {0};
>       int i = 0;
>
>       for (i = 0; i < arr_num; i++) {
>               fill_section_hdr(ras_core, &record.hdr,
> RAS_CPER_TYPE_FATAL,
> -                              RAS_CPER_SEV_FATAL_UE, trace_arr[i]);
> +                              RAS_CPER_SEV_FATAL_UE, &trace_arr[i]);
>               record.hdr.record_length =  RAS_HDR_LEN +
> RAS_SEC_DESC_LEN + RAS_FATAL_SEC_LEN;
>               record.hdr.sec_cnt = 1;
>
> @@ -216,7 +216,7 @@ static int cper_generate_fatal_record(struct
> ras_core_context *ras_core,
>                                       CRASHDUMP, offsetof(struct
> ras_cper_fatal_record, fatal),
>                                       sizeof(struct
> cper_section_fatal));
>
> -             fill_section_fatal(ras_core, &record.fatal, trace_arr[i]);
> +             fill_section_fatal(ras_core, &record.fatal,
> + &trace_arr[i]);
>
>               memcpy(buffer + (i * record.hdr.record_length),
>                               &record, record.hdr.record_length); @@ -
> 271,7 +271,7 @@ static enum ras_cper_type
> cper_ras_log_event_to_cper_type(enum ras_log_event eve  }
>
>  int ras_cper_generate_cper(struct ras_core_context *ras_core,
> -             struct ras_log_info **trace_list, uint32_t count,
> +             struct ras_log_info *trace_list, uint32_t count,
>               uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len)  {
>       uint8_t *buffer = buf;
> @@ -281,14 +281,14 @@ int ras_cper_generate_cper(struct
> ras_core_context *ras_core,
>
>       /* All the batch traces share the same event */
>       record_size = cper_get_record_size(
> -                     cper_ras_log_event_to_cper_type(trace_list[0]-
> >event), count);
> +
>       cper_ras_log_event_to_cper_type(trace_list[0].event), count);
>
>       if ((record_size + saved_size) > buf_size)
>               return -ENOMEM;
>
>       hdr = (struct cper_section_hdr *)(buffer + saved_size);
>
> -     switch (trace_list[0]->event) {
> +     switch (trace_list[0].event) {
>       case RAS_LOG_EVENT_RMA:
>               cper_generate_runtime_record(ras_core, hdr, trace_list,
> count, RAS_CPER_SEV_RMA);
>               break;
> @@ -304,7 +304,7 @@ int ras_cper_generate_cper(struct ras_core_context
> *ras_core,
>               cper_generate_fatal_record(ras_core, buffer +
> saved_size, trace_list, count);
>               break;
>       default:
> -             RAS_DEV_WARN(ras_core->dev, "Unprocessed trace
> event: %d\n", trace_list[0]->event);
> +             RAS_DEV_WARN(ras_core->dev, "Unprocessed trace
> event: %d\n",
> +trace_list[0].event);
>               break;
>       }
>
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.h
> b/drivers/gpu/drm/amd/ras/rascore/ras_cper.h
> index 076c1883c1ce..e4e3615ecc2e 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_cper.h
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.h
> @@ -299,6 +299,6 @@ struct ras_cper_fatal_record {  struct
> ras_core_context;  struct ras_log_info;  int
> ras_cper_generate_cper(struct ras_core_context *ras_core,
> -             struct ras_log_info **trace_list, uint32_t count,
> +             struct ras_log_info *trace_list, uint32_t count,
>               uint8_t *buf, uint32_t buf_len, uint32_t
> *real_data_len); #endif diff --git
> a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c
> b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c
> index 0a838fdcb2f6..c2fca1a1e780 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c
> @@ -265,8 +265,8 @@ void ras_log_ring_add_log_event(struct
> ras_core_context *ras_core,
>       ras_log_ring_add_data(ras_core, log, batch_tag);  }
>
> -static struct ras_log_info *ras_log_ring_lookup_data(struct
> ras_core_context *ras_core,
> -                                     uint64_t idx)
> +static int ras_log_ring_lookup_data(struct ras_core_context *ras_core,
> +                                     uint64_t idx, struct
> +ras_log_info *log)
>  {
>       struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
>       unsigned long flags = 0;
> @@ -274,30 +274,27 @@ static struct ras_log_info
> *ras_log_ring_lookup_data(struct ras_core_context *ra
>
>       spin_lock_irqsave(&log_ring->spin_lock, flags);
>       data = radix_tree_lookup(&log_ring->ras_log_root, idx);
> +     if (data)
> +             memcpy(log, data, sizeof(*log));
>       spin_unlock_irqrestore(&log_ring->spin_lock, flags);
>
> -     return (struct ras_log_info *)data;
> +     return data ? 0 : -ENODATA;
>  }
>
>  int ras_log_ring_get_batch_records(struct ras_core_context *ras_core,
> uint64_t batch_id,
> -             struct ras_log_info **log_arr, uint32_t arr_num)
> +             struct ras_log_info *log_arr, uint32_t arr_num)
>  {
>       struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
>       uint32_t i, idx, count = 0;
> -     void *data;
>
> -     if ((batch_id >= log_ring->mono_upward_batch_id) ||
> +     if (!log_arr || !arr_num || (batch_id >=
> +log_ring->mono_upward_batch_id) ||
>               (batch_id < log_ring->last_del_batch_id))
>               return -EINVAL;
>
> -     for (i = 0; i < MAX_RECORD_PER_BATCH; i++) {
> +     for (i = 0; i < MAX_RECORD_PER_BATCH && i < arr_num; i++) {
>               idx = BATCH_IDX_TO_TREE_IDX(batch_id, i);
> -             data = ras_log_ring_lookup_data(ras_core, idx);
> -             if (data) {
> -                     log_arr[count++] = data;
> -                     if (count >= arr_num)
> -                             break;
> -             }
> +             if (!ras_log_ring_lookup_data(ras_core, idx, &log_arr[count]))
> +                     count++;
>       }
>
>       return count;
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h
> b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h
> index 0ff6cc35678d..cb66beaa9f43 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h
> @@ -86,7 +86,7 @@ void ras_log_ring_add_log_event(struct
> ras_core_context *ras_core,
>               enum ras_log_event event, void *data, struct
> ras_log_batch_tag *tag);
>
>  int ras_log_ring_get_batch_records(struct ras_core_context *ras_core,
> uint64_t batch_idx,
> -             struct ras_log_info **log_arr, uint32_t arr_num);
> +             struct ras_log_info *log_arr, uint32_t arr_num);
>
>  int ras_log_ring_get_batch_overview(struct ras_core_context *ras_core,
>               struct ras_log_batch_overview *overview);
> --
> 2.43.0


Reply via email to