AMD General

Got it. With my concern in patch #7 fixed, the series is:

Reviewed-by: Tao Zhou <[email protected]>

> -----Original Message-----
> From: Chai, Thomas <[email protected]>
> Sent: Tuesday, May 19, 2026 11:22 AM
> To: Zhou1, Tao <[email protected]>; [email protected]
> Cc: Zhang, Hawking <[email protected]>; Yang, Stanley
> <[email protected]>
> Subject: RE: [PATCH 6/7] drm/amd/ras: copy ras log data instead of
> referencing pointers
>
> AMD General
>
> The current patch is to fix a potential invalid pointer access issue.  The 
> issues
> of simplification and naming consistency can be addressed in subsequent
> patches.
>
>
> Best Regards,
> Thomas
> -----Original Message-----
> From: Zhou1, Tao <[email protected]>
> Sent: Tuesday, May 19, 2026 11:11 AM
> To: Chai, Thomas <[email protected]>; [email protected]
> Cc: Zhang, Hawking <[email protected]>; Yang, Stanley
> <[email protected]>
> Subject: RE: [PATCH 6/7] drm/amd/ras: copy ras log data instead of
> referencing pointers
>
> AMD General
>
> [Tao] The buffer has three different names in this patch: trace, trace_arr,
> trace_arry, can we simplify it?
>
> > -----Original Message-----
> > From: Chai, Thomas <[email protected]>
> > Sent: Monday, May 18, 2026 3:22 PM
> > To: [email protected]
> > Cc: Chai, Thomas <[email protected]>; Zhang, Hawking
> > <[email protected]>; Zhou1, Tao <[email protected]>; Yang,
> Stanley
> > <[email protected]>; Chai, Thomas <[email protected]>
> > Subject: [PATCH 6/7] drm/amd/ras: copy ras log data instead of
> > referencing pointers
> >
> > When generating ras cper file, the original data nodes in the ras log
> > ring buffer may be deleted, leading to invalid pointer access. Copy
> > the data from the ras log ring instead of directly referencing the pointers 
> > to
> avoid this issue.
> >
> > Signed-off-by: YiPeng Chai <[email protected]>
> > ---
> >  .../drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c | 12 +++---
> >  drivers/gpu/drm/amd/ras/rascore/ras_cmd.c     | 42 +++++++++++++------
> >  drivers/gpu/drm/amd/ras/rascore/ras_cper.c    | 20 ++++-----
> >  drivers/gpu/drm/amd/ras/rascore/ras_cper.h    |  2 +-
> >  .../gpu/drm/amd/ras/rascore/ras_log_ring.c    | 23 +++++-----
> >  .../gpu/drm/amd/ras/rascore/ras_log_ring.h    |  2 +-
> >  6 files changed, 58 insertions(+), 43 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
> > b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
> > index b8e9442b2ca5..537f709d8570 100644
> > --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
> > +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c
> > @@ -219,7 +219,7 @@ static bool
> > amdgpu_virt_ras_check_batch_cached(struct
> ras_cmd_batch_trace_record }
> >
> >  static int amdgpu_virt_ras_get_batch_records(struct ras_core_context
> > *ras_core, uint64_t batch_id,
> > -                     struct ras_log_info **trace_arr, uint32_t arr_num,
> > +                     struct ras_log_info *trace_arr, uint32_t
> > + arr_num,
> >                       struct ras_cmd_batch_trace_record_rsp *rsp_cache)  {
> >       struct ras_cmd_batch_trace_record_req req = { @@ -255,7 +255,8
> > @@ static int amdgpu_virt_ras_get_batch_records(struct
> > ras_core_context *ras_core,
> >       }
> >
> >       for (i = 0; i < batch->trace_num && i < arr_num; i++)
> > -             trace_arr[i] = &rsp->records[batch->offset + i];
> > +             memcpy(&trace_arr[i],
> > +                     &rsp->records[batch->offset + i],
> > + sizeof(*trace_arr));
> >
> >       return i;
> >  }
> > @@ -272,7 +273,8 @@ static int amdgpu_virt_ras_get_cper_records(struct
> > ras_core_context *ras_core,
> >               (struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw;
> >       struct ras_log_batch_overview *overview = &virt_ras-
> > >batch_mgr.batch_overview;
> >       struct ras_cmd_batch_trace_record_rsp *rsp_cache = &virt_ras-
> > >batch_mgr.batch_trace;
> > -     struct ras_log_info **trace;
> > +     struct ras_log_info *trace;
> > +     uint32_t trace_count = MAX_RECORD_PER_BATCH;
> >       uint32_t offset = 0, real_data_len = 0;
> >       uint64_t batch_id;
> >       uint8_t *out_buf;
> > @@ -289,7 +291,7 @@ static int amdgpu_virt_ras_get_cper_records(struct
> > ras_core_context *ras_core,
> >           req->cper_num > RAS_CMD_MAX_CPER_FETCH_NUM)
> >               return RAS_CMD__ERROR_INVALID_INPUT_DATA;
> >
> > -     trace = kcalloc(MAX_RECORD_PER_BATCH, sizeof(*trace),
> > GFP_KERNEL);
> > +     trace = kcalloc(trace_count, sizeof(*trace), GFP_KERNEL);
> >       if (!trace)
> >               return RAS_CMD__ERROR_GENERIC;
> >
> > @@ -306,7 +308,7 @@ static int amdgpu_virt_ras_get_cper_records(struct
> > ras_core_context *ras_core,
> >               if (batch_id >= overview->last_batch_id)
> >                       break;
> >               count = amdgpu_virt_ras_get_batch_records(ras_core,
> > batch_id,
> > -                                                       trace,
> > MAX_RECORD_PER_BATCH,
> > +                                                       trace,
> > + trace_count,
> >                                                         rsp_cache);
> >               if (count > 0) {
> >                       ret = ras_cper_generate_cper(ras_core, trace,
> > count, diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
> > b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
> > index 5b7a36596b02..088b9b153f7f 100644
> > --- a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
> > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
> > @@ -202,11 +202,12 @@ static int ras_cmd_get_cper_records(struct
> > ras_core_context *ras_core,
> >                       (struct ras_cmd_cper_record_req *)cmd-
> > >input_buff_raw;
> >       struct ras_cmd_cper_record_rsp *rsp =
> >                       (struct ras_cmd_cper_record_rsp *)cmd-
> > >output_buff_raw;
> > -     struct ras_log_info *trace[MAX_RECORD_PER_BATCH] = {0};
> > +     struct ras_log_info *trace = NULL;
> > +     uint32_t trace_count = MAX_RECORD_PER_BATCH;
> >       struct ras_log_batch_overview overview;
> >       uint32_t offset = 0, real_data_len = 0;
> >       uint64_t batch_id;
> > -     uint8_t *buffer;
> > +     uint8_t *buffer = NULL;
> >       int ret = 0, i, count;
> >
> >       if ((cmd->input_size != sizeof(struct ras_cmd_cper_record_req))
> > || @@ -224,6 +225,12 @@ static int ras_cmd_get_cper_records(struct
> > ras_core_context *ras_core,
> >       if (!buffer)
> >               return RAS_CMD__ERROR_GENERIC;
> >
> > +     trace = kcalloc(trace_count, sizeof(*trace), GFP_KERNEL);
> > +     if (!trace) {
> > +             ret = RAS_CMD__ERROR_GENERIC;
> > +             goto out;
> > +     }
> > +
> >       ras_log_ring_get_batch_overview(ras_core, &overview);
> >       for (i = 0; i < req->cper_num; i++) {
> >               batch_id = req->cper_start_id + i; @@ -231,7 +238,7 @@
> > static int ras_cmd_get_cper_records(struct ras_core_context *ras_core,
> >                       break;
> >
> >               count = ras_log_ring_get_batch_records(ras_core,
> > batch_id, trace,
> > -                                     ARRAY_SIZE(trace));
> > +                                     trace_count);
> >               if (count > 0) {
> >                       ret = ras_cper_generate_cper(ras_core, trace, count,
> >                                       &buffer[offset], req->buf_size -
> > offset, &real_data_len); @@ -244,8 +251,8 @@ static int
> > ras_cmd_get_cper_records(struct ras_core_context *ras_core,
> >
> >       if ((ret && (ret != -ENOMEM)) ||
> >               copy_to_user(u64_to_user_ptr(req->buf_ptr), buffer,
> > offset)) {
> > -             kfree(buffer);
> > -             return RAS_CMD__ERROR_GENERIC;
> > +             ret = RAS_CMD__ERROR_GENERIC;
> > +             goto out;
> >       }
> >
> >       rsp->real_data_size = offset;
> > @@ -254,10 +261,12 @@ static int ras_cmd_get_cper_records(struct
> > ras_core_context *ras_core,
> >       rsp->version = 0;
> >
> >       cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp);
> > +     ret = RAS_CMD__SUCCESS;
> >
> > +out:
> > +     kfree(trace);
> >       kfree(buffer);
> > -
> > -     return RAS_CMD__SUCCESS;
> > +     return ret;
> >  }
> >
> >  static int ras_cmd_get_batch_trace_snapshot(struct ras_core_context
> > *ras_core, @@ -291,7 +300,8 @@ static int
> > ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core,
> >       struct ras_cmd_batch_trace_record_rsp *output_data =
> >                       (struct ras_cmd_batch_trace_record_rsp *)cmd-
> > >output_buff_raw;
> >       struct ras_log_batch_overview overview;
> > -     struct ras_log_info *trace_arry[MAX_RECORD_PER_BATCH] = {0};
> > +     struct ras_log_info *trace_arry = NULL;
> > +     uint32_t trace_count = MAX_RECORD_PER_BATCH;
> >       struct ras_log_info *record;
> >       int i, j, count = 0, offset = 0;
> >       uint64_t id;
> > @@ -309,6 +319,10 @@ static int ras_cmd_get_batch_trace_records(struct
> > ras_core_context *ras_core,
> >           (input_data->start_batch_id >= overview.last_batch_id))
> >               return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
> >
> > +     trace_arry = kcalloc(trace_count, sizeof(*trace_arry), GFP_KERNEL);
> > +     if (!trace_arry)
> > +             return RAS_CMD__ERROR_GENERIC;
> > +
> >       for (i = 0; i < input_data->batch_num; i++) {
> >               id = input_data->start_batch_id + i;
> >               if (id >= overview.last_batch_id) { @@ -317,17 +331,17
> > @@ static int ras_cmd_get_batch_trace_records(struct
> > ras_core_context *ras_core,
> >               }
> >
> >               count = ras_log_ring_get_batch_records(ras_core,
> > -                                     id, trace_arry,
> > ARRAY_SIZE(trace_arry));
> > +                                     id, trace_arry, trace_count);
> >               if (count > 0) {
> >                       if ((offset + count) > RAS_CMD_MAX_TRACE_NUM)
> >                               break;
> >                       for (j = 0; j < count; j++) {
> >                               record = &output_data->records[offset + j];
> > -                             record->seqno = trace_arry[j]->seqno;
> > -                             record->timestamp = trace_arry[j]-
> > >timestamp;
> > -                             record->event = trace_arry[j]->event;
> > +                             record->seqno = trace_arry[j].seqno;
> > +                             record->timestamp = trace_arry[j].timestamp;
> > +                             record->event = trace_arry[j].event;
> >                               memcpy(&record->aca_reg,
> > -                                     &trace_arry[j]->aca_reg,
> > sizeof(trace_arry[j]->aca_reg));
> > +                                     &trace_arry[j].aca_reg,
> > sizeof(trace_arry[j].aca_reg));
> >                       }
> >               } else {
> >                       count = 0;
> > @@ -346,6 +360,8 @@ static int ras_cmd_get_batch_trace_records(struct
> > ras_core_context *ras_core,
> >
> >       cmd->output_size = sizeof(struct
> > ras_cmd_batch_trace_record_rsp);
> >
> > +     kfree(trace_arry);
> > +
> >       return RAS_CMD__SUCCESS;
> >  }
> >
> > diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c
> > b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c
> > index 0fc7522b7ab6..6e93a13bbc4c 100644
> > --- a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c
> > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c
> > @@ -175,14 +175,14 @@ static int fill_section_runtime(struct
> > ras_core_context *ras_core,  }
> >
> >  static int cper_generate_runtime_record(struct ras_core_context *ras_core,
> > -     struct cper_section_hdr *hdr, struct ras_log_info **trace_arr, 
> > uint32_t
> > arr_num,
> > +     struct cper_section_hdr *hdr, struct ras_log_info *trace_arr,
> > +uint32_t arr_num,
> >               enum ras_cper_severity sev)  {
> >       struct cper_section_descriptor *descriptor;
> >       struct cper_section_runtime *runtime;
> >       int i;
> >
> > -     fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev,
> > trace_arr[0]);
> > +     fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev,
> > +&trace_arr[0]);
> >       hdr->record_length =  RAS_HDR_LEN + ((RAS_SEC_DESC_LEN +
> > RAS_NONSTD_SEC_LEN) * arr_num);
> >       hdr->sec_cnt = arr_num;
> >       for (i = 0; i < arr_num; i++) {
> > @@ -194,21 +194,21 @@ static int cper_generate_runtime_record(struct
> > ras_core_context *ras_core,
> >               fill_section_descriptor(ras_core, descriptor, sev, RUNTIME,
> >                       RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i),
> >                       sizeof(struct cper_section_runtime));
> > -             fill_section_runtime(ras_core, runtime, trace_arr[i], sev);
> > +             fill_section_runtime(ras_core, runtime, &trace_arr[i],
> > + sev);
> >       }
> >
> >       return 0;
> >  }
> >
> >  static int cper_generate_fatal_record(struct ras_core_context *ras_core,
> > -     uint8_t *buffer, struct ras_log_info **trace_arr, uint32_t arr_num)
> > +     uint8_t *buffer, struct ras_log_info *trace_arr, uint32_t
> > + arr_num)
> >  {
> >       struct ras_cper_fatal_record record = {0};
> >       int i = 0;
> >
> >       for (i = 0; i < arr_num; i++) {
> >               fill_section_hdr(ras_core, &record.hdr,
> > RAS_CPER_TYPE_FATAL,
> > -                              RAS_CPER_SEV_FATAL_UE, trace_arr[i]);
> > +                              RAS_CPER_SEV_FATAL_UE, &trace_arr[i]);
> >               record.hdr.record_length =  RAS_HDR_LEN +
> > RAS_SEC_DESC_LEN + RAS_FATAL_SEC_LEN;
> >               record.hdr.sec_cnt = 1;
> >
> > @@ -216,7 +216,7 @@ static int cper_generate_fatal_record(struct
> > ras_core_context *ras_core,
> >                                       CRASHDUMP, offsetof(struct
> > ras_cper_fatal_record, fatal),
> >                                       sizeof(struct
> > cper_section_fatal));
> >
> > -             fill_section_fatal(ras_core, &record.fatal, trace_arr[i]);
> > +             fill_section_fatal(ras_core, &record.fatal,
> > + &trace_arr[i]);
> >
> >               memcpy(buffer + (i * record.hdr.record_length),
> >                               &record, record.hdr.record_length); @@ -
> > 271,7 +271,7 @@ static enum ras_cper_type
> > cper_ras_log_event_to_cper_type(enum ras_log_event eve  }
> >
> >  int ras_cper_generate_cper(struct ras_core_context *ras_core,
> > -             struct ras_log_info **trace_list, uint32_t count,
> > +             struct ras_log_info *trace_list, uint32_t count,
> >               uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len)  {
> >       uint8_t *buffer = buf;
> > @@ -281,14 +281,14 @@ int ras_cper_generate_cper(struct
> > ras_core_context *ras_core,
> >
> >       /* All the batch traces share the same event */
> >       record_size = cper_get_record_size(
> > -                     cper_ras_log_event_to_cper_type(trace_list[0]-
> > >event), count);
> > +
> >       cper_ras_log_event_to_cper_type(trace_list[0].event), count);
> >
> >       if ((record_size + saved_size) > buf_size)
> >               return -ENOMEM;
> >
> >       hdr = (struct cper_section_hdr *)(buffer + saved_size);
> >
> > -     switch (trace_list[0]->event) {
> > +     switch (trace_list[0].event) {
> >       case RAS_LOG_EVENT_RMA:
> >               cper_generate_runtime_record(ras_core, hdr, trace_list,
> > count, RAS_CPER_SEV_RMA);
> >               break;
> > @@ -304,7 +304,7 @@ int ras_cper_generate_cper(struct ras_core_context
> > *ras_core,
> >               cper_generate_fatal_record(ras_core, buffer +
> > saved_size, trace_list, count);
> >               break;
> >       default:
> > -             RAS_DEV_WARN(ras_core->dev, "Unprocessed trace
> > event: %d\n", trace_list[0]->event);
> > +             RAS_DEV_WARN(ras_core->dev, "Unprocessed trace
> > event: %d\n",
> > +trace_list[0].event);
> >               break;
> >       }
> >
> > diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.h
> > b/drivers/gpu/drm/amd/ras/rascore/ras_cper.h
> > index 076c1883c1ce..e4e3615ecc2e 100644
> > --- a/drivers/gpu/drm/amd/ras/rascore/ras_cper.h
> > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.h
> > @@ -299,6 +299,6 @@ struct ras_cper_fatal_record {  struct
> > ras_core_context;  struct ras_log_info;  int
> > ras_cper_generate_cper(struct ras_core_context *ras_core,
> > -             struct ras_log_info **trace_list, uint32_t count,
> > +             struct ras_log_info *trace_list, uint32_t count,
> >               uint8_t *buf, uint32_t buf_len, uint32_t
> > *real_data_len); #endif diff --git
> > a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c
> > b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c
> > index 0a838fdcb2f6..c2fca1a1e780 100644
> > --- a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c
> > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c
> > @@ -265,8 +265,8 @@ void ras_log_ring_add_log_event(struct
> > ras_core_context *ras_core,
> >       ras_log_ring_add_data(ras_core, log, batch_tag);  }
> >
> > -static struct ras_log_info *ras_log_ring_lookup_data(struct
> > ras_core_context *ras_core,
> > -                                     uint64_t idx)
> > +static int ras_log_ring_lookup_data(struct ras_core_context *ras_core,
> > +                                     uint64_t idx, struct
> > +ras_log_info *log)
> >  {
> >       struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
> >       unsigned long flags = 0;
> > @@ -274,30 +274,27 @@ static struct ras_log_info
> > *ras_log_ring_lookup_data(struct ras_core_context *ra
> >
> >       spin_lock_irqsave(&log_ring->spin_lock, flags);
> >       data = radix_tree_lookup(&log_ring->ras_log_root, idx);
> > +     if (data)
> > +             memcpy(log, data, sizeof(*log));
> >       spin_unlock_irqrestore(&log_ring->spin_lock, flags);
> >
> > -     return (struct ras_log_info *)data;
> > +     return data ? 0 : -ENODATA;
> >  }
> >
> >  int ras_log_ring_get_batch_records(struct ras_core_context *ras_core,
> > uint64_t batch_id,
> > -             struct ras_log_info **log_arr, uint32_t arr_num)
> > +             struct ras_log_info *log_arr, uint32_t arr_num)
> >  {
> >       struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
> >       uint32_t i, idx, count = 0;
> > -     void *data;
> >
> > -     if ((batch_id >= log_ring->mono_upward_batch_id) ||
> > +     if (!log_arr || !arr_num || (batch_id >=
> > +log_ring->mono_upward_batch_id) ||
> >               (batch_id < log_ring->last_del_batch_id))
> >               return -EINVAL;
> >
> > -     for (i = 0; i < MAX_RECORD_PER_BATCH; i++) {
> > +     for (i = 0; i < MAX_RECORD_PER_BATCH && i < arr_num; i++) {
> >               idx = BATCH_IDX_TO_TREE_IDX(batch_id, i);
> > -             data = ras_log_ring_lookup_data(ras_core, idx);
> > -             if (data) {
> > -                     log_arr[count++] = data;
> > -                     if (count >= arr_num)
> > -                             break;
> > -             }
> > +             if (!ras_log_ring_lookup_data(ras_core, idx, &log_arr[count]))
> > +                     count++;
> >       }
> >
> >       return count;
> > diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h
> > b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h
> > index 0ff6cc35678d..cb66beaa9f43 100644
> > --- a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h
> > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h
> > @@ -86,7 +86,7 @@ void ras_log_ring_add_log_event(struct
> > ras_core_context *ras_core,
> >               enum ras_log_event event, void *data, struct
> > ras_log_batch_tag *tag);
> >
> >  int ras_log_ring_get_batch_records(struct ras_core_context *ras_core,
> > uint64_t batch_idx,
> > -             struct ras_log_info **log_arr, uint32_t arr_num);
> > +             struct ras_log_info *log_arr, uint32_t arr_num);
> >
> >  int ras_log_ring_get_batch_overview(struct ras_core_context *ras_core,
> >               struct ras_log_batch_overview *overview);
> > --
> > 2.43.0
>
>

Reply via email to