AMD General Got it. With my concern in patch #7 fixed, the series is:
Reviewed-by: Tao Zhou <[email protected]> > -----Original Message----- > From: Chai, Thomas <[email protected]> > Sent: Tuesday, May 19, 2026 11:22 AM > To: Zhou1, Tao <[email protected]>; [email protected] > Cc: Zhang, Hawking <[email protected]>; Yang, Stanley > <[email protected]> > Subject: RE: [PATCH 6/7] drm/amd/ras: copy ras log data instead of > referencing pointers > > AMD General > > The current patch is to fix a potential invalid pointer access issue. The > issues > of simplification and naming consistency can be addressed in subsequent > patches. > > > Best Regards, > Thomas > -----Original Message----- > From: Zhou1, Tao <[email protected]> > Sent: Tuesday, May 19, 2026 11:11 AM > To: Chai, Thomas <[email protected]>; [email protected] > Cc: Zhang, Hawking <[email protected]>; Yang, Stanley > <[email protected]> > Subject: RE: [PATCH 6/7] drm/amd/ras: copy ras log data instead of > referencing pointers > > AMD General > > [Tao] The buffer has three different names in this patch: trace, trace_arr, > trace_arry, can we simplify it? > > > -----Original Message----- > > From: Chai, Thomas <[email protected]> > > Sent: Monday, May 18, 2026 3:22 PM > > To: [email protected] > > Cc: Chai, Thomas <[email protected]>; Zhang, Hawking > > <[email protected]>; Zhou1, Tao <[email protected]>; Yang, > Stanley > > <[email protected]>; Chai, Thomas <[email protected]> > > Subject: [PATCH 6/7] drm/amd/ras: copy ras log data instead of > > referencing pointers > > > > When generating ras cper file, the original data nodes in the ras log > > ring buffer may be deleted, leading to invalid pointer access. Copy > > the data from the ras log ring instead of directly referencing the pointers > > to > avoid this issue. > > > > Signed-off-by: YiPeng Chai <[email protected]> > > --- > > .../drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c | 12 +++--- > > drivers/gpu/drm/amd/ras/rascore/ras_cmd.c | 42 +++++++++++++------ > > drivers/gpu/drm/amd/ras/rascore/ras_cper.c | 20 ++++----- > > drivers/gpu/drm/amd/ras/rascore/ras_cper.h | 2 +- > > .../gpu/drm/amd/ras/rascore/ras_log_ring.c | 23 +++++----- > > .../gpu/drm/amd/ras/rascore/ras_log_ring.h | 2 +- > > 6 files changed, 58 insertions(+), 43 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c > > b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c > > index b8e9442b2ca5..537f709d8570 100644 > > --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c > > +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c > > @@ -219,7 +219,7 @@ static bool > > amdgpu_virt_ras_check_batch_cached(struct > ras_cmd_batch_trace_record } > > > > static int amdgpu_virt_ras_get_batch_records(struct ras_core_context > > *ras_core, uint64_t batch_id, > > - struct ras_log_info **trace_arr, uint32_t arr_num, > > + struct ras_log_info *trace_arr, uint32_t > > + arr_num, > > struct ras_cmd_batch_trace_record_rsp *rsp_cache) { > > struct ras_cmd_batch_trace_record_req req = { @@ -255,7 +255,8 > > @@ static int amdgpu_virt_ras_get_batch_records(struct > > ras_core_context *ras_core, > > } > > > > for (i = 0; i < batch->trace_num && i < arr_num; i++) > > - trace_arr[i] = &rsp->records[batch->offset + i]; > > + memcpy(&trace_arr[i], > > + &rsp->records[batch->offset + i], > > + sizeof(*trace_arr)); > > > > return i; > > } > > @@ -272,7 +273,8 @@ static int amdgpu_virt_ras_get_cper_records(struct > > ras_core_context *ras_core, > > (struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw; > > struct ras_log_batch_overview *overview = &virt_ras- > > >batch_mgr.batch_overview; > > struct ras_cmd_batch_trace_record_rsp *rsp_cache = &virt_ras- > > >batch_mgr.batch_trace; > > - struct ras_log_info **trace; > > + struct ras_log_info *trace; > > + uint32_t trace_count = MAX_RECORD_PER_BATCH; > > uint32_t offset = 0, real_data_len = 0; > > uint64_t batch_id; > > uint8_t *out_buf; > > @@ -289,7 +291,7 @@ static int amdgpu_virt_ras_get_cper_records(struct > > ras_core_context *ras_core, > > req->cper_num > RAS_CMD_MAX_CPER_FETCH_NUM) > > return RAS_CMD__ERROR_INVALID_INPUT_DATA; > > > > - trace = kcalloc(MAX_RECORD_PER_BATCH, sizeof(*trace), > > GFP_KERNEL); > > + trace = kcalloc(trace_count, sizeof(*trace), GFP_KERNEL); > > if (!trace) > > return RAS_CMD__ERROR_GENERIC; > > > > @@ -306,7 +308,7 @@ static int amdgpu_virt_ras_get_cper_records(struct > > ras_core_context *ras_core, > > if (batch_id >= overview->last_batch_id) > > break; > > count = amdgpu_virt_ras_get_batch_records(ras_core, > > batch_id, > > - trace, > > MAX_RECORD_PER_BATCH, > > + trace, > > + trace_count, > > rsp_cache); > > if (count > 0) { > > ret = ras_cper_generate_cper(ras_core, trace, > > count, diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c > > b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c > > index 5b7a36596b02..088b9b153f7f 100644 > > --- a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c > > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c > > @@ -202,11 +202,12 @@ static int ras_cmd_get_cper_records(struct > > ras_core_context *ras_core, > > (struct ras_cmd_cper_record_req *)cmd- > > >input_buff_raw; > > struct ras_cmd_cper_record_rsp *rsp = > > (struct ras_cmd_cper_record_rsp *)cmd- > > >output_buff_raw; > > - struct ras_log_info *trace[MAX_RECORD_PER_BATCH] = {0}; > > + struct ras_log_info *trace = NULL; > > + uint32_t trace_count = MAX_RECORD_PER_BATCH; > > struct ras_log_batch_overview overview; > > uint32_t offset = 0, real_data_len = 0; > > uint64_t batch_id; > > - uint8_t *buffer; > > + uint8_t *buffer = NULL; > > int ret = 0, i, count; > > > > if ((cmd->input_size != sizeof(struct ras_cmd_cper_record_req)) > > || @@ -224,6 +225,12 @@ static int ras_cmd_get_cper_records(struct > > ras_core_context *ras_core, > > if (!buffer) > > return RAS_CMD__ERROR_GENERIC; > > > > + trace = kcalloc(trace_count, sizeof(*trace), GFP_KERNEL); > > + if (!trace) { > > + ret = RAS_CMD__ERROR_GENERIC; > > + goto out; > > + } > > + > > ras_log_ring_get_batch_overview(ras_core, &overview); > > for (i = 0; i < req->cper_num; i++) { > > batch_id = req->cper_start_id + i; @@ -231,7 +238,7 @@ > > static int ras_cmd_get_cper_records(struct ras_core_context *ras_core, > > break; > > > > count = ras_log_ring_get_batch_records(ras_core, > > batch_id, trace, > > - ARRAY_SIZE(trace)); > > + trace_count); > > if (count > 0) { > > ret = ras_cper_generate_cper(ras_core, trace, count, > > &buffer[offset], req->buf_size - > > offset, &real_data_len); @@ -244,8 +251,8 @@ static int > > ras_cmd_get_cper_records(struct ras_core_context *ras_core, > > > > if ((ret && (ret != -ENOMEM)) || > > copy_to_user(u64_to_user_ptr(req->buf_ptr), buffer, > > offset)) { > > - kfree(buffer); > > - return RAS_CMD__ERROR_GENERIC; > > + ret = RAS_CMD__ERROR_GENERIC; > > + goto out; > > } > > > > rsp->real_data_size = offset; > > @@ -254,10 +261,12 @@ static int ras_cmd_get_cper_records(struct > > ras_core_context *ras_core, > > rsp->version = 0; > > > > cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp); > > + ret = RAS_CMD__SUCCESS; > > > > +out: > > + kfree(trace); > > kfree(buffer); > > - > > - return RAS_CMD__SUCCESS; > > + return ret; > > } > > > > static int ras_cmd_get_batch_trace_snapshot(struct ras_core_context > > *ras_core, @@ -291,7 +300,8 @@ static int > > ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core, > > struct ras_cmd_batch_trace_record_rsp *output_data = > > (struct ras_cmd_batch_trace_record_rsp *)cmd- > > >output_buff_raw; > > struct ras_log_batch_overview overview; > > - struct ras_log_info *trace_arry[MAX_RECORD_PER_BATCH] = {0}; > > + struct ras_log_info *trace_arry = NULL; > > + uint32_t trace_count = MAX_RECORD_PER_BATCH; > > struct ras_log_info *record; > > int i, j, count = 0, offset = 0; > > uint64_t id; > > @@ -309,6 +319,10 @@ static int ras_cmd_get_batch_trace_records(struct > > ras_core_context *ras_core, > > (input_data->start_batch_id >= overview.last_batch_id)) > > return RAS_CMD__ERROR_INVALID_INPUT_SIZE; > > > > + trace_arry = kcalloc(trace_count, sizeof(*trace_arry), GFP_KERNEL); > > + if (!trace_arry) > > + return RAS_CMD__ERROR_GENERIC; > > + > > for (i = 0; i < input_data->batch_num; i++) { > > id = input_data->start_batch_id + i; > > if (id >= overview.last_batch_id) { @@ -317,17 +331,17 > > @@ static int ras_cmd_get_batch_trace_records(struct > > ras_core_context *ras_core, > > } > > > > count = ras_log_ring_get_batch_records(ras_core, > > - id, trace_arry, > > ARRAY_SIZE(trace_arry)); > > + id, trace_arry, trace_count); > > if (count > 0) { > > if ((offset + count) > RAS_CMD_MAX_TRACE_NUM) > > break; > > for (j = 0; j < count; j++) { > > record = &output_data->records[offset + j]; > > - record->seqno = trace_arry[j]->seqno; > > - record->timestamp = trace_arry[j]- > > >timestamp; > > - record->event = trace_arry[j]->event; > > + record->seqno = trace_arry[j].seqno; > > + record->timestamp = trace_arry[j].timestamp; > > + record->event = trace_arry[j].event; > > memcpy(&record->aca_reg, > > - &trace_arry[j]->aca_reg, > > sizeof(trace_arry[j]->aca_reg)); > > + &trace_arry[j].aca_reg, > > sizeof(trace_arry[j].aca_reg)); > > } > > } else { > > count = 0; > > @@ -346,6 +360,8 @@ static int ras_cmd_get_batch_trace_records(struct > > ras_core_context *ras_core, > > > > cmd->output_size = sizeof(struct > > ras_cmd_batch_trace_record_rsp); > > > > + kfree(trace_arry); > > + > > return RAS_CMD__SUCCESS; > > } > > > > diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c > > b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c > > index 0fc7522b7ab6..6e93a13bbc4c 100644 > > --- a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c > > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c > > @@ -175,14 +175,14 @@ static int fill_section_runtime(struct > > ras_core_context *ras_core, } > > > > static int cper_generate_runtime_record(struct ras_core_context *ras_core, > > - struct cper_section_hdr *hdr, struct ras_log_info **trace_arr, > > uint32_t > > arr_num, > > + struct cper_section_hdr *hdr, struct ras_log_info *trace_arr, > > +uint32_t arr_num, > > enum ras_cper_severity sev) { > > struct cper_section_descriptor *descriptor; > > struct cper_section_runtime *runtime; > > int i; > > > > - fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev, > > trace_arr[0]); > > + fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev, > > +&trace_arr[0]); > > hdr->record_length = RAS_HDR_LEN + ((RAS_SEC_DESC_LEN + > > RAS_NONSTD_SEC_LEN) * arr_num); > > hdr->sec_cnt = arr_num; > > for (i = 0; i < arr_num; i++) { > > @@ -194,21 +194,21 @@ static int cper_generate_runtime_record(struct > > ras_core_context *ras_core, > > fill_section_descriptor(ras_core, descriptor, sev, RUNTIME, > > RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i), > > sizeof(struct cper_section_runtime)); > > - fill_section_runtime(ras_core, runtime, trace_arr[i], sev); > > + fill_section_runtime(ras_core, runtime, &trace_arr[i], > > + sev); > > } > > > > return 0; > > } > > > > static int cper_generate_fatal_record(struct ras_core_context *ras_core, > > - uint8_t *buffer, struct ras_log_info **trace_arr, uint32_t arr_num) > > + uint8_t *buffer, struct ras_log_info *trace_arr, uint32_t > > + arr_num) > > { > > struct ras_cper_fatal_record record = {0}; > > int i = 0; > > > > for (i = 0; i < arr_num; i++) { > > fill_section_hdr(ras_core, &record.hdr, > > RAS_CPER_TYPE_FATAL, > > - RAS_CPER_SEV_FATAL_UE, trace_arr[i]); > > + RAS_CPER_SEV_FATAL_UE, &trace_arr[i]); > > record.hdr.record_length = RAS_HDR_LEN + > > RAS_SEC_DESC_LEN + RAS_FATAL_SEC_LEN; > > record.hdr.sec_cnt = 1; > > > > @@ -216,7 +216,7 @@ static int cper_generate_fatal_record(struct > > ras_core_context *ras_core, > > CRASHDUMP, offsetof(struct > > ras_cper_fatal_record, fatal), > > sizeof(struct > > cper_section_fatal)); > > > > - fill_section_fatal(ras_core, &record.fatal, trace_arr[i]); > > + fill_section_fatal(ras_core, &record.fatal, > > + &trace_arr[i]); > > > > memcpy(buffer + (i * record.hdr.record_length), > > &record, record.hdr.record_length); @@ - > > 271,7 +271,7 @@ static enum ras_cper_type > > cper_ras_log_event_to_cper_type(enum ras_log_event eve } > > > > int ras_cper_generate_cper(struct ras_core_context *ras_core, > > - struct ras_log_info **trace_list, uint32_t count, > > + struct ras_log_info *trace_list, uint32_t count, > > uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len) { > > uint8_t *buffer = buf; > > @@ -281,14 +281,14 @@ int ras_cper_generate_cper(struct > > ras_core_context *ras_core, > > > > /* All the batch traces share the same event */ > > record_size = cper_get_record_size( > > - cper_ras_log_event_to_cper_type(trace_list[0]- > > >event), count); > > + > > cper_ras_log_event_to_cper_type(trace_list[0].event), count); > > > > if ((record_size + saved_size) > buf_size) > > return -ENOMEM; > > > > hdr = (struct cper_section_hdr *)(buffer + saved_size); > > > > - switch (trace_list[0]->event) { > > + switch (trace_list[0].event) { > > case RAS_LOG_EVENT_RMA: > > cper_generate_runtime_record(ras_core, hdr, trace_list, > > count, RAS_CPER_SEV_RMA); > > break; > > @@ -304,7 +304,7 @@ int ras_cper_generate_cper(struct ras_core_context > > *ras_core, > > cper_generate_fatal_record(ras_core, buffer + > > saved_size, trace_list, count); > > break; > > default: > > - RAS_DEV_WARN(ras_core->dev, "Unprocessed trace > > event: %d\n", trace_list[0]->event); > > + RAS_DEV_WARN(ras_core->dev, "Unprocessed trace > > event: %d\n", > > +trace_list[0].event); > > break; > > } > > > > diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.h > > b/drivers/gpu/drm/amd/ras/rascore/ras_cper.h > > index 076c1883c1ce..e4e3615ecc2e 100644 > > --- a/drivers/gpu/drm/amd/ras/rascore/ras_cper.h > > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.h > > @@ -299,6 +299,6 @@ struct ras_cper_fatal_record { struct > > ras_core_context; struct ras_log_info; int > > ras_cper_generate_cper(struct ras_core_context *ras_core, > > - struct ras_log_info **trace_list, uint32_t count, > > + struct ras_log_info *trace_list, uint32_t count, > > uint8_t *buf, uint32_t buf_len, uint32_t > > *real_data_len); #endif diff --git > > a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c > > b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c > > index 0a838fdcb2f6..c2fca1a1e780 100644 > > --- a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c > > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c > > @@ -265,8 +265,8 @@ void ras_log_ring_add_log_event(struct > > ras_core_context *ras_core, > > ras_log_ring_add_data(ras_core, log, batch_tag); } > > > > -static struct ras_log_info *ras_log_ring_lookup_data(struct > > ras_core_context *ras_core, > > - uint64_t idx) > > +static int ras_log_ring_lookup_data(struct ras_core_context *ras_core, > > + uint64_t idx, struct > > +ras_log_info *log) > > { > > struct ras_log_ring *log_ring = &ras_core->ras_log_ring; > > unsigned long flags = 0; > > @@ -274,30 +274,27 @@ static struct ras_log_info > > *ras_log_ring_lookup_data(struct ras_core_context *ra > > > > spin_lock_irqsave(&log_ring->spin_lock, flags); > > data = radix_tree_lookup(&log_ring->ras_log_root, idx); > > + if (data) > > + memcpy(log, data, sizeof(*log)); > > spin_unlock_irqrestore(&log_ring->spin_lock, flags); > > > > - return (struct ras_log_info *)data; > > + return data ? 0 : -ENODATA; > > } > > > > int ras_log_ring_get_batch_records(struct ras_core_context *ras_core, > > uint64_t batch_id, > > - struct ras_log_info **log_arr, uint32_t arr_num) > > + struct ras_log_info *log_arr, uint32_t arr_num) > > { > > struct ras_log_ring *log_ring = &ras_core->ras_log_ring; > > uint32_t i, idx, count = 0; > > - void *data; > > > > - if ((batch_id >= log_ring->mono_upward_batch_id) || > > + if (!log_arr || !arr_num || (batch_id >= > > +log_ring->mono_upward_batch_id) || > > (batch_id < log_ring->last_del_batch_id)) > > return -EINVAL; > > > > - for (i = 0; i < MAX_RECORD_PER_BATCH; i++) { > > + for (i = 0; i < MAX_RECORD_PER_BATCH && i < arr_num; i++) { > > idx = BATCH_IDX_TO_TREE_IDX(batch_id, i); > > - data = ras_log_ring_lookup_data(ras_core, idx); > > - if (data) { > > - log_arr[count++] = data; > > - if (count >= arr_num) > > - break; > > - } > > + if (!ras_log_ring_lookup_data(ras_core, idx, &log_arr[count])) > > + count++; > > } > > > > return count; > > diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h > > b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h > > index 0ff6cc35678d..cb66beaa9f43 100644 > > --- a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h > > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h > > @@ -86,7 +86,7 @@ void ras_log_ring_add_log_event(struct > > ras_core_context *ras_core, > > enum ras_log_event event, void *data, struct > > ras_log_batch_tag *tag); > > > > int ras_log_ring_get_batch_records(struct ras_core_context *ras_core, > > uint64_t batch_idx, > > - struct ras_log_info **log_arr, uint32_t arr_num); > > + struct ras_log_info *log_arr, uint32_t arr_num); > > > > int ras_log_ring_get_batch_overview(struct ras_core_context *ras_core, > > struct ras_log_batch_overview *overview); > > -- > > 2.43.0 > >
