On 30-Jan-26 7:59 AM, Gangliang Xie wrote:
Add a read function for the PMFW EEPROM, and adapt the address
conversion for bad pages loaded from the PMFW EEPROM.
Signed-off-by: Tao Zhou <[email protected]>
Signed-off-by: Gangliang Xie <[email protected]>
---
drivers/gpu/drm/amd/ras/rascore/ras.h | 1 +
drivers/gpu/drm/amd/ras/rascore/ras_core.c | 5 +-
.../gpu/drm/amd/ras/rascore/ras_eeprom_fw.c | 70 +++++++++++++++++++
.../gpu/drm/amd/ras/rascore/ras_eeprom_fw.h | 5 ++
drivers/gpu/drm/amd/ras/rascore/ras_umc.c | 27 +++++--
.../gpu/drm/amd/ras/rascore/ras_umc_v12_0.c | 2 +-
6 files changed, 101 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h
b/drivers/gpu/drm/amd/ras/rascore/ras.h
index ae10d853c565..05c7923e8f0f 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras.h
@@ -241,6 +241,7 @@ struct ras_bank_ecc {
uint64_t status;
uint64_t ipid;
uint64_t addr;
+ uint64_t ts;
};
struct ras_bank_ecc_node {
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_core.c
b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
index 1f2ce3749d43..fe188a5304d9 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_core.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
@@ -239,7 +239,10 @@ static int ras_core_eeprom_recovery(struct
ras_core_context *ras_core)
int count;
int ret;
- count = ras_eeprom_get_record_count(ras_core);
+ if (ras_fw_eeprom_supported(ras_core))
+ count = ras_fw_eeprom_get_record_count(ras_core);
As mentioned on the earlier patches, my suggestion is to keep just
ras_eeprom_get_record_count() as the single entry point.
Inside ras_eeprom, you can then decide on the access mechanism and fork to
the different paths. That looks cleaner, and all common variables can be
kept inside ras_eeprom itself.
+ else
+ count = ras_eeprom_get_record_count(ras_core);
if (!count)
return 0;
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
index 580dd7b09d00..79494ad16ee5 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
@@ -259,3 +259,73 @@ int ras_fw_eeprom_append(struct ras_core_context *ras_core,
mutex_unlock(&control->ras_tbl_mutex);
return 0;
}
+
+int ras_fw_eeprom_read_idx(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *record_umc,
+ struct ras_bank_ecc *ras_ecc,
+ u32 rec_idx, const u32 num)
+{
+ struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom;
+ int i, ret, end_idx;
+ u64 mca, ipid, ts;
+
+ if (!ras_core->ras_umc.ip_func ||
+ !ras_core->ras_umc.ip_func->mca_ipid_parse)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&control->ras_tbl_mutex);
+
+ end_idx = rec_idx + num;
+ for (i = rec_idx; i < end_idx; i++) {
+ ret = ras_fw_get_badpage_mca_addr(ras_core, i, &mca);
+ if (ret)
+ goto out;
+
+ ret = ras_fw_get_badpage_ipid(ras_core, i, &ipid);
+ if (ret)
+ goto out;
+
+ ret = ras_fw_get_timestamp(ras_core, i, &ts);
+ if (ret)
+ goto out;
+
+ if (record_umc) {
+ record_umc[i - rec_idx].address = mca;
+ /* retired_page (pa) is unused now */
+ record_umc[i - rec_idx].retired_row_pfn = 0x1ULL;
+ record_umc[i - rec_idx].ts = ts;
+ record_umc[i - rec_idx].err_type =
RAS_EEPROM_ERR_NON_RECOVERABLE;
+
+ ras_core->ras_umc.ip_func->mca_ipid_parse(ras_core,
ipid,
+ (uint32_t *)&(record_umc[i - rec_idx].cu),
+ (uint32_t *)&(record_umc[i -
rec_idx].mem_channel),
+ (uint32_t *)&(record_umc[i -
rec_idx].mcumc_id), NULL);
+
+ /* update bad channel bitmap */
+ if ((record_umc[i - rec_idx].mem_channel <
BITS_PER_TYPE(control->bad_channel_bitmap)) &&
+ !(control->bad_channel_bitmap & (1 <<
record_umc[i - rec_idx].mem_channel))) {
+ control->bad_channel_bitmap |= 1 <<
record_umc[i - rec_idx].mem_channel;
+ control->update_channel_flag = true;
+ }
+ }
+
+ if (ras_ecc) {
+ ras_ecc[i - rec_idx].addr = mca;
+ ras_ecc[i - rec_idx].ipid = ipid;
+ ras_ecc[i - rec_idx].ts = ts;
+ }
+
+ }
+
+out:
+ mutex_unlock(&control->ras_tbl_mutex);
+ return ret;
+}
+
+uint32_t ras_fw_eeprom_get_record_count(struct ras_core_context *ras_core)
+{
+ if (!ras_core)
+ return 0;
+
+ return ras_core->ras_fw_eeprom.ras_num_recs;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
index b94d3c9703e3..353977a2371e 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
@@ -70,5 +70,10 @@ int ras_fw_eeprom_reset_table(struct ras_core_context
*ras_core);
bool ras_fw_eeprom_check_safety_watermark(struct ras_core_context *ras_core);
int ras_fw_eeprom_append(struct ras_core_context *ras_core,
struct eeprom_umc_record *record, const u32 num);
+int ras_fw_eeprom_read_idx(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *record_umc,
+ struct ras_bank_ecc *ras_ecc,
+ u32 rec_idx, const u32 num);
+uint32_t ras_fw_eeprom_get_record_count(struct ras_core_context *ras_core);
#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
index fd427fd59ecf..eb5bb6df18f5 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
@@ -436,17 +436,27 @@ int ras_umc_load_bad_pages(struct ras_core_context
*ras_core)
uint32_t ras_num_recs;
int ret;
- ras_num_recs = ras_eeprom_get_record_count(ras_core);
- /* no bad page record, skip eeprom access */
- if (!ras_num_recs ||
- ras_core->ras_eeprom.record_threshold_config == DISABLE_RETIRE_PAGE)
- return 0;
+ if (ras_fw_eeprom_supported(ras_core)) {
+ ras_num_recs = ras_fw_eeprom_get_record_count(ras_core);
+ /* no bad page record, skip eeprom access */
+ if (!ras_num_recs ||
+ ras_core->ras_fw_eeprom.record_threshold_config ==
DISABLE_RETIRE_PAGE)
+ return 0;
+ } else {
+ ras_num_recs = ras_eeprom_get_record_count(ras_core);
+ if (!ras_num_recs ||
+ ras_core->ras_eeprom.record_threshold_config ==
DISABLE_RETIRE_PAGE)
This is an example where common variables/logic get repeated: both branches perform the same record-count and record_threshold_config check.
Thanks,
Lijo
+ return 0;
+ }
bps = kcalloc(ras_num_recs, sizeof(*bps), GFP_KERNEL);
if (!bps)
return -ENOMEM;
- ret = ras_eeprom_read(ras_core, bps, ras_num_recs);
+ if (ras_fw_eeprom_supported(ras_core))
+ ret = ras_fw_eeprom_read_idx(ras_core, bps, 0, 0, ras_num_recs);
+ else
+ ret = ras_eeprom_read(ras_core, bps, ras_num_recs);
if (ret) {
RAS_DEV_ERR(ras_core->dev, "Failed to load EEPROM table
records!");
} else {
@@ -474,7 +484,10 @@ static int ras_umc_save_bad_pages(struct ras_core_context
*ras_core)
if (!data->bps)
return 0;
- eeprom_record_num = ras_eeprom_get_record_count(ras_core);
+ if (ras_fw_eeprom_supported(ras_core))
+ eeprom_record_num = ras_fw_eeprom_get_record_count(ras_core);
+ else
+ eeprom_record_num = ras_eeprom_get_record_count(ras_core);
mutex_lock(&ras_umc->umc_lock);
save_count = data->count - eeprom_record_num;
/* only new entries are saved */
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
index e2792b239bea..53dc59e4de0c 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
@@ -413,7 +413,7 @@ static int umc_v12_0_eeprom_record_to_nps_record(struct
ras_core_context *ras_co
uint64_t pa = 0;
int ret = 0;
- if (nps == EEPROM_RECORD_UMC_NPS_MODE(record)) {
+ if (nps == EEPROM_RECORD_UMC_NPS_MODE(record) &&
!ras_fw_eeprom_supported(ras_core)) {
record->cur_nps_retired_row_pfn =
EEPROM_RECORD_UMC_ADDR_PFN(record);
} else {
ret = convert_eeprom_record_to_nps_addr(ras_core,