[AMD Official Use Only - AMD Internal Distribution Only]
Hi, Lijo
The feature is still under development. To avoid affecting functions that are
already in the driver, we keep it separate from the other code and it is
disabled by default. It will replace the original code when it is ready.
Thank you.
-----Original Message-----
From: Lazar, Lijo <[email protected]>
Sent: Friday, January 30, 2026 1:49 PM
To: Xie, Patrick <[email protected]>; [email protected]
Cc: Zhou1, Tao <[email protected]>; Chai, Thomas <[email protected]>; Wang,
Yang(Kevin) <[email protected]>
Subject: Re: [PATCH 06/14] drm/amd/ras: Add table reset func for pmfw eeprom
On 30-Jan-26 7:59 AM, Gangliang Xie wrote:
> add table reset func for pmfw eeprom, add smu eeprom control structure
>
> Signed-off-by: Gangliang Xie <[email protected]>
> ---
> drivers/gpu/drm/amd/ras/rascore/ras.h | 1 +
> drivers/gpu/drm/amd/ras/rascore/ras_cmd.c | 9 ++++--
> .../gpu/drm/amd/ras/rascore/ras_eeprom_fw.c | 29 +++++++++++++++++++
> .../gpu/drm/amd/ras/rascore/ras_eeprom_fw.h | 26 +++++++++++++++++
> 4 files changed, 63 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h
> b/drivers/gpu/drm/amd/ras/rascore/ras.h
> index 6e223eff522c..ae10d853c565 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras.h
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras.h
> @@ -313,6 +313,7 @@ struct ras_core_context {
>
> bool ras_eeprom_supported;
> struct ras_eeprom_control ras_eeprom;
> + struct ras_fw_eeprom_control ras_fw_eeprom;
>
> struct ras_psp ras_psp;
> struct ras_umc ras_umc;
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
> b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
> index 94e6d7420d94..4f89810d85a1 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
> @@ -146,8 +146,13 @@ static int ras_cmd_clear_bad_page_info(struct
> ras_core_context *ras_core,
> if (cmd->input_size != sizeof(struct ras_cmd_dev_handle))
> return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
>
> - if (ras_eeprom_reset_table(ras_core))
> - return RAS_CMD__ERROR_GENERIC;
> + if (ras_fw_eeprom_supported(ras_core)) {
> + if (ras_fw_eeprom_reset_table(ras_core))
> + return RAS_CMD__ERROR_GENERIC;
> + } else {
> + if (ras_eeprom_reset_table(ras_core))
> + return RAS_CMD__ERROR_GENERIC;
> + }
>
> if (ras_umc_clean_badpage_data(ras_core))
> return RAS_CMD__ERROR_GENERIC;
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
> b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
> index f880fc49477d..ae63e7394829 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
> @@ -161,3 +161,32 @@ int ras_fw_erase_ras_table(struct ras_core_context
> *ras_core,
> return sys_func->mp1_send_eeprom_msg(ras_core,
> RAS_SMU_EraseRasTable, 0, result);
> }
> +
> +int ras_fw_eeprom_reset_table(struct ras_core_context *ras_core) {
> + struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom;
> + u32 erase_res = 0;
> + int res;
> +
> + mutex_lock(&control->ras_tbl_mutex);
> +
> + res = ras_fw_erase_ras_table(ras_core, &erase_res);
Except for this call, everything else looks like common logic. For example,
num_recs, the bitmap, etc. look common to the eeprom as a whole and not
specific to the fw eeprom.
Thanks,
Lijo
> + if (res || erase_res) {
> + RAS_DEV_WARN(ras_core->dev, "RAS EEPROM reset failed, res:%d
> result:%d",
> + res,
> erase_res);
> + if (!res)
> + res = -EIO;
> + }
> +
> + control->ras_num_recs = 0;
> + control->bad_channel_bitmap = 0;
> + ras_core_event_notify(ras_core, RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM,
> + &control->ras_num_recs);
> + ras_core_event_notify(ras_core, RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP,
> + &control->bad_channel_bitmap);
> + control->update_channel_flag = false;
> +
> + mutex_unlock(&control->ras_tbl_mutex);
> +
> + return res;
> +}
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
> b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
> index 46f45e82a3f3..a1003db3c33b 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
> @@ -24,6 +24,31 @@
> #ifndef __RAS_EEPROM_FW_H__
> #define __RAS_EEPROM_FW_H__
>
> +struct ras_fw_eeprom_control {
> + uint32_t version;
> + /* record threshold */
> + int record_threshold_config;
> + uint32_t record_threshold_count;
> + bool update_channel_flag;
> +
> + /* Number of records in the table.
> + */
> + u32 ras_num_recs;
> +
> + /* Maximum possible number of records
> + * we could store, i.e. the maximum capacity
> + * of the table.
> + */
> + u32 ras_max_record_count;
> +
> + /* Protect table access via this mutex.
> + */
> + struct mutex ras_tbl_mutex;
> +
> + /* Record channel info which occurred bad pages
> + */
> + u32 bad_channel_bitmap;
> +};
>
> void ras_fw_init_feature_flags(struct ras_core_context *ras_core);
> bool ras_fw_eeprom_supported(struct ras_core_context *ras_core);
> @@ -41,5 +66,6 @@ int ras_fw_get_badpage_ipid(struct ras_core_context *ras_core,
> uint16_t index, uint64_t *ipid);
> int ras_fw_erase_ras_table(struct ras_core_context *ras_core,
> uint32_t *result);
> +int ras_fw_eeprom_reset_table(struct ras_core_context *ras_core);
>
> #endif