amdgpu: add ras_eeprom_read_idx interface

Zhou1, Tao Tue, 04 Nov 2025 22:47:21 -0800

[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Wang, Yang(Kevin) <[email protected]>
> Sent: Wednesday, November 5, 2025 11:18 AM
> To: Zhou1, Tao <[email protected]>; [email protected]
> Cc: Zhou1, Tao <[email protected]>
> Subject: RE: [PATCH 2/8] drm/amdgpu: add ras_eeprom_read_idx interface
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> +               if (adev->umc.ras->mca_ipid_parse)
> +                       adev->umc.ras->mca_ipid_parse(adev, ipid, NULL,
> +                               (uint32_t *)&(record[i - 
> rec_idx].mem_channel),
> +                               (uint32_t *)&(record[i - rec_idx].mcumc_id), 
> NULL);
> +               else
> +                       return -EOPNOTSUPP;
>
>
> It is better to remove the null pointer check from the loop and perform this
> check early, before the loop, to avoid unnecessary operations. With that
> fixed, the patch is
>
> Reviewed-by: Yang Wang <[email protected]>
>
> Best Regards,
> Kevin


[Tao] Thanks for the suggestion; I will update it.

>
> -----Original Message-----
> From: amd-gfx <[email protected]> On Behalf Of Tao Zhou
> Sent: Wednesday, November 5, 2025 10:05
> To: [email protected]
> Cc: Zhou1, Tao <[email protected]>
> Subject: [PATCH 2/8] drm/amdgpu: add ras_eeprom_read_idx interface
>
> PMFW will manage RAS eeprom data by itself. Add a new interface to read eeprom
> data via PMFW; a subset of the records can be read by setting the index.
>
> v2: use IPID parse interface.
>     pa is not used, so set it to a fixed value.
>
> Signed-off-by: Tao Zhou <[email protected]>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 51 +++++++++++++++++++
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  4 ++
>  2 files changed, 55 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index d7e2a81bc274..47f292557a0c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -970,6 +970,50 @@ static int __amdgpu_ras_eeprom_read(struct
> amdgpu_ras_eeprom_control *control,
>         return res;
>  }
>
> +int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
> +                       struct eeprom_table_record *record, u32 rec_idx,
> +                       const u32 num)
> +{
> +       struct amdgpu_device *adev = to_amdgpu_device(control);
> +       uint64_t ts, end_idx;
> +       int i, ret;
> +       u64 mca, ipid;
> +
> +       if (!amdgpu_ras_smu_eeprom_supported(adev))
> +               return 0;
> +
> +       end_idx = rec_idx + num;
> +       for (i = rec_idx; i < end_idx; i++) {
> +               ret = amdgpu_ras_smu_get_badpage_mca_addr(adev, i, &mca);
> +               if (ret)
> +                       return ret;
> +
> +               ret = amdgpu_ras_smu_get_badpage_ipid(adev, i, &ipid);
> +               if (ret)
> +                       return ret;
> +
> +               ret = amdgpu_ras_smu_get_timestamp(adev, i, &ts);
> +               if (ret)
> +                       return ret;
> +
> +               record[i - rec_idx].address = mca;
> +               /* retired_page (pa) is unused now */
> +               record[i - rec_idx].retired_page = 0x1ULL;
> +               record[i - rec_idx].ts = ts;
> +               record[i - rec_idx].err_type =
> AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
> +               record[i - rec_idx].cu = 0;
> +
> +               if (adev->umc.ras->mca_ipid_parse)
> +                       adev->umc.ras->mca_ipid_parse(adev, ipid, NULL,
> +                               (uint32_t *)&(record[i - 
> rec_idx].mem_channel),
> +                               (uint32_t *)&(record[i - rec_idx].mcumc_id), 
> NULL);
> +               else
> +                       return -EOPNOTSUPP;
> +       }
> +
> +       return 0;
> +}
> +
>  /**
>   * amdgpu_ras_eeprom_read -- read EEPROM
>   * @control: pointer to control structure @@ -991,6 +1035,9 @@ int
> amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
>         u8 *buf, *pp;
>         u32 g0, g1;
>
> +       if (amdgpu_ras_smu_eeprom_supported(adev))
> +               return amdgpu_ras_eeprom_read_idx(control, record, 0,
> + num);
> +
>         if (!__is_ras_eeprom_supported(adev))
>                 return 0;
>
> @@ -1162,6 +1209,10 @@ static ssize_t amdgpu_ras_debugfs_table_read(struct
> file *f, char __user *buf,
>         int res = -EFAULT;
>         size_t data_len;
>
> +       /* pmfw manages eeprom data by itself */
> +       if (amdgpu_ras_smu_eeprom_supported(adev))
> +               return 0;
> +
>         mutex_lock(&control->ras_tbl_mutex);
>
>         /* We want *pos - data_len > 0, which means there's diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index cfbd402ddea2..e881007f715b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -186,6 +186,10 @@ int amdgpu_ras_smu_get_badpage_ipid(struct
> amdgpu_device *adev,  int amdgpu_ras_smu_erase_ras_table(struct
> amdgpu_device *adev,
>                                                                         
> uint32_t *result);
>
> +int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
> +                       struct eeprom_table_record *record, u32 rec_idx,
> +                       const u32 num);
> +
>  extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
>  extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
>
> --
> 2.34.1
>

RE: [PATCH 2/8] drm/amdgpu: add ras_eeprom_read_idx interface

Reply via email to