amdgpu: add RAS bad page threshold handling

Zhou1, Tao Wed, 29 Oct 2025 04:42:42 -0700

[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Xie, Patrick <[email protected]>
> Sent: Wednesday, October 29, 2025 7:07 PM
> To: [email protected]
> Cc: Zhou1, Tao <[email protected]>
> Subject: RE: : [PATCH 8/8] drm/amdgpu: add RAS bad page threshold handling
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> -----Original Message-----
> From: amd-gfx <[email protected]> On Behalf Of amd-gfx-
> [email protected]
> Sent: Wednesday, October 29, 2025 6:54 PM
> To: [email protected]
> Subject: amd-gfx Digest, Vol 113, Issue 529
>
> Send amd-gfx mailing list submissions to
>         [email protected]
>
> To subscribe or unsubscribe via the World Wide Web, visit
>         https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> or, via email, send a message with subject or body 'help' to
>         [email protected]
>
> You can reach the person managing the list at
>         [email protected]
>
> When replying, please edit your Subject line so it is more specific than "Re: 
> Contents
> of amd-gfx digest..."
>
>
> Today's Topics:
>
>    1. [PATCH 8/8] drm/amdgpu: add RAS bad page threshold handling
>       for PMFW manages eeprom (Tao Zhou)
>    2. [PATCH 6/8] drm/amdgpu: get RAS bad page address from MCA
>       address (Tao Zhou)
>    3. Re: [PATCH 05/14] drm/amdgpu/vce: Clear VCPU BO before
>       copying firmware to it (Timur Krist?f)
>    4. Re: [PATCH 07/14] drm/amdgpu/si,cik,vi: Verify IP block when
>       querying video codecs (Timur Krist?f)
>
>
> ----------------------------------------------------------------------
>
> Message: 1
> Date: Wed, 29 Oct 2025 18:38:02 +0800
> From: Tao Zhou <[email protected]>
> To: <[email protected]>
> Cc: Tao Zhou <[email protected]>
> Subject: [PATCH 8/8] drm/amdgpu: add RAS bad page threshold handling
>         for PMFW manages eeprom
> Message-ID: <[email protected]>
> Content-Type: text/plain
>
> Check if bad page threshold is reached and take actions accordingly.
>
> Signed-off-by: Tao Zhou <[email protected]>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 37 ++++++++++++++++---
>  1 file changed, 32 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index de7b268a9862..0acf45d5fc54 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -896,6 +896,36 @@ int amdgpu_ras_eeprom_update_record_num(struct
> amdgpu_ras_eeprom_control *contro
>         return ret;
>  }
>
> +static int amdgpu_ras_smu_eeprom_append(struct
> +amdgpu_ras_eeprom_control *control) {
> +       struct amdgpu_device *adev = to_amdgpu_device(control);
> +       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +
> +       if (!amdgpu_ras_smu_eeprom_supported(adev))
> +               return 0;
> +
> +       control->ras_num_bad_pages = con->bad_page_num;
> +
> +       if (amdgpu_bad_page_threshold != 0 &&
> +           control->ras_num_bad_pages > con->bad_page_cnt_threshold) {
> +               dev_warn(adev->dev,
> +                       "Saved bad pages %d reaches threshold value %d\n",
> +                       control->ras_num_bad_pages,
> + con->bad_page_cnt_threshold);
> +
> +               if (adev->cper.enabled &&
> amdgpu_cper_generate_bp_threshold_record(adev))
> +                       dev_warn(adev->dev, "fail to generate bad page
> +threshold cper records\n");
> +
> +               if ((amdgpu_bad_page_threshold != -1) &&
> +                   (amdgpu_bad_page_threshold != -2))
> +                       con->is_rma = true;
> +
> +               /* ignore the -ENOTSUPP return value */
> +               amdgpu_dpm_send_rma_reason(adev);
> Patrick:
>         In pmfw managed eeprom, rma reason is not needed, so these two lines 
> should
> be removed.
>
> Best Regards,


[Tao] thanks for the reminder, will update it.

> +       }
> +
> +       return 0;
> +}
> +
>  /**
>   * amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table
>   * @control: pointer to control structure @@ -914,17 +944,14 @@ int
> amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
>                              const u32 num)  {
>         struct amdgpu_device *adev = to_amdgpu_device(control);
> -       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>         int res, i;
>         uint64_t nps = AMDGPU_NPS1_PARTITION_MODE;
>
>         if (!__is_ras_eeprom_supported(adev))
>                 return 0;
>
> -       if (amdgpu_ras_smu_eeprom_supported(adev)) {
> -               control->ras_num_bad_pages = con->bad_page_num;
> -               return 0;
> -       }
> +       if (amdgpu_ras_smu_eeprom_supported(adev))
> +               return amdgpu_ras_smu_eeprom_append(control);
>
>         if (num == 0) {
>                 dev_err(adev->dev, "will not append 0 records\n");
> --
> 2.34.1
>
>
>
> ------------------------------
>
> Message: 2
> Date: Wed, 29 Oct 2025 18:38:00 +0800
> From: Tao Zhou <[email protected]>
> To: <[email protected]>
> Cc: Tao Zhou <[email protected]>
> Subject: [PATCH 6/8] drm/amdgpu: get RAS bad page address from MCA
>         address
> Message-ID: <[email protected]>
> Content-Type: text/plain
>
> Instead of from physical address.
>
> v2: add comment to make the code more readable
>
> Signed-off-by: Tao Zhou <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c        | 15 ++++++++++++---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c |  4 ++--
>  2 files changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 23d421b8ba54..ad197486d9e7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3010,8 +3010,13 @@ static int amdgpu_ras_mca2pa_by_idx(struct
> amdgpu_device *adev,
>         addr_in.ma.err_addr = bps->address;
>         addr_in.ma.socket_id = socket;
>         addr_in.ma.ch_inst = bps->mem_channel;
> -       /* tell RAS TA the node instance is not used */
> -       addr_in.ma.node_inst = TA_RAS_INV_NODE;
> +       if (!amdgpu_ras_smu_eeprom_supported(adev)) {
> +               /* tell RAS TA the node instance is not used */
> +               addr_in.ma.node_inst = TA_RAS_INV_NODE;
> +       } else {
> +               addr_in.ma.umc_inst = bps->mcumc_id;
> +               addr_in.ma.node_inst = bps->cu;
> +       }
>
>         if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
>                 ret = adev->umc.ras->convert_ras_err_addr(adev, err_data, @@ 
> -3158,7
> +3163,11 @@ static int __amdgpu_ras_convert_rec_from_rom(struct
> amdgpu_device *adev,
>                 save_nps = (bps->retired_page >> UMC_NPS_SHIFT) &
> UMC_NPS_MASK;
>                 bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
>         } else {
> -               save_nps = nps;
> +               /* if pmfw manages eeprom, save_nps is not stored on eeprom,
> +                * we should always convert mca address into physical address,
> +                * make save_nps different from nps
> +                */
> +               save_nps = nps + 1;
>         }
>
>         if (save_nps == nps) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 3bf633158fa2..511c5882b37e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -1012,10 +1012,10 @@ int amdgpu_ras_eeprom_read_idx(struct
> amdgpu_ras_eeprom_control *control,
>                 record[i - rec_idx].retired_page = 0x1ULL;
>                 record[i - rec_idx].ts = ts;
>                 record[i - rec_idx].err_type =
> AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
> -               record[i - rec_idx].cu = 0;
>
>                 if (adev->umc.ras->mca_ipid_parse)
> -                       adev->umc.ras->mca_ipid_parse(adev, ipid, NULL,
> +                       adev->umc.ras->mca_ipid_parse(adev, ipid,
> +                               (uint32_t *)&(record[i - rec_idx].cu),
>                                 (uint32_t *)&(record[i - 
> rec_idx].mem_channel),
>                                 (uint32_t *)&(record[i - rec_idx].mcumc_id), 
> NULL);
>                 else
> --
> 2.34.1
>
>
>
> ------------------------------
>
> Message: 3
> Date: Wed, 29 Oct 2025 11:48:41 +0100
> From: Timur Krist?f <[email protected]>
> To: Christian K?nig <[email protected]>,
>         [email protected], Alex Deucher
>         <[email protected]>,  Alexandre Demers
>         <[email protected]>, Rodrigo Siqueira <[email protected]>
> Subject: Re: [PATCH 05/14] drm/amdgpu/vce: Clear VCPU BO before
>         copying firmware to it
> Message-ID: <[email protected]>
> Content-Type: text/plain; charset="UTF-8"
>
> On Wed, 2025-10-29 at 11:19 +0100, Christian K?nig wrote:
> > On 10/28/25 23:06, Timur Krist?f wrote:
> > > The VCPU BO doesn't only contain the VCE firmware but also other
> > > ranges that the VCE uses for its stack and data. Let's initialize
> > > this to zero to avoid having garbage in the VCPU BO.
> >
> > Absolutely clear NAK.
> >
> > This is intentionally not initialized on resume to avoid breaking
> > encode sessions which existed before suspend.
>
> How can there be encode sessions from before suspend?
> I think that there can't be.
>
> As far as I see, before suspend we wait for the VCE to go idle, meaning that 
> we wait
> for all pending work to finish.
> amdgpu_vce_suspend has a comment which says:
> suspending running encoding sessions isn't supported
>
> > Why exactly is that an issue?
>
> We need to clear at least some of the BO for the VCE1 firmware validation
> mechanism. This is done in a memset in vce_v1_0_load_fw in the old radeon 
> driver.
>
> Also I think it's a good idea to avoid having garbage in the VCPU BO.
>
> > The VCE FW BO should be cleared to zero after initial allocation?
>
> To clarify, are you suggesting that I move the memset to after the BO 
> creation, and
> then never clear it again? Or are you saying that amdgpu_bo_create_reserved
> already clears the BO?
>
> >
> > Regards,
> > Christian.
> >
> > >
> > > Fixes: d38ceaf99ed0 ("drm/amdgpu: add core driver (v4)")
> > > Signed-off-by: Timur Krist?f <[email protected]>
> > > ---
> > > ?drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 1 +
> > > ?1 file changed, 1 insertion(+)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > > index b9060bcd4806..eaa06dbef5c4 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > > @@ -310,6 +310,7 @@ int amdgpu_vce_resume(struct amdgpu_device
> > > *adev)
> > > ?   offset = le32_to_cpu(hdr->ucode_array_offset_bytes);
> > > ?
> > > ?   if (drm_dev_enter(adev_to_drm(adev), &idx)) {
> > > +           memset32(cpu_addr, 0, amdgpu_bo_size(adev-
> > > >vce.vcpu_bo) / 4);
> > > ?           memcpy_toio(cpu_addr, adev->vce.fw->data + offset,
> > > ?                   ??? adev->vce.fw->size - offset);
> > > ?           drm_dev_exit(idx);
>
>
> ------------------------------
>
> Message: 4
> Date: Wed, 29 Oct 2025 11:54:14 +0100
> From: Timur Krist?f <[email protected]>
> To: Christian K?nig <[email protected]>,
>         [email protected], Alex Deucher
>         <[email protected]>,  Alexandre Demers
>         <[email protected]>, Rodrigo Siqueira <[email protected]>
> Subject: Re: [PATCH 07/14] drm/amdgpu/si,cik,vi: Verify IP block when
>         querying video codecs
> Message-ID: <[email protected]>
> Content-Type: text/plain; charset="UTF-8"
>
> On Wed, 2025-10-29 at 11:35 +0100, Christian K?nig wrote:
> >
> >
> > On 10/28/25 23:06, Timur Krist?f wrote:
> > > Some harvested chips may not have any IP blocks, or we may not have
> > > the firmware for the IP blocks.
> > > In these cases, the query should return that no video codec is
> > > supported.
> > >
> > > Signed-off-by: Timur Krist?f <[email protected]>
> > > ---
> > > ?drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 3 ++-
> > > ?drivers/gpu/drm/amd/amdgpu/cik.c??????? | 6 ++++++
> > > ?drivers/gpu/drm/amd/amdgpu/si.c???????? | 6 ++++++
> > > ?drivers/gpu/drm/amd/amdgpu/vi.c???????? | 6 ++++++
> > > ?4 files changed, 20 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > > index b3e6b3fcdf2c..42b5da59d00f 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > > @@ -1263,7 +1263,8 @@ int amdgpu_info_ioctl(struct drm_device *dev,
> > > void *data, struct drm_file *filp)
> > > ?                   -EFAULT : 0;
> > > ?   }
> > > ?   case AMDGPU_INFO_VIDEO_CAPS: {
> > > -           const struct amdgpu_video_codecs *codecs;
> > > +           static const struct amdgpu_video_codecs no_codecs
> > > = {0};
> >
> > No zero init for static variables please, that will raise you a
> > constant checker warning.
> >
> > > +           const struct amdgpu_video_codecs *codecs =
> > > &no_codecs;
> > > ?           struct drm_amdgpu_info_video_caps *caps;
> > > ?           int r;
> > > ?
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c
> > > b/drivers/gpu/drm/amd/amdgpu/cik.c
> > > index 9cd63b4177bf..b755238c2c3d 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/cik.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/cik.c
> > > @@ -130,6 +130,12 @@ static const struct amdgpu_video_codecs
> > > cik_video_codecs_decode = ?static int cik_query_video_codecs(struct
> > > amdgpu_device *adev, bool encode,
> > > ?                           ? const struct amdgpu_video_codecs
> > > **codecs)
> > > ?{
> > > +   const enum amd_ip_block_type ip =
> > > +           encode ? AMD_IP_BLOCK_TYPE_VCE :
> > > AMD_IP_BLOCK_TYPE_UVD;
> > > +
> > > +   if (!amdgpu_device_ip_is_valid(adev, ip))
> > > +           return 0;
> >
> > I'm wondering if returning EOPNOTSUPP is not more appropriate here
> > than returning an empty cappability list.
>
> I don't think so.
>
> Returning EOPNOTSUPP would indicate that the operation of querying the codec
> support is not supported, and not that the list of supported codecs is empty.
>
> >
> > Anyway setting the codecs list to empty in the caller is rather bad
> > coding style.
>
> Sure, I'll come up with a better way to do this.
>
> >
> > Regards,
> > Christian.
> >
> > > +
> > > ?   switch (adev->asic_type) {
> > > ?   case CHIP_BONAIRE:
> > > ?   case CHIP_HAWAII:
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/si.c
> > > b/drivers/gpu/drm/amd/amdgpu/si.c index e0f139de7991..9468c03bdb1b
> > > 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/si.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/si.c
> > > @@ -1003,6 +1003,12 @@ static const struct amdgpu_video_codecs
> > > hainan_video_codecs_decode = ?static int
> > > si_query_video_codecs(struct amdgpu_device *adev, bool encode,
> > > ?                            const struct amdgpu_video_codecs
> > > **codecs)
> > > ?{
> > > +   const enum amd_ip_block_type ip =
> > > +           encode ? AMD_IP_BLOCK_TYPE_VCE :
> > > AMD_IP_BLOCK_TYPE_UVD;
> > > +
> > > +   if (!amdgpu_device_ip_is_valid(adev, ip))
> > > +           return 0;
> > > +
> > > ?   switch (adev->asic_type) {
> > > ?   case CHIP_VERDE:
> > > ?   case CHIP_TAHITI:
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c
> > > b/drivers/gpu/drm/amd/amdgpu/vi.c index a611a7345125..f0e4193cf722
> > > 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/vi.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
> > > @@ -256,6 +256,12 @@ static const struct amdgpu_video_codecs
> > > cz_video_codecs_decode = ?static int vi_query_video_codecs(struct
> > > amdgpu_device *adev, bool encode,
> > > ?                            const struct amdgpu_video_codecs
> > > **codecs)
> > > ?{
> > > +   const enum amd_ip_block_type ip =
> > > +           encode ? AMD_IP_BLOCK_TYPE_VCE :
> > > AMD_IP_BLOCK_TYPE_UVD;
> > > +
> > > +   if (!amdgpu_device_ip_is_valid(adev, ip))
> > > +           return 0;
> > > +
> > > ?   switch (adev->asic_type) {
> > > ?   case CHIP_TOPAZ:
> > > ?           if (encode)
>
>
> ------------------------------
>
> Subject: Digest Footer
>
> _______________________________________________
> amd-gfx mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>
> ------------------------------
>
> End of amd-gfx Digest, Vol 113, Issue 529
> *****************************************

RE: : [PATCH 8/8] drm/amdgpu: add RAS bad page threshold handling

Reply via email to