amdgpu: add RAS bad page threshold handling

Xie, Patrick Wed, 29 Oct 2025 04:06:43 -0700

[AMD Official Use Only - AMD Internal Distribution Only]

-----Original Message-----
From: amd-gfx <[email protected]> On Behalf Of 
[email protected]
Sent: Wednesday, October 29, 2025 6:54 PM
To: [email protected]
Subject: amd-gfx Digest, Vol 113, Issue 529


Send amd-gfx mailing list submissions to
        [email protected]

To subscribe or unsubscribe via the World Wide Web, visit
        https://lists.freedesktop.org/mailman/listinfo/amd-gfx
or, via email, send a message with subject or body 'help' to
        [email protected]

You can reach the person managing the list at
        [email protected]

When replying, please edit your Subject line so it is more specific than "Re: 
Contents of amd-gfx digest..."


Today's Topics:

   1. [PATCH 8/8] drm/amdgpu: add RAS bad page threshold handling
      for PMFW manages eeprom (Tao Zhou)
   2. [PATCH 6/8] drm/amdgpu: get RAS bad page address from MCA
      address (Tao Zhou)
   3. Re: [PATCH 05/14] drm/amdgpu/vce: Clear VCPU BO before
      copying firmware to it (Timur Kristóf)
   4. Re: [PATCH 07/14] drm/amdgpu/si,cik,vi: Verify IP block when
      querying video codecs (Timur Kristóf)


----------------------------------------------------------------------

Message: 1
Date: Wed, 29 Oct 2025 18:38:02 +0800
From: Tao Zhou <[email protected]>
To: <[email protected]>
Cc: Tao Zhou <[email protected]>
Subject: [PATCH 8/8] drm/amdgpu: add RAS bad page threshold handling
        for PMFW manages eeprom
Message-ID: <[email protected]>
Content-Type: text/plain

Check if bad page threshold is reached and take actions accordingly.

Signed-off-by: Tao Zhou <[email protected]>
---
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 37 ++++++++++++++++---
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index de7b268a9862..0acf45d5fc54 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -896,6 +896,36 @@ int amdgpu_ras_eeprom_update_record_num(struct 
amdgpu_ras_eeprom_control *contro
        return ret;
 }

+static int amdgpu_ras_smu_eeprom_append(struct
+amdgpu_ras_eeprom_control *control) {
+       struct amdgpu_device *adev = to_amdgpu_device(control);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+       if (!amdgpu_ras_smu_eeprom_supported(adev))
+               return 0;
+
+       control->ras_num_bad_pages = con->bad_page_num;
+
+       if (amdgpu_bad_page_threshold != 0 &&
+           control->ras_num_bad_pages > con->bad_page_cnt_threshold) {
+               dev_warn(adev->dev,
+                       "Saved bad pages %d reaches threshold value %d\n",
+                       control->ras_num_bad_pages, 
con->bad_page_cnt_threshold);
+
+               if (adev->cper.enabled && 
amdgpu_cper_generate_bp_threshold_record(adev))
+                       dev_warn(adev->dev, "fail to generate bad page 
threshold cper
+records\n");
+
+               if ((amdgpu_bad_page_threshold != -1) &&
+                   (amdgpu_bad_page_threshold != -2))
+                       con->is_rma = true;
+
+               /* ignore the -ENOTSUPP return value */
+               amdgpu_dpm_send_rma_reason(adev);
Patrick:
        With a PMFW-managed EEPROM the RMA reason does not need to be sent, so
the two lines above (the comment and the amdgpu_dpm_send_rma_reason() call)
should be removed.

Best Regards,
+       }
+
+       return 0;
+}
+
 /**
  * amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table
  * @control: pointer to control structure @@ -914,17 +944,14 @@ int 
amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
                             const u32 num)
 {
        struct amdgpu_device *adev = to_amdgpu_device(control);
-       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        int res, i;
        uint64_t nps = AMDGPU_NPS1_PARTITION_MODE;

        if (!__is_ras_eeprom_supported(adev))
                return 0;

-       if (amdgpu_ras_smu_eeprom_supported(adev)) {
-               control->ras_num_bad_pages = con->bad_page_num;
-               return 0;
-       }
+       if (amdgpu_ras_smu_eeprom_supported(adev))
+               return amdgpu_ras_smu_eeprom_append(control);

        if (num == 0) {
                dev_err(adev->dev, "will not append 0 records\n");
--
2.34.1



------------------------------

Message: 2
Date: Wed, 29 Oct 2025 18:38:00 +0800
From: Tao Zhou <[email protected]>
To: <[email protected]>
Cc: Tao Zhou <[email protected]>
Subject: [PATCH 6/8] drm/amdgpu: get RAS bad page address from MCA
        address
Message-ID: <[email protected]>
Content-Type: text/plain

Instead of from physical address.

v2: add comment to make the code more readable

Signed-off-by: Tao Zhou <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c        | 15 ++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c |  4 ++--
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 23d421b8ba54..ad197486d9e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3010,8 +3010,13 @@ static int amdgpu_ras_mca2pa_by_idx(struct amdgpu_device 
*adev,
        addr_in.ma.err_addr = bps->address;
        addr_in.ma.socket_id = socket;
        addr_in.ma.ch_inst = bps->mem_channel;
-       /* tell RAS TA the node instance is not used */
-       addr_in.ma.node_inst = TA_RAS_INV_NODE;
+       if (!amdgpu_ras_smu_eeprom_supported(adev)) {
+               /* tell RAS TA the node instance is not used */
+               addr_in.ma.node_inst = TA_RAS_INV_NODE;
+       } else {
+               addr_in.ma.umc_inst = bps->mcumc_id;
+               addr_in.ma.node_inst = bps->cu;
+       }

        if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
                ret = adev->umc.ras->convert_ras_err_addr(adev, err_data,
@@ -3158,7 +3163,11 @@ static int __amdgpu_ras_convert_rec_from_rom(struct 
amdgpu_device *adev,
                save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
                bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
        } else {
-               save_nps = nps;
+               /* if pmfw manages eeprom, save_nps is not stored on eeprom,
+                * we should always convert mca address into physical address,
+                * make save_nps different from nps
+                */
+               save_nps = nps + 1;
        }

        if (save_nps == nps) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 3bf633158fa2..511c5882b37e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -1012,10 +1012,10 @@ int amdgpu_ras_eeprom_read_idx(struct 
amdgpu_ras_eeprom_control *control,
                record[i - rec_idx].retired_page = 0x1ULL;
                record[i - rec_idx].ts = ts;
                record[i - rec_idx].err_type = 
AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
-               record[i - rec_idx].cu = 0;

                if (adev->umc.ras->mca_ipid_parse)
-                       adev->umc.ras->mca_ipid_parse(adev, ipid, NULL,
+                       adev->umc.ras->mca_ipid_parse(adev, ipid,
+                               (uint32_t *)&(record[i - rec_idx].cu),
                                (uint32_t *)&(record[i - rec_idx].mem_channel),
                                (uint32_t *)&(record[i - rec_idx].mcumc_id), 
NULL);
                else
--
2.34.1



------------------------------

Message: 3
Date: Wed, 29 Oct 2025 11:48:41 +0100
From: Timur Kristóf <[email protected]>
To: Christian König <[email protected]>,
        [email protected], Alex Deucher
        <[email protected]>,  Alexandre Demers
        <[email protected]>, Rodrigo Siqueira <[email protected]>
Subject: Re: [PATCH 05/14] drm/amdgpu/vce: Clear VCPU BO before
        copying firmware to it
Message-ID: <[email protected]>
Content-Type: text/plain; charset="UTF-8"

On Wed, 2025-10-29 at 11:19 +0100, Christian König wrote:
> On 10/28/25 23:06, Timur Kristóf wrote:
> > The VCPU BO doesn't only contain the VCE firmware but also other
> > ranges that the VCE uses for its stack and data. Let's initialize
> > this to zero to avoid having garbage in the VCPU BO.
>
> Absolutely clear NAK.
>
> This is intentionally not initialized on resume to avoid breaking
> encode sessions which existed before suspend.

How can there be encode sessions from before suspend?
I think that there can't be.

As far as I see, before suspend we wait for the VCE to go idle, meaning
that we wait for all pending work to finish.
amdgpu_vce_suspend has a comment which says:
suspending running encoding sessions isn't supported

> Why exactly is that an issue?

We need to clear at least some of the BO for the VCE1 firmware
validation mechanism. This is done in a memset in vce_v1_0_load_fw in
the old radeon driver.

Also I think it's a good idea to avoid having garbage in the VCPU BO.

> The VCE FW BO should be cleared to zero after initial allocation?

To clarify, are you suggesting that I move the memset to after the BO
creation, and then never clear it again? Or are you saying that
amdgpu_bo_create_reserved already clears the BO?

>
> Regards,
> Christian.
>
> >
> > Fixes: d38ceaf99ed0 ("drm/amdgpu: add core driver (v4)")
> > Signed-off-by: Timur Kristóf <[email protected]>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 1 +
> >  1 file changed, 1 insertion(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > index b9060bcd4806..eaa06dbef5c4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > @@ -310,6 +310,7 @@ int amdgpu_vce_resume(struct amdgpu_device
> > *adev)
> > ?   offset = le32_to_cpu(hdr->ucode_array_offset_bytes);
> > ?
> > ?   if (drm_dev_enter(adev_to_drm(adev), &idx)) {
> > +           memset32(cpu_addr, 0, amdgpu_bo_size(adev-
> > >vce.vcpu_bo) / 4);
> > ?           memcpy_toio(cpu_addr, adev->vce.fw->data + offset,
> > ?                   ??? adev->vce.fw->size - offset);
> > ?           drm_dev_exit(idx);


------------------------------

Message: 4
Date: Wed, 29 Oct 2025 11:54:14 +0100
From: Timur Kristóf <[email protected]>
To: Christian König <[email protected]>,
        [email protected], Alex Deucher
        <[email protected]>,  Alexandre Demers
        <[email protected]>, Rodrigo Siqueira <[email protected]>
Subject: Re: [PATCH 07/14] drm/amdgpu/si,cik,vi: Verify IP block when
        querying video codecs
Message-ID: <[email protected]>
Content-Type: text/plain; charset="UTF-8"

On Wed, 2025-10-29 at 11:35 +0100, Christian König wrote:
>
>
> On 10/28/25 23:06, Timur Kristóf wrote:
> > Some harvested chips may not have any IP blocks,
> > or we may not have the firmware for the IP blocks.
> > In these cases, the query should return that no video
> > codec is supported.
> >
> > Signed-off-by: Timur Kristóf <[email protected]>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 3 ++-
> >  drivers/gpu/drm/amd/amdgpu/cik.c        | 6 ++++++
> >  drivers/gpu/drm/amd/amdgpu/si.c         | 6 ++++++
> >  drivers/gpu/drm/amd/amdgpu/vi.c         | 6 ++++++
> >  4 files changed, 20 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > index b3e6b3fcdf2c..42b5da59d00f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > @@ -1263,7 +1263,8 @@ int amdgpu_info_ioctl(struct drm_device *dev,
> > void *data, struct drm_file *filp)
> > ?                   -EFAULT : 0;
> > ?   }
> > ?   case AMDGPU_INFO_VIDEO_CAPS: {
> > -           const struct amdgpu_video_codecs *codecs;
> > +           static const struct amdgpu_video_codecs no_codecs
> > = {0};
>
> No zero init for static variables please, that will raise you a
> constant checker warning.
>
> > +           const struct amdgpu_video_codecs *codecs =
> > &no_codecs;
> > ?           struct drm_amdgpu_info_video_caps *caps;
> > ?           int r;
> > ?
> > diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c
> > b/drivers/gpu/drm/amd/amdgpu/cik.c
> > index 9cd63b4177bf..b755238c2c3d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/cik.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/cik.c
> > @@ -130,6 +130,12 @@ static const struct amdgpu_video_codecs
> > cik_video_codecs_decode =
> > ?static int cik_query_video_codecs(struct amdgpu_device *adev, bool
> > encode,
> > ?                           ? const struct amdgpu_video_codecs
> > **codecs)
> > ?{
> > +   const enum amd_ip_block_type ip =
> > +           encode ? AMD_IP_BLOCK_TYPE_VCE :
> > AMD_IP_BLOCK_TYPE_UVD;
> > +
> > +   if (!amdgpu_device_ip_is_valid(adev, ip))
> > +           return 0;
>
> I'm wondering if returning EOPNOTSUPP is not more appropriate here
> than returning an empty capability list.

I don't think so.

Returning EOPNOTSUPP would indicate that the operation of querying the
codec support is not supported, and not that the list of supported
codecs is empty.

>
> Anyway setting the codecs list to empty in the caller is rather bad
> coding style.

Sure, I'll come up with a better way to do this.

>
> Regards,
> Christian.
>
> > +
> > ?   switch (adev->asic_type) {
> > ?   case CHIP_BONAIRE:
> > ?   case CHIP_HAWAII:
> > diff --git a/drivers/gpu/drm/amd/amdgpu/si.c
> > b/drivers/gpu/drm/amd/amdgpu/si.c
> > index e0f139de7991..9468c03bdb1b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/si.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/si.c
> > @@ -1003,6 +1003,12 @@ static const struct amdgpu_video_codecs
> > hainan_video_codecs_decode =
> > ?static int si_query_video_codecs(struct amdgpu_device *adev, bool
> > encode,
> > ?                            const struct amdgpu_video_codecs
> > **codecs)
> > ?{
> > +   const enum amd_ip_block_type ip =
> > +           encode ? AMD_IP_BLOCK_TYPE_VCE :
> > AMD_IP_BLOCK_TYPE_UVD;
> > +
> > +   if (!amdgpu_device_ip_is_valid(adev, ip))
> > +           return 0;
> > +
> > ?   switch (adev->asic_type) {
> > ?   case CHIP_VERDE:
> > ?   case CHIP_TAHITI:
> > diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c
> > b/drivers/gpu/drm/amd/amdgpu/vi.c
> > index a611a7345125..f0e4193cf722 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/vi.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
> > @@ -256,6 +256,12 @@ static const struct amdgpu_video_codecs
> > cz_video_codecs_decode =
> > ?static int vi_query_video_codecs(struct amdgpu_device *adev, bool
> > encode,
> > ?                            const struct amdgpu_video_codecs
> > **codecs)
> > ?{
> > +   const enum amd_ip_block_type ip =
> > +           encode ? AMD_IP_BLOCK_TYPE_VCE :
> > AMD_IP_BLOCK_TYPE_UVD;
> > +
> > +   if (!amdgpu_device_ip_is_valid(adev, ip))
> > +           return 0;
> > +
> > ?   switch (adev->asic_type) {
> > ?   case CHIP_TOPAZ:
> > ?           if (encode)


------------------------------

Subject: Digest Footer

_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


------------------------------

End of amd-gfx Digest, Vol 113, Issue 529
*****************************************

RE: [PATCH 8/8] drm/amdgpu: add RAS bad page threshold handling

Reply via email to