[AMD Official Use Only - AMD Internal Distribution Only]

Series is

Reviewed-by: Hawking Zhang <[email protected]>

Regards,
Hawking
-----Original Message-----
From: amd-gfx <[email protected]> On Behalf Of Tao Zhou
Sent: Wednesday, November 5, 2025 10:05
To: [email protected]
Cc: Zhou1, Tao <[email protected]>
Subject: [PATCH 8/8] drm/amdgpu: add RAS bad page threshold handling for PMFW manages eeprom

Check if bad page threshold is reached and take actions accordingly.

v2: remove rma message sent to smu when pmfw manages eeprom.
v3: add null pointer check for con.

Signed-off-by: Tao Zhou <[email protected]>
---
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 36 +++++++++++++++----
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index beb504cb4bfc..5768d5454a9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -903,6 +903,33 @@ int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *contro
        return ret;
 }

+static int amdgpu_ras_smu_eeprom_append(struct amdgpu_ras_eeprom_control *control)
+{
+       struct amdgpu_device *adev = to_amdgpu_device(control);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+       if (!amdgpu_ras_smu_eeprom_supported(adev) || !con)
+               return 0;
+
+       control->ras_num_bad_pages = con->bad_page_num;
+
+       if (amdgpu_bad_page_threshold != 0 &&
+           control->ras_num_bad_pages > con->bad_page_cnt_threshold) {
+               dev_warn(adev->dev,
+                       "Saved bad pages %d reaches threshold value %d\n",
+                       control->ras_num_bad_pages, con->bad_page_cnt_threshold);
+
+               if (adev->cper.enabled && amdgpu_cper_generate_bp_threshold_record(adev))
+                       dev_warn(adev->dev, "fail to generate bad page threshold cper records\n");
+
+               if ((amdgpu_bad_page_threshold != -1) &&
+                   (amdgpu_bad_page_threshold != -2))
+                       con->is_rma = true;
+       }
+
+       return 0;
+}
+
 /**
  * amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table
  * @control: pointer to control structure
@@ -921,17 +948,14 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
                             const u32 num)
 {
        struct amdgpu_device *adev = to_amdgpu_device(control);
-       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        int res, i;
        uint64_t nps = AMDGPU_NPS1_PARTITION_MODE;

-       if (!__is_ras_eeprom_supported(adev) || !con)
+       if (!__is_ras_eeprom_supported(adev))
                return 0;

-       if (amdgpu_ras_smu_eeprom_supported(adev)) {
-               control->ras_num_bad_pages = con->bad_page_num;
-               return 0;
-       }
+       if (amdgpu_ras_smu_eeprom_supported(adev))
+               return amdgpu_ras_smu_eeprom_append(control);

        if (num == 0) {
                dev_err(adev->dev, "will not append 0 records\n");
--
2.34.1

Reply via email to