AMD General

Best Regards,
Thomas
-----Original Message-----
From: Sun, Ce(Overlord) <[email protected]>
Sent: Wednesday, June 10, 2026 11:38 AM
To: [email protected]
Cc: Zhang, Hawking <[email protected]>; Chai, Thomas <[email protected]>; 
Zhou1, Tao <[email protected]>; Sun, Ce(Overlord) <[email protected]>
Subject: [PATCH] drm/amdgpu/ras: Implement check_bad_page_unlock for uniras

Add check_bad_page_unlock() to ras_sys_func and rascore to support uniras bad 
page validation.

Signed-off-by: Ce Sun <[email protected]>
---
 .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c  |  3 ---  
.../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c  | 19 ++++++++++++++
 drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h     |  3 +++
 drivers/gpu/drm/amd/ras/rascore/ras.h         |  2 ++
 drivers/gpu/drm/amd/ras/rascore/ras_core.c    | 10 +++++++
 drivers/gpu/drm/amd/ras/rascore/ras_umc.c     | 26 +++++++++++++++----
 drivers/gpu/drm/amd/ras/rascore/ras_umc.h     |  1 +
 .../gpu/drm/amd/ras/rascore/ras_umc_v12_0.c   |  3 ++-
 8 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c 
b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
index cb6498c30834..473b387fa3db 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
@@ -30,9 +30,6 @@
 #include "amdgpu_ras_mgr.h"
 #include "amdgpu_virt_ras_cmd.h"

-/* inject address is 52 bits */
-#define        RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
-
 #define AMDGPU_RAS_TYPE_RASCORE  0x1
 #define AMDGPU_RAS_TYPE_AMDGPU   0x2
 #define AMDGPU_RAS_TYPE_VF       0x3
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c 
b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
index 7d728e523604..cc6d571a5479 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
@@ -266,6 +266,24 @@ static int amdgpu_ras_sys_put_gpu_mem(struct 
ras_core_context *ras_core,

        return 0;
 }
+static int amdgpu_ras_sys_check_bad_page_unlock(struct ras_core_context 
*ras_core,
+                                               uint64_t addr)


[Thomas] Suggest renaming xxx_check_bad_page_unlock to xxx_check_address_sanity
for clarity. The same applies to the other occurrences below.

+{
+       struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+       uint64_t pfn = addr >> AMDGPU_GPU_PAGE_SHIFT;
+
+       if ((addr >= adev->gmc.mc_vram_size &&
+           adev->gmc.mc_vram_size) ||
+           (addr >= RAS_UMC_INJECT_ADDR_LIMIT))

[Thomas]  Different UMC IPs may have different address ranges, so this check 
should ideally be moved into the corresponding UMC IP file.

+               return -EINVAL;
+
+       if (pfn >= (adev->gmc.real_vram_size >> AMDGPU_GPU_PAGE_SHIFT)) {
+               RAS_DEV_WARN(ras_core->dev, "Recorded address out of range: 
0x%llx!\n", addr);
+               return -EINVAL;
+       }
+
+       return 0;
+}

 const struct ras_sys_func amdgpu_ras_sys_fn = {
        .ras_notifier = amdgpu_ras_sys_event_notifier,
@@ -277,4 +295,5 @@ const struct ras_sys_func amdgpu_ras_sys_fn = {
        .detect_ras_interrupt = amdgpu_ras_sys_detect_ras_interrupt,
        .get_gpu_mem = amdgpu_ras_sys_get_gpu_mem,
        .put_gpu_mem = amdgpu_ras_sys_put_gpu_mem,
+       .check_bad_page_unlock = amdgpu_ras_sys_check_bad_page_unlock,
 };
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h 
b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
index 8156531a7b63..239e56732e3e 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
@@ -30,6 +30,9 @@
 #include <linux/mempool.h>
 #include "amdgpu.h"

+/* inject address is 52 bits */
+#define RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
+
 #define RAS_DEV_ERR(device, fmt, ...)                                          
     \
        do {                                                                    
  \
                if (device)                                                     
        \
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h 
b/drivers/gpu/drm/amd/ras/rascore/ras.h
index 6449d7b8627d..6c3697de1f98 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras.h
@@ -231,6 +231,7 @@ struct ras_sys_func {
                enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
        int (*put_gpu_mem)(struct ras_core_context *ras_core,
                enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
+       int (*check_bad_page_unlock)(struct ras_core_context *ras_core,
+uint64_t addr);
 };

 struct ras_ecc_count {
@@ -399,4 +400,5 @@ int ras_core_get_device_system_info(struct ras_core_context 
*ras_core,
                struct device_system_info *dev_info);  int 
ras_core_convert_soc_pa_to_cur_nps_pages(struct ras_core_context *ras_core,
                uint64_t soc_pa, uint64_t *page_pfn, uint32_t max_pages);
+int ras_core_check_bad_page_unlock(struct ras_core_context *ras_core,
+uint64_t addr);
 #endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_core.c 
b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
index 29b1b8f0cc26..efd4023f133b 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_core.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
@@ -676,3 +676,13 @@ int ras_core_convert_soc_pa_to_cur_nps_pages(struct 
ras_core_context *ras_core,

        return count;
 }
+
+int ras_core_check_bad_page_unlock(struct ras_core_context *ras_core,
+               uint64_t addr)
+{
+       if (ras_core && ras_core->sys_fn &&
+               ras_core->sys_fn->check_bad_page_unlock)
+               return ras_core->sys_fn->check_bad_page_unlock(ras_core, addr);
+
+       return 0;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c 
b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
index d4072350f48f..7ff019a8c7a8 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
@@ -373,7 +373,7 @@ static int ras_umc_update_eeprom_ram_data(struct 
ras_core_context *ras_core,
        struct ras_umc *ras_umc = &ras_core->ras_umc;
        struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data;
        uint64_t page_pfn[16];
-       int count = 0, j;
+       int count = 0, i, j;

        if (!data->space_left &&
                ras_umc_realloc_err_data_space(ras_core, data, 256)) {
@@ -385,6 +385,18 @@ static int ras_umc_update_eeprom_ram_data(struct ras_core_context *ras_core,
                                        bps, bps->cur_nps, page_pfn, 
ARRAY_SIZE(page_pfn));
        if (count > 0) {
                for (j = 0; j < count; j++) {
+                       if (ras_core_check_bad_page_unlock(ras_core,
+                               page_pfn[j] << AMDGPU_GPU_PAGE_SHIFT)) {
+
+                               for (i = 0; i < data->count; i++)
+                                       if (page_pfn[j] == 
data->bps[i].cur_nps_retired_row_pfn)
+                                               break;
+                               data->bps[data->count].cur_nps_retired_row_pfn 
= U64_MAX;
+                               data->count++;
+                               data->space_left--;
+                               continue;
+                       }
+

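[Thomas] As we discussed offline, let's see if this can be further optimized;
for example, the inner loop that searches data->bps for a matching
cur_nps_retired_row_pfn breaks on a match, but its result is never used.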

                        bps->cur_nps_retired_row_pfn = page_pfn[j];
                        memcpy(&data->bps[data->count], bps, 
sizeof(*data->bps));
                        data->count++;
@@ -489,9 +501,11 @@ static int ras_umc_save_bad_pages(struct ras_core_context *ras_core)
 {
        struct ras_umc *ras_umc = &ras_core->ras_umc;
        struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data;
-       uint32_t eeprom_record_num;
+       struct eeprom_store_record *ram_data = &ras_umc->umc_err_data.ram_data;
+       uint32_t eeprom_record_num, logical_count = 0;
+       uint32_t retire_unit = ras_core->ras_umc.retire_unit;
        int save_count;
-       int ret = 0;
+       int ret = 0, i;

        if (!data->bps)
                return 0;
@@ -515,8 +529,10 @@ static int ras_umc_save_bad_pages(struct ras_core_context 
*ras_core)
                        ret = -EIO;
                        goto exit;
                }
-
-               RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM 
table.\n", save_count);
+               for (i = ram_data->count - retire_unit; i < ram_data->count; 
i++)
+                       if (ram_data->bps[i].cur_nps_retired_row_pfn != U64_MAX)
+                               logical_count++;
+               RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM table.\n",
+logical_count);
[Thomas] As we discussed offline, let's see if this can be further optimized.
        }

 exit:
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h 
b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
index 1d3026be509b..05edacc165ba 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
@@ -139,6 +139,7 @@ struct ras_umc {
        u32 pending_ecc_count;
        /* number of entries dropped because pending_ecc_list was full */
        u32 pending_ecc_dropped;
+       u32 retire_unit;
 };

 /*
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c 
b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
index b809a2f21d73..0064e89ac1ab 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
@@ -110,6 +110,7 @@ static void __get_nps_pa_flip_bits(struct ras_core_context 
*ras_core,
                        "Unknown HBM type, set RAS retire flip bits to the 
value in NPS1 mode.\n");
                break;
        }
+       ras_core->ras_umc.retire_unit = 0x1 << flip_bits->bit_num;
 }

 static uint64_t  convert_nps_pa_to_row_pa(struct ras_core_context *ras_core, 
@@ -166,7 +167,7 @@ static int lookup_bad_pages_in_a_row(struct 
ras_core_context *ras_core,

        idx = 0;
        row = 0;
-       retire_unit = 0x1 << flip_bits.bit_num;
+       retire_unit = ras_core->ras_umc.retire_unit;
        /* loop for all possibilities of retire bits */
        for (column = 0; column < retire_unit; column++) {
                soc_pa = row_pa;
--
2.34.1

Reply via email to