AMD General
> -----Original Message-----
> From: Sun, Ce(Overlord) <[email protected]>
> Sent: Wednesday, June 10, 2026 11:38 AM
> To: [email protected]
> Cc: Zhang, Hawking <[email protected]>; Chai, Thomas
> <[email protected]>; Zhou1, Tao <[email protected]>; Sun, Ce(Overlord)
> <[email protected]>
> Subject: [PATCH] drm/amdgpu/ras: Implement check_bad_page_unlock for uniras
>
> Add check_bad_page_unlock() to ras_sys_func and rascore to support uniras bad
> page validation.
>
> Signed-off-by: Ce Sun <[email protected]>
> ---
> .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c | 3 ---
> .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c | 19 ++++++++++++++
> drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h | 3 +++
> drivers/gpu/drm/amd/ras/rascore/ras.h | 2 ++
> drivers/gpu/drm/amd/ras/rascore/ras_core.c | 10 +++++++
> drivers/gpu/drm/amd/ras/rascore/ras_umc.c | 26 +++++++++++++++----
> drivers/gpu/drm/amd/ras/rascore/ras_umc.h | 1 +
> .../gpu/drm/amd/ras/rascore/ras_umc_v12_0.c | 3 ++-
> 8 files changed, 58 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
> b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
> index cb6498c30834..473b387fa3db 100644
> --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
> +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
> @@ -30,9 +30,6 @@
> #include "amdgpu_ras_mgr.h"
> #include "amdgpu_virt_ras_cmd.h"
>
> -/* inject address is 52 bits */
> -#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
> -
> #define AMDGPU_RAS_TYPE_RASCORE 0x1
> #define AMDGPU_RAS_TYPE_AMDGPU 0x2
> #define AMDGPU_RAS_TYPE_VF 0x3
> diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> index 7d728e523604..cc6d571a5479 100644
> --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> @@ -266,6 +266,24 @@ static int amdgpu_ras_sys_put_gpu_mem(struct
> ras_core_context *ras_core,
>
> return 0;
> }
> +static int amdgpu_ras_sys_check_bad_page_unlock(struct ras_core_context
> *ras_core,
> + uint64_t addr)
> +{
> + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> + uint64_t pfn = addr >> AMDGPU_GPU_PAGE_SHIFT;
> +
> + if ((addr >= adev->gmc.mc_vram_size &&
> + adev->gmc.mc_vram_size) ||
> + (addr >= RAS_UMC_INJECT_ADDR_LIMIT))
> + return -EINVAL;
> +
> + if (pfn >= (adev->gmc.real_vram_size >> AMDGPU_GPU_PAGE_SHIFT)) {
[Tao] Why not compare directly with addr >= adev->gmc.real_vram_size?
> + RAS_DEV_WARN(ras_core->dev, "Recorded address out of range:
> 0x%llx!\n", addr);
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
>
> const struct ras_sys_func amdgpu_ras_sys_fn = {
> .ras_notifier = amdgpu_ras_sys_event_notifier, @@ -277,4 +295,5 @@
> const struct ras_sys_func amdgpu_ras_sys_fn = {
> .detect_ras_interrupt = amdgpu_ras_sys_detect_ras_interrupt,
> .get_gpu_mem = amdgpu_ras_sys_get_gpu_mem,
> .put_gpu_mem = amdgpu_ras_sys_put_gpu_mem,
> + .check_bad_page_unlock = amdgpu_ras_sys_check_bad_page_unlock,
> };
> diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> index 8156531a7b63..239e56732e3e 100644
> --- a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> +++ b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> @@ -30,6 +30,9 @@
> #include <linux/mempool.h>
> #include "amdgpu.h"
>
> +/* inject address is 52 bits */
> +#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
> +
> #define RAS_DEV_ERR(device, fmt, ...)
> \
> do {
> \
> if (device)
> \
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h
> b/drivers/gpu/drm/amd/ras/rascore/ras.h
> index 6449d7b8627d..6c3697de1f98 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras.h
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras.h
> @@ -231,6 +231,7 @@ struct ras_sys_func {
> enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
> int (*put_gpu_mem)(struct ras_core_context *ras_core,
> enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
> + int (*check_bad_page_unlock)(struct ras_core_context *ras_core,
> +uint64_t addr);
> };
>
> struct ras_ecc_count {
> @@ -399,4 +400,5 @@ int ras_core_get_device_system_info(struct
> ras_core_context *ras_core,
> struct device_system_info *dev_info); int
> ras_core_convert_soc_pa_to_cur_nps_pages(struct ras_core_context *ras_core,
> uint64_t soc_pa, uint64_t *page_pfn, uint32_t max_pages);
> +int ras_core_check_bad_page_unlock(struct ras_core_context *ras_core,
> +uint64_t addr);
> #endif
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_core.c
> b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
> index 29b1b8f0cc26..efd4023f133b 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_core.c
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
> @@ -676,3 +676,13 @@ int ras_core_convert_soc_pa_to_cur_nps_pages(struct
> ras_core_context *ras_core,
>
> return count;
> }
> +
> +int ras_core_check_bad_page_unlock(struct ras_core_context *ras_core,
> + uint64_t addr)
> +{
> + if (ras_core && ras_core->sys_fn &&
> + ras_core->sys_fn->check_bad_page_unlock)
> + return ras_core->sys_fn->check_bad_page_unlock(ras_core, addr);
> +
> + return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
> b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
> index d4072350f48f..7ff019a8c7a8 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
> @@ -373,7 +373,7 @@ static int ras_umc_update_eeprom_ram_data(struct
> ras_core_context *ras_core,
> struct ras_umc *ras_umc = &ras_core->ras_umc;
> struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data;
> uint64_t page_pfn[16];
> - int count = 0, j;
> + int count = 0, i, j;
>
> if (!data->space_left &&
> ras_umc_realloc_err_data_space(ras_core, data, 256)) { @@ -385,6
> +385,18 @@ static int ras_umc_update_eeprom_ram_data(struct ras_core_context
> *ras_core,
> bps, bps->cur_nps, page_pfn,
> ARRAY_SIZE(page_pfn));
> if (count > 0) {
> for (j = 0; j < count; j++) {
> + if (ras_core_check_bad_page_unlock(ras_core,
> + page_pfn[j] << AMDGPU_GPU_PAGE_SHIFT)) {
> +
> + for (i = 0; i < data->count; i++)
> + if (page_pfn[j] == data-
> >bps[i].cur_nps_retired_row_pfn)
> + break;
> + data->bps[data->count].cur_nps_retired_row_pfn =
> U64_MAX;
> + data->count++;
> + data->space_left--;
> + continue;
> + }
> +
> bps->cur_nps_retired_row_pfn = page_pfn[j];
> memcpy(&data->bps[data->count], bps,
> sizeof(*data->bps));
> data->count++;
> @@ -489,9 +501,11 @@ static int ras_umc_save_bad_pages(struct
> ras_core_context *ras_core) {
> struct ras_umc *ras_umc = &ras_core->ras_umc;
> struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data;
> - uint32_t eeprom_record_num;
> + struct eeprom_store_record *ram_data = &ras_umc-
> >umc_err_data.ram_data;
> + uint32_t eeprom_record_num, logical_count = 0;
> + uint32_t retire_unit = ras_core->ras_umc.retire_unit;
> int save_count;
> - int ret = 0;
> + int ret = 0, i;
>
> if (!data->bps)
> return 0;
> @@ -515,8 +529,10 @@ static int ras_umc_save_bad_pages(struct
> ras_core_context *ras_core)
> ret = -EIO;
> goto exit;
> }
> -
> - RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM
> table.\n", save_count);
> + for (i = ram_data->count - retire_unit; i < ram_data->count;
> i++)
> + if (ram_data->bps[i].cur_nps_retired_row_pfn != U64_MAX)
> + logical_count++;
[Tao] I would prefer to record the count when we update the bad pages instead of
traversing the page list a second time.
BTW, more than one retire_unit of pages can be saved at one time.
> + RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM
> table.\n",
> +logical_count);
> }
>
> exit:
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
> b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
> index 1d3026be509b..05edacc165ba 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
> @@ -139,6 +139,7 @@ struct ras_umc {
> u32 pending_ecc_count;
> /* number of entries dropped because pending_ecc_list was full */
> u32 pending_ecc_dropped;
> + u32 retire_unit;
> };
>
> /*
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
> b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
> index b809a2f21d73..0064e89ac1ab 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
> @@ -110,6 +110,7 @@ static void __get_nps_pa_flip_bits(struct ras_core_context
> *ras_core,
> "Unknown HBM type, set RAS retire flip bits to the
> value in
> NPS1 mode.\n");
> break;
> }
> + ras_core->ras_umc.retire_unit = 0x1 << flip_bits->bit_num;
> }
>
> static uint64_t convert_nps_pa_to_row_pa(struct ras_core_context *ras_core,
> @@
> -166,7 +167,7 @@ static int lookup_bad_pages_in_a_row(struct ras_core_context
> *ras_core,
>
> idx = 0;
> row = 0;
> - retire_unit = 0x1 << flip_bits.bit_num;
> + retire_unit = ras_core->ras_umc.retire_unit;
> /* loop for all possibilities of retire bits */
> for (column = 0; column < retire_unit; column++) {
> soc_pa = row_pa;
> --
> 2.34.1