AMD General

> -----Original Message-----
> From: Sun, Ce(Overlord) <[email protected]>
> Sent: Wednesday, June 10, 2026 11:38 AM
> To: [email protected]
> Cc: Zhang, Hawking <[email protected]>; Chai, Thomas
> <[email protected]>; Zhou1, Tao <[email protected]>; Sun, Ce(Overlord)
> <[email protected]>
> Subject: [PATCH] drm/amdgpu/ras: Implement check_bad_page_unlock for uniras
>
> Add check_bad_page_unlock() to ras_sys_func and rascore to support uniras bad
> page validation.
>
> Signed-off-by: Ce Sun <[email protected]>
> ---
>  .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c  |  3 ---
>   .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c  | 19 ++++++++++++++
>  drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h     |  3 +++
>  drivers/gpu/drm/amd/ras/rascore/ras.h         |  2 ++
>  drivers/gpu/drm/amd/ras/rascore/ras_core.c    | 10 +++++++
>  drivers/gpu/drm/amd/ras/rascore/ras_umc.c     | 26 +++++++++++++++----
>  drivers/gpu/drm/amd/ras/rascore/ras_umc.h     |  1 +
>  .../gpu/drm/amd/ras/rascore/ras_umc_v12_0.c   |  3 ++-
>  8 files changed, 58 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
> b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
> index cb6498c30834..473b387fa3db 100644
> --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
> +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
> @@ -30,9 +30,6 @@
>  #include "amdgpu_ras_mgr.h"
>  #include "amdgpu_virt_ras_cmd.h"
>
> -/* inject address is 52 bits */
> -#define      RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
> -
>  #define AMDGPU_RAS_TYPE_RASCORE  0x1
>  #define AMDGPU_RAS_TYPE_AMDGPU   0x2
>  #define AMDGPU_RAS_TYPE_VF       0x3
> diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> index 7d728e523604..cc6d571a5479 100644
> --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
> @@ -266,6 +266,24 @@ static int amdgpu_ras_sys_put_gpu_mem(struct
> ras_core_context *ras_core,
>
>       return 0;
>  }
> +static int amdgpu_ras_sys_check_bad_page_unlock(struct ras_core_context
> *ras_core,
> +                                             uint64_t addr)
> +{
> +     struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
> +     uint64_t pfn = addr >> AMDGPU_GPU_PAGE_SHIFT;
> +
> +     if ((addr >= adev->gmc.mc_vram_size &&
> +         adev->gmc.mc_vram_size) ||
> +         (addr >= RAS_UMC_INJECT_ADDR_LIMIT))
> +             return -EINVAL;
> +
> +     if (pfn >= (adev->gmc.real_vram_size >> AMDGPU_GPU_PAGE_SHIFT)) {

[Tao] Why not compare the address directly, i.e. addr >= adev->gmc.real_vram_size?

> +             RAS_DEV_WARN(ras_core->dev, "Recorded address out of range:
> 0x%llx!\n", addr);
> +             return -EINVAL;
> +     }
> +
> +     return 0;
> +}
>
>  const struct ras_sys_func amdgpu_ras_sys_fn = {
>       .ras_notifier = amdgpu_ras_sys_event_notifier, @@ -277,4 +295,5 @@
> const struct ras_sys_func amdgpu_ras_sys_fn = {
>       .detect_ras_interrupt = amdgpu_ras_sys_detect_ras_interrupt,
>       .get_gpu_mem = amdgpu_ras_sys_get_gpu_mem,
>       .put_gpu_mem = amdgpu_ras_sys_put_gpu_mem,
> +     .check_bad_page_unlock = amdgpu_ras_sys_check_bad_page_unlock,
>  };
> diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> index 8156531a7b63..239e56732e3e 100644
> --- a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> +++ b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
> @@ -30,6 +30,9 @@
>  #include <linux/mempool.h>
>  #include "amdgpu.h"
>
> +/* inject address is 52 bits */
> +#define RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
> +
>  #define RAS_DEV_ERR(device, fmt, ...)                                        
>        \
>       do {                                                                    
>   \
>               if (device)                                                     
>         \
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h
> b/drivers/gpu/drm/amd/ras/rascore/ras.h
> index 6449d7b8627d..6c3697de1f98 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras.h
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras.h
> @@ -231,6 +231,7 @@ struct ras_sys_func {
>               enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
>       int (*put_gpu_mem)(struct ras_core_context *ras_core,
>               enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
> +     int (*check_bad_page_unlock)(struct ras_core_context *ras_core,
> +uint64_t addr);
>  };
>
>  struct ras_ecc_count {
> @@ -399,4 +400,5 @@ int ras_core_get_device_system_info(struct
> ras_core_context *ras_core,
>               struct device_system_info *dev_info);  int
> ras_core_convert_soc_pa_to_cur_nps_pages(struct ras_core_context *ras_core,
>               uint64_t soc_pa, uint64_t *page_pfn, uint32_t max_pages);
> +int ras_core_check_bad_page_unlock(struct ras_core_context *ras_core,
> +uint64_t addr);
>  #endif
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_core.c
> b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
> index 29b1b8f0cc26..efd4023f133b 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_core.c
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
> @@ -676,3 +676,13 @@ int ras_core_convert_soc_pa_to_cur_nps_pages(struct
> ras_core_context *ras_core,
>
>       return count;
>  }
> +
> +int ras_core_check_bad_page_unlock(struct ras_core_context *ras_core,
> +             uint64_t addr)
> +{
> +     if (ras_core && ras_core->sys_fn &&
> +             ras_core->sys_fn->check_bad_page_unlock)
> +             return ras_core->sys_fn->check_bad_page_unlock(ras_core, addr);
> +
> +     return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
> b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
> index d4072350f48f..7ff019a8c7a8 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
> @@ -373,7 +373,7 @@ static int ras_umc_update_eeprom_ram_data(struct
> ras_core_context *ras_core,
>       struct ras_umc *ras_umc = &ras_core->ras_umc;
>       struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data;
>       uint64_t page_pfn[16];
> -     int count = 0, j;
> +     int count = 0, i, j;
>
>       if (!data->space_left &&
>               ras_umc_realloc_err_data_space(ras_core, data, 256)) { @@ -385,6
> +385,18 @@ static int ras_umc_update_eeprom_ram_data(struct ras_core_context
> *ras_core,
>                                       bps, bps->cur_nps, page_pfn,
> ARRAY_SIZE(page_pfn));
>       if (count > 0) {
>               for (j = 0; j < count; j++) {
> +                     if (ras_core_check_bad_page_unlock(ras_core,
> +                             page_pfn[j] << AMDGPU_GPU_PAGE_SHIFT)) {
> +
> +                             for (i = 0; i < data->count; i++)
> +                                     if (page_pfn[j] == data-
> >bps[i].cur_nps_retired_row_pfn)
> +                                             break;
> +                             data->bps[data->count].cur_nps_retired_row_pfn =
> U64_MAX;
> +                             data->count++;
> +                             data->space_left--;
> +                             continue;
> +                     }
> +
>                       bps->cur_nps_retired_row_pfn = page_pfn[j];
>                       memcpy(&data->bps[data->count], bps, 
> sizeof(*data->bps));
>                       data->count++;
> @@ -489,9 +501,11 @@ static int ras_umc_save_bad_pages(struct
> ras_core_context *ras_core)  {
>       struct ras_umc *ras_umc = &ras_core->ras_umc;
>       struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data;
> -     uint32_t eeprom_record_num;
> +     struct eeprom_store_record *ram_data = &ras_umc-
> >umc_err_data.ram_data;
> +     uint32_t eeprom_record_num, logical_count = 0;
> +     uint32_t retire_unit = ras_core->ras_umc.retire_unit;
>       int save_count;
> -     int ret = 0;
> +     int ret = 0, i;
>
>       if (!data->bps)
>               return 0;
> @@ -515,8 +529,10 @@ static int ras_umc_save_bad_pages(struct
> ras_core_context *ras_core)
>                       ret = -EIO;
>                       goto exit;
>               }
> -
> -             RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM
> table.\n", save_count);
> +             for (i = ram_data->count - retire_unit; i < ram_data->count; 
> i++)
> +                     if (ram_data->bps[i].cur_nps_retired_row_pfn != U64_MAX)
> +                             logical_count++;

[Tao] I would prefer to record the count when we update the bad pages instead
of traversing the page list a second time.
BTW, more than one retire_unit's worth of pages can be saved at one time.

> +             RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM
> table.\n",
> +logical_count);
>       }
>
>  exit:
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
> b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
> index 1d3026be509b..05edacc165ba 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
> @@ -139,6 +139,7 @@ struct ras_umc {
>       u32 pending_ecc_count;
>       /* number of entries dropped because pending_ecc_list was full */
>       u32 pending_ecc_dropped;
> +     u32 retire_unit;
>  };
>
>  /*
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
> b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
> index b809a2f21d73..0064e89ac1ab 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
> @@ -110,6 +110,7 @@ static void __get_nps_pa_flip_bits(struct ras_core_context
> *ras_core,
>                       "Unknown HBM type, set RAS retire flip bits to the 
> value in
> NPS1 mode.\n");
>               break;
>       }
> +     ras_core->ras_umc.retire_unit = 0x1 << flip_bits->bit_num;
>  }
>
>  static uint64_t  convert_nps_pa_to_row_pa(struct ras_core_context *ras_core, 
> @@
> -166,7 +167,7 @@ static int lookup_bad_pages_in_a_row(struct ras_core_context
> *ras_core,
>
>       idx = 0;
>       row = 0;
> -     retire_unit = 0x1 << flip_bits.bit_num;
> +     retire_unit = ras_core->ras_umc.retire_unit;
>       /* loop for all possibilities of retire bits */
>       for (column = 0; column < retire_unit; column++) {
>               soc_pa = row_pa;
> --
> 2.34.1

Reply via email to