AMD General

As discussed offline, the number of pages actually saved can be smaller than
save_count * ras_core->ras_umc.retire_unit, since some page addresses may be
invalid and need to be excluded.

Tao

> -----Original Message-----
> From: Sun, Ce(Overlord) <[email protected]>
> Sent: Wednesday, June 3, 2026 10:52 AM
> To: [email protected]
> Cc: Zhang, Hawking <[email protected]>; Chai, Thomas
> <[email protected]>; Zhou1, Tao <[email protected]>; Yang, Stanley
> <[email protected]>; Sun, Ce(Overlord) <[email protected]>
> Subject: [PATCH v1 2/2] drm/amdgpu/ras: adjust the update of RAS bad page
> number
>
> One EEPROM record may not map to a unit number of bad pages. Correct the
> relevant update logic accordingly.
>
> Signed-off-by: Ce Sun <[email protected]>
> ---
>  drivers/gpu/drm/amd/ras/rascore/ras_umc.c       | 3 ++-
>  drivers/gpu/drm/amd/ras/rascore/ras_umc.h       | 2 ++
>  drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c | 3 ++-
>  3 files changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
> b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
> index e5971c3dd7da..11490048a282 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
> @@ -548,7 +548,8 @@ static int ras_umc_save_bad_pages(struct
> ras_core_context *ras_core)
>                       goto exit;
>               }
>
> -             RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM
> table.\n", save_count);
> +             RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM
> table.\n",
> +                             save_count * ras_core->ras_umc.retire_unit);
>       }
>
>  exit:
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
> b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
> index 237525b46b9b..e9e34bbdbd30 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
> @@ -143,6 +143,8 @@ struct ras_umc {
>       u32 pending_ecc_count;
>       /* number of entries dropped because pending_ecc_list was full */
>       u32 pending_ecc_dropped;
> +     /* how many pages are retired */
> +     u32 retire_unit;
>  };
>
>  /*
> diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
> b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
> index b809a2f21d73..0064e89ac1ab 100644
> --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
> @@ -110,6 +110,7 @@ static void __get_nps_pa_flip_bits(struct ras_core_context
> *ras_core,
>                       "Unknown HBM type, set RAS retire flip bits to the 
> value in
> NPS1 mode.\n");
>               break;
>       }
> +     ras_core->ras_umc.retire_unit = 0x1 << flip_bits->bit_num;
>  }
>
>  static uint64_t  convert_nps_pa_to_row_pa(struct ras_core_context *ras_core, 
> @@
> -166,7 +167,7 @@ static int lookup_bad_pages_in_a_row(struct ras_core_context
> *ras_core,
>
>       idx = 0;
>       row = 0;
> -     retire_unit = 0x1 << flip_bits.bit_num;
> +     retire_unit = ras_core->ras_umc.retire_unit;
>       /* loop for all possibilities of retire bits */
>       for (column = 0; column < retire_unit; column++) {
>               soc_pa = row_pa;
> --
> 2.34.1

Reply via email to