Add check_bad_page_unlock() to ras_sys_func and rascore to support uniras bad page validation.
Signed-off-by: Ce Sun <[email protected]> --- .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c | 3 --- .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c | 19 ++++++++++++++ drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h | 3 +++ drivers/gpu/drm/amd/ras/rascore/ras.h | 2 ++ drivers/gpu/drm/amd/ras/rascore/ras_core.c | 10 +++++++ drivers/gpu/drm/amd/ras/rascore/ras_umc.c | 26 +++++++++++++++---- drivers/gpu/drm/amd/ras/rascore/ras_umc.h | 1 + .../gpu/drm/amd/ras/rascore/ras_umc_v12_0.c | 3 ++- 8 files changed, 58 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c index cb6498c30834..473b387fa3db 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c @@ -30,9 +30,6 @@ #include "amdgpu_ras_mgr.h" #include "amdgpu_virt_ras_cmd.h" -/* inject address is 52 bits */ -#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) - #define AMDGPU_RAS_TYPE_RASCORE 0x1 #define AMDGPU_RAS_TYPE_AMDGPU 0x2 #define AMDGPU_RAS_TYPE_VF 0x3 diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c index 7d728e523604..cc6d571a5479 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c @@ -266,6 +266,24 @@ static int amdgpu_ras_sys_put_gpu_mem(struct ras_core_context *ras_core, return 0; } +static int amdgpu_ras_sys_check_bad_page_unlock(struct ras_core_context *ras_core, + uint64_t addr) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + uint64_t pfn = addr >> AMDGPU_GPU_PAGE_SHIFT; + + if ((addr >= adev->gmc.mc_vram_size && + adev->gmc.mc_vram_size) || + (addr >= RAS_UMC_INJECT_ADDR_LIMIT)) + return -EINVAL; + + if (pfn >= (adev->gmc.real_vram_size >> AMDGPU_GPU_PAGE_SHIFT)) { + RAS_DEV_WARN(ras_core->dev, "Recorded address out of range: 0x%llx!\n", addr); + return -EINVAL; + } + + return 0; +} const struct ras_sys_func 
amdgpu_ras_sys_fn = { .ras_notifier = amdgpu_ras_sys_event_notifier, @@ -277,4 +295,5 @@ const struct ras_sys_func amdgpu_ras_sys_fn = { .detect_ras_interrupt = amdgpu_ras_sys_detect_ras_interrupt, .get_gpu_mem = amdgpu_ras_sys_get_gpu_mem, .put_gpu_mem = amdgpu_ras_sys_put_gpu_mem, + .check_bad_page_unlock = amdgpu_ras_sys_check_bad_page_unlock, }; diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h index 8156531a7b63..239e56732e3e 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h +++ b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h @@ -30,6 +30,9 @@ #include <linux/mempool.h> #include "amdgpu.h" +/* inject address is 52 bits */ +#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) + #define RAS_DEV_ERR(device, fmt, ...) \ do { \ if (device) \ diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h b/drivers/gpu/drm/amd/ras/rascore/ras.h index 6449d7b8627d..6c3697de1f98 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras.h +++ b/drivers/gpu/drm/amd/ras/rascore/ras.h @@ -231,6 +231,7 @@ struct ras_sys_func { enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem); int (*put_gpu_mem)(struct ras_core_context *ras_core, enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem); + int (*check_bad_page_unlock)(struct ras_core_context *ras_core, uint64_t addr); }; struct ras_ecc_count { @@ -399,4 +400,5 @@ int ras_core_get_device_system_info(struct ras_core_context *ras_core, struct device_system_info *dev_info); int ras_core_convert_soc_pa_to_cur_nps_pages(struct ras_core_context *ras_core, uint64_t soc_pa, uint64_t *page_pfn, uint32_t max_pages); +int ras_core_check_bad_page_unlock(struct ras_core_context *ras_core, uint64_t addr); #endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_core.c b/drivers/gpu/drm/amd/ras/rascore/ras_core.c index 29b1b8f0cc26..efd4023f133b 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_core.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_core.c @@ -676,3 +676,13 @@ int 
ras_core_convert_soc_pa_to_cur_nps_pages(struct ras_core_context *ras_core, return count; } + +int ras_core_check_bad_page_unlock(struct ras_core_context *ras_core, + uint64_t addr) +{ + if (ras_core && ras_core->sys_fn && + ras_core->sys_fn->check_bad_page_unlock) + return ras_core->sys_fn->check_bad_page_unlock(ras_core, addr); + + return 0; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c index d4072350f48f..7ff019a8c7a8 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c @@ -373,7 +373,7 @@ static int ras_umc_update_eeprom_ram_data(struct ras_core_context *ras_core, struct ras_umc *ras_umc = &ras_core->ras_umc; struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data; uint64_t page_pfn[16]; - int count = 0, j; + int count = 0, i, j; if (!data->space_left && ras_umc_realloc_err_data_space(ras_core, data, 256)) { @@ -385,6 +385,18 @@ static int ras_umc_update_eeprom_ram_data(struct ras_core_context *ras_core, bps, bps->cur_nps, page_pfn, ARRAY_SIZE(page_pfn)); if (count > 0) { for (j = 0; j < count; j++) { + if (ras_core_check_bad_page_unlock(ras_core, + page_pfn[j] << AMDGPU_GPU_PAGE_SHIFT)) { + + for (i = 0; i < data->count; i++) + if (page_pfn[j] == data->bps[i].cur_nps_retired_row_pfn) + break; + data->bps[data->count].cur_nps_retired_row_pfn = U64_MAX; + data->count++; + data->space_left--; + continue; + } + bps->cur_nps_retired_row_pfn = page_pfn[j]; memcpy(&data->bps[data->count], bps, sizeof(*data->bps)); data->count++; @@ -489,9 +501,11 @@ static int ras_umc_save_bad_pages(struct ras_core_context *ras_core) { struct ras_umc *ras_umc = &ras_core->ras_umc; struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data; - uint32_t eeprom_record_num; + struct eeprom_store_record *ram_data = &ras_umc->umc_err_data.ram_data; + uint32_t eeprom_record_num, logical_count = 0; + uint32_t retire_unit = ras_core->ras_umc.retire_unit; int save_count; 
- int ret = 0; + int ret = 0, i; if (!data->bps) return 0; @@ -515,8 +529,10 @@ static int ras_umc_save_bad_pages(struct ras_core_context *ras_core) ret = -EIO; goto exit; } - - RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM table.\n", save_count); + for (i = ram_data->count - retire_unit; i < ram_data->count; i++) + if (ram_data->bps[i].cur_nps_retired_row_pfn != U64_MAX) + logical_count++; + RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM table.\n", logical_count); } exit: diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h index 1d3026be509b..05edacc165ba 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h @@ -139,6 +139,7 @@ struct ras_umc { u32 pending_ecc_count; /* number of entries dropped because pending_ecc_list was full */ u32 pending_ecc_dropped; + u32 retire_unit; }; /* diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c index b809a2f21d73..0064e89ac1ab 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c @@ -110,6 +110,7 @@ static void __get_nps_pa_flip_bits(struct ras_core_context *ras_core, "Unknown HBM type, set RAS retire flip bits to the value in NPS1 mode.\n"); break; } + ras_core->ras_umc.retire_unit = 0x1 << flip_bits->bit_num; } static uint64_t convert_nps_pa_to_row_pa(struct ras_core_context *ras_core, @@ -166,7 +167,7 @@ static int lookup_bad_pages_in_a_row(struct ras_core_context *ras_core, idx = 0; row = 0; - retire_unit = 0x1 << flip_bits.bit_num; + retire_unit = ras_core->ras_umc.retire_unit; /* loop for all possibilities of retire bits */ for (column = 0; column < retire_unit; column++) { soc_pa = row_pa; -- 2.34.1
