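Add address sanity check for uniras

Add a check_address_sanity callback to struct ras_sys_func and expose it
through ras_core_check_address_sanity(). The amdgpu implementation rejects
addresses at or beyond the VRAM size or the 52-bit
RAS_UMC_INJECT_ADDR_LIMIT, and the UMC v12.0 bad-page lookup now skips
pages that fail the check.

Also cache the retire unit in struct ras_umc and use it to log the number
of pages, rather than the number of records, saved to the EEPROM table.
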
Signed-off-by: Ce Sun <[email protected]>
---
.../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c | 3 ---
.../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c | 18 +++++++++++++
drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h | 3 +++
drivers/gpu/drm/amd/ras/rascore/ras.h | 2 ++
drivers/gpu/drm/amd/ras/rascore/ras_core.c | 10 ++++++++
drivers/gpu/drm/amd/ras/rascore/ras_umc.c | 25 +++++++++++++++++--
drivers/gpu/drm/amd/ras/rascore/ras_umc.h | 1 +
.../gpu/drm/amd/ras/rascore/ras_umc_v12_0.c | 9 +++++--
8 files changed, 64 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
index cb6498c30834..473b387fa3db 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
@@ -30,9 +30,6 @@
#include "amdgpu_ras_mgr.h"
#include "amdgpu_virt_ras_cmd.h"
-/* inject address is 52 bits */
-#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
-
#define AMDGPU_RAS_TYPE_RASCORE 0x1
#define AMDGPU_RAS_TYPE_AMDGPU 0x2
#define AMDGPU_RAS_TYPE_VF 0x3
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
index 7d728e523604..eb840f0861fe 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
@@ -266,6 +266,23 @@ static int amdgpu_ras_sys_put_gpu_mem(struct ras_core_context *ras_core,
return 0;
}
+/*
+ * Reject addresses that cannot refer to this device's VRAM.
+ *
+ * Returns -EINVAL when @addr is at or above a non-zero mc_vram_size, at or
+ * above RAS_UMC_INJECT_ADDR_LIMIT, or at or above real_vram_size (only the
+ * last case logs a warning). Returns 0 otherwise.
+ */
+static int amdgpu_ras_sys_check_address_sanity(struct ras_core_context *ras_core,
+		uint64_t addr)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+
+	/* mc_vram_size only bounds the address when it is non-zero */
+	if ((addr >= adev->gmc.mc_vram_size &&
+	    adev->gmc.mc_vram_size) ||
+	    (addr >= RAS_UMC_INJECT_ADDR_LIMIT))
+		return -EINVAL;
+
+	if (addr >= adev->gmc.real_vram_size) {
+		RAS_DEV_WARN(ras_core->dev, "Recorded address out of range: 0x%llx!\n", addr);
+		return -EINVAL;
+	}
+
+	return 0;
+}
const struct ras_sys_func amdgpu_ras_sys_fn = {
.ras_notifier = amdgpu_ras_sys_event_notifier,
@@ -277,4 +294,5 @@ const struct ras_sys_func amdgpu_ras_sys_fn = {
.detect_ras_interrupt = amdgpu_ras_sys_detect_ras_interrupt,
.get_gpu_mem = amdgpu_ras_sys_get_gpu_mem,
.put_gpu_mem = amdgpu_ras_sys_put_gpu_mem,
+ .check_address_sanity = amdgpu_ras_sys_check_address_sanity,
};
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
index 8156531a7b63..239e56732e3e 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
@@ -30,6 +30,9 @@
#include <linux/mempool.h>
#include "amdgpu.h"
+/* inject address is 52 bits */
+#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
+
#define RAS_DEV_ERR(device, fmt, ...)
\
do {
\
if (device)
\
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h b/drivers/gpu/drm/amd/ras/rascore/ras.h
index 6449d7b8627d..44ddb7943a48 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras.h
@@ -231,6 +231,7 @@ struct ras_sys_func {
enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
int (*put_gpu_mem)(struct ras_core_context *ras_core,
enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
+ int (*check_address_sanity)(struct ras_core_context *ras_core, uint64_t
addr);
};
struct ras_ecc_count {
@@ -399,4 +400,5 @@ int ras_core_get_device_system_info(struct ras_core_context *ras_core,
struct device_system_info *dev_info);
int ras_core_convert_soc_pa_to_cur_nps_pages(struct ras_core_context *ras_core,
uint64_t soc_pa, uint64_t *page_pfn, uint32_t max_pages);
+int ras_core_check_address_sanity(struct ras_core_context *ras_core, uint64_t
addr);
#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_core.c b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
index 29b1b8f0cc26..cfab7a7d2623 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_core.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
@@ -676,3 +676,13 @@ int ras_core_convert_soc_pa_to_cur_nps_pages(struct ras_core_context *ras_core,
return count;
}
+
+/*
+ * Run the system layer's check_address_sanity hook on @addr and pass its
+ * result through. When @ras_core, its sys_fn table or the hook itself is
+ * missing, no check is performed and 0 is returned.
+ */
+int ras_core_check_address_sanity(struct ras_core_context *ras_core,
+		uint64_t addr)
+{
+	if (!ras_core || !ras_core->sys_fn ||
+	    !ras_core->sys_fn->check_address_sanity)
+		return 0;
+
+	return ras_core->sys_fn->check_address_sanity(ras_core, addr);
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
index d4072350f48f..0d4405a975b5 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
@@ -480,6 +480,27 @@ int ras_umc_load_bad_pages(struct ras_core_context *ras_core)
return ret;
}
+/*
+ * Count the pages covered by @num EEPROM records starting at @record.
+ *
+ * Each record's retired_row_pfn is expanded with
+ * ras_core_convert_soc_pa_to_cur_nps_pages(); a record whose conversion
+ * returns <= 0 is counted as one full retire unit. If the scratch buffer
+ * cannot be allocated, every record is counted as one full retire unit so
+ * the result still scales with @num.
+ */
+static int __calc_bad_page_count(struct ras_core_context *ras_core,
+		struct eeprom_umc_record *record, const u32 num)
+{
+	uint64_t *pfns;
+	uint32_t pfns_sz = ras_core->ras_umc.retire_unit;
+	int ret, count = 0;
+	u32 i;
+
+	pfns = kcalloc(pfns_sz, sizeof(*pfns), GFP_KERNEL);
+	if (!pfns)
+		return num * pfns_sz;
+
+	for (i = 0; i < num; i++) {
+		ret = ras_core_convert_soc_pa_to_cur_nps_pages(ras_core,
+				RAS_PFN_TO_ADDR(record[i].retired_row_pfn), pfns, pfns_sz);
+		count += (ret <= 0) ? pfns_sz : ret;
+	}
+
+	kfree(pfns);
+	return count;
+}
+
/*
* write error record array to eeprom, the function should be
* protected by recovery_lock
@@ -515,8 +536,8 @@ static int ras_umc_save_bad_pages(struct ras_core_context *ras_core)
ret = -EIO;
goto exit;
}
-
- RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM
table.\n", save_count);
+ RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM table.\n",
+ __calc_bad_page_count(ras_core,
&data->bps[eeprom_record_num], save_count));
}
exit:
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
index 1d3026be509b..05edacc165ba 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
@@ -139,6 +139,7 @@ struct ras_umc {
struct mutex pending_ecc_lock;
struct ras_umc_err_data umc_err_data;
struct list_head pending_ecc_list;
+ u32 retire_unit;
};
int ras_umc_sw_init(struct ras_core_context *ras);
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
index b809a2f21d73..fe5f92eb94a1 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
@@ -26,6 +26,8 @@
#include "ras_core_status.h"
#include "ras_umc_v12_0.h"
+#define RAS_UMC_V12_0_ADDR_LIMIT (0x1ULL << 52)
+
#define NumDieInterleaved 4
static const uint32_t umc_v12_0_channel_idx_tbl[]
@@ -110,6 +112,7 @@ static void __get_nps_pa_flip_bits(struct ras_core_context *ras_core,
"Unknown HBM type, set RAS retire flip bits to the
value in NPS1 mode.\n");
break;
}
+ ras_core->ras_umc.retire_unit = 0x1 << flip_bits->bit_num;
}
static uint64_t convert_nps_pa_to_row_pa(struct ras_core_context *ras_core,
@@ -166,7 +169,7 @@ static int lookup_bad_pages_in_a_row(struct ras_core_context *ras_core,
idx = 0;
row = 0;
- retire_unit = 0x1 << flip_bits.bit_num;
+ retire_unit = ras_core->ras_umc.retire_unit;
/* loop for all possibilities of retire bits */
for (column = 0; column < retire_unit; column++) {
soc_pa = row_pa;
@@ -186,7 +189,9 @@ static int lookup_bad_pages_in_a_row(struct ras_core_context *ras_core,
record->cur_nps_bank, record->mem_channel);
- if (pfns && (idx < num))
+ if (pfns && (idx < num) &&
+ (soc_pa < RAS_UMC_V12_0_ADDR_LIMIT) &&
+ !ras_core_check_address_sanity(ras_core, soc_pa))
pfns[idx++] = RAS_ADDR_TO_PFN(soc_pa);
}
--
2.34.1