Drop new entries once pending_ecc_count hits RAS_UMC_PENDING_ECC_MAX (8192) so an ECC storm or repeated UMC error injection cannot exhaust kernel memory. Dropped events are counted and reported via a rate-limited warning.
Signed-off-by: Stanley.Yang <[email protected]> --- drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h | 9 ++++++ drivers/gpu/drm/amd/ras/rascore/ras_umc.c | 35 +++++++++++++++++++++++ drivers/gpu/drm/amd/ras/rascore/ras_umc.h | 12 ++++++++ 3 files changed, 56 insertions(+) diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h index 8156531a7b63..f34dda7ce87b 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h +++ b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h @@ -46,6 +46,15 @@ printk(KERN_WARNING fmt, ##__VA_ARGS__); \ } while (0) +#define RAS_DEV_WARN_RATELIMITED(device, fmt, ...) \ + do { \ + if (device) \ + dev_warn_ratelimited(((struct amdgpu_device *)device)->dev, \ + fmt, ##__VA_ARGS__); \ + else \ + printk_ratelimited(KERN_WARNING fmt, ##__VA_ARGS__); \ + } while (0) + #define RAS_DEV_INFO(device, fmt, ...) \ do { \ if (device) \ diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c index d4072350f48f..e8c13e42c2f8 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c @@ -193,12 +193,29 @@ static void ras_umc_reserve_eeprom_record(struct ras_core_context *ras_core, } /* When gpu reset is ongoing, ecc logging operations will be pended. + * + * The pending list is bounded by RAS_UMC_PENDING_ECC_MAX so that an ECC + * storm or repeated UMC error injection cannot make this list (and the + * kernel allocations behind it) grow without bound. Once the limit is + * reached, additional events are dropped and counted in + * pending_ecc_dropped, with a rate-limited warning emitted. 
*/ int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank) { struct ras_umc *ras_umc = &ras_core->ras_umc; struct ras_bank_ecc_node *ecc_node; + mutex_lock(&ras_umc->pending_ecc_lock); + if (ras_umc->pending_ecc_count >= RAS_UMC_PENDING_ECC_MAX) { + ras_umc->pending_ecc_dropped++; + mutex_unlock(&ras_umc->pending_ecc_lock); + RAS_DEV_WARN_RATELIMITED(ras_core->dev, + "pending ECC list full (%u), dropping bad bank event (total dropped:%u)\n", + RAS_UMC_PENDING_ECC_MAX, ras_umc->pending_ecc_dropped); + return -ENOSPC; + } + mutex_unlock(&ras_umc->pending_ecc_lock); + ecc_node = kzalloc(sizeof(*ecc_node), GFP_KERNEL); if (!ecc_node) return -ENOMEM; @@ -206,7 +223,15 @@ int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_b memcpy(&ecc_node->ecc, bank, sizeof(ecc_node->ecc)); mutex_lock(&ras_umc->pending_ecc_lock); + /* re-check under the lock to honor the cap across concurrent callers */ + if (ras_umc->pending_ecc_count >= RAS_UMC_PENDING_ECC_MAX) { + ras_umc->pending_ecc_dropped++; + mutex_unlock(&ras_umc->pending_ecc_lock); + kfree(ecc_node); + return -ENOSPC; + } list_add_tail(&ecc_node->node, &ras_umc->pending_ecc_list); + ras_umc->pending_ecc_count++; mutex_unlock(&ras_umc->pending_ecc_lock); return 0; @@ -225,8 +250,16 @@ int ras_umc_log_pending_bad_bank(struct ras_core_context *ras_core) if (!ras_umc_log_bad_bank(ras_core, &ecc_node->ecc)) { list_del(&ecc_node->node); kfree(ecc_node); + if (ras_umc->pending_ecc_count) + ras_umc->pending_ecc_count--; } } + if (ras_umc->pending_ecc_dropped) { + RAS_DEV_WARN(ras_core->dev, + "%u pending ECC bad-bank events were dropped during GPU reset\n", + ras_umc->pending_ecc_dropped); + ras_umc->pending_ecc_dropped = 0; + } mutex_unlock(&ras_umc->pending_ecc_lock); return 0; @@ -611,6 +644,8 @@ int ras_umc_sw_fini(struct ras_core_context *ras_core) list_del(&ecc_node->node); kfree(ecc_node); } + ras_umc->pending_ecc_count = 0; + ras_umc->pending_ecc_dropped = 0; 
mutex_unlock(&ras_umc->pending_ecc_lock); mutex_destroy(&ras_umc->tree_lock); diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h index 1d3026be509b..237525b46b9b 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h @@ -139,8 +139,20 @@ struct ras_umc { struct mutex pending_ecc_lock; struct ras_umc_err_data umc_err_data; struct list_head pending_ecc_list; + /* number of entries currently queued on pending_ecc_list */ + u32 pending_ecc_count; + /* number of entries dropped because pending_ecc_list was full */ + u32 pending_ecc_dropped; }; +/* + * Upper bound on entries that can be queued on pending_ecc_list while a + * GPU reset is in progress. Beyond this, new ECC events are dropped to + * prevent unbounded kernel memory growth in case of an ECC storm or + * malicious/repeated UMC error injection. + */ +#define RAS_UMC_PENDING_ECC_MAX 8192 + int ras_umc_sw_init(struct ras_core_context *ras); int ras_umc_sw_fini(struct ras_core_context *ras); int ras_umc_hw_init(struct ras_core_context *ras); -- 2.43.0
