Drop new entries once pending_ecc_count hits RAS_UMC_PENDING_ECC_MAX
(8192) so an ECC storm or repeated UMC error injection cannot exhaust
kernel memory. Dropped events are counted and reported via a
rate-limited warning.

Signed-off-by: Stanley.Yang <[email protected]>
---
 drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h |  9 ++++++
 drivers/gpu/drm/amd/ras/rascore/ras_umc.c | 35 +++++++++++++++++++++++
 drivers/gpu/drm/amd/ras/rascore/ras_umc.h | 12 ++++++++
 3 files changed, 56 insertions(+)

diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h 
b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
index 8156531a7b63..f34dda7ce87b 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
@@ -46,6 +46,15 @@
                        printk(KERN_WARNING fmt, ##__VA_ARGS__);                
           \
        } while (0)
 
+#define RAS_DEV_WARN_RATELIMITED(device, fmt, ...)                             
      \
+       do {                                                                    
   \
+               if (device)                                                     
         \
+                       dev_warn_ratelimited(((struct amdgpu_device 
*)device)->dev,        \
+                               fmt, ##__VA_ARGS__);                            
                \
+               else                                                            
       \
+                       printk_ratelimited(KERN_WARNING fmt, ##__VA_ARGS__);    
           \
+       } while (0)
+
 #define RAS_DEV_INFO(device, fmt, ...)                                         
        \
        do {                                                                    
     \
                if (device)                                                     
           \
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c 
b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
index d4072350f48f..e8c13e42c2f8 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
@@ -193,12 +193,29 @@ static void ras_umc_reserve_eeprom_record(struct 
ras_core_context *ras_core,
 }
 
 /* When gpu reset is ongoing, ecc logging operations will be pended.
+ *
+ * The pending list is bounded by RAS_UMC_PENDING_ECC_MAX so that an ECC
+ * storm or repeated UMC error injection cannot make this list (and the
+ * kernel allocations behind it) grow without bound. Once the limit is
+ * reached, additional events are dropped and counted in
+ * pending_ecc_dropped, with a rate-limited warning emitted.
  */
 int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct 
ras_bank_ecc *bank)
 {
        struct ras_umc *ras_umc = &ras_core->ras_umc;
        struct ras_bank_ecc_node *ecc_node;
 
+       mutex_lock(&ras_umc->pending_ecc_lock);
+       if (ras_umc->pending_ecc_count >= RAS_UMC_PENDING_ECC_MAX) {
+               ras_umc->pending_ecc_dropped++;
+               mutex_unlock(&ras_umc->pending_ecc_lock);
+               RAS_DEV_WARN_RATELIMITED(ras_core->dev,
+                       "pending ECC list full (%u), dropping bad bank event 
(total dropped:%u)\n",
+                       RAS_UMC_PENDING_ECC_MAX, ras_umc->pending_ecc_dropped);
+               return -ENOSPC;
+       }
+       mutex_unlock(&ras_umc->pending_ecc_lock);
+
        ecc_node = kzalloc(sizeof(*ecc_node), GFP_KERNEL);
        if (!ecc_node)
                return -ENOMEM;
@@ -206,7 +223,15 @@ int ras_umc_log_bad_bank_pending(struct ras_core_context 
*ras_core, struct ras_b
        memcpy(&ecc_node->ecc, bank, sizeof(ecc_node->ecc));
 
        mutex_lock(&ras_umc->pending_ecc_lock);
+       /* re-check under the lock to honor the cap across concurrent callers */
+       if (ras_umc->pending_ecc_count >= RAS_UMC_PENDING_ECC_MAX) {
+               ras_umc->pending_ecc_dropped++;
+               mutex_unlock(&ras_umc->pending_ecc_lock);
+               kfree(ecc_node);
+               return -ENOSPC;
+       }
        list_add_tail(&ecc_node->node, &ras_umc->pending_ecc_list);
+       ras_umc->pending_ecc_count++;
        mutex_unlock(&ras_umc->pending_ecc_lock);
 
        return 0;
@@ -225,8 +250,16 @@ int ras_umc_log_pending_bad_bank(struct ras_core_context 
*ras_core)
                if (!ras_umc_log_bad_bank(ras_core, &ecc_node->ecc)) {
                        list_del(&ecc_node->node);
                        kfree(ecc_node);
+                       if (ras_umc->pending_ecc_count)
+                               ras_umc->pending_ecc_count--;
                }
        }
+       if (ras_umc->pending_ecc_dropped) {
+               RAS_DEV_WARN(ras_core->dev,
+                       "%u pending ECC bad-bank events were dropped during GPU 
reset\n",
+                       ras_umc->pending_ecc_dropped);
+               ras_umc->pending_ecc_dropped = 0;
+       }
        mutex_unlock(&ras_umc->pending_ecc_lock);
 
        return 0;
@@ -611,6 +644,8 @@ int ras_umc_sw_fini(struct ras_core_context *ras_core)
                list_del(&ecc_node->node);
                kfree(ecc_node);
        }
+       ras_umc->pending_ecc_count = 0;
+       ras_umc->pending_ecc_dropped = 0;
        mutex_unlock(&ras_umc->pending_ecc_lock);
 
        mutex_destroy(&ras_umc->tree_lock);
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h 
b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
index 1d3026be509b..237525b46b9b 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
@@ -139,8 +139,20 @@ struct ras_umc {
        struct mutex  pending_ecc_lock;
        struct ras_umc_err_data umc_err_data;
        struct list_head pending_ecc_list;
+       /* number of entries currently queued on pending_ecc_list */
+       u32 pending_ecc_count;
+       /* number of entries dropped because pending_ecc_list was full */
+       u32 pending_ecc_dropped;
 };
 
+/*
+ * Upper bound on entries that can be queued on pending_ecc_list while a
+ * GPU reset is in progress. Beyond this, new ECC events are dropped to
+ * prevent unbounded kernel memory growth in case of an ECC storm or
+ * malicious/repeated UMC error injection.
+ */
+#define RAS_UMC_PENDING_ECC_MAX  8192
+
 int ras_umc_sw_init(struct ras_core_context *ras);
 int ras_umc_sw_fini(struct ras_core_context *ras);
 int ras_umc_hw_init(struct ras_core_context *ras);
-- 
2.43.0

Reply via email to