amdgpu: Correct the loss of aca bank reg info

Zhang, Hawking Mon, 18 Aug 2025 22:54:36 -0700

[AMD Official Use Only - AMD Internal Distribution Only]


Series is

Reviewed-by: Hawking Zhang <hawking.zh...@amd.com>

Copying @Li, Candice <candice...@amd.com> for awareness.

Regards,
Hawking

-----Original Message-----
From: Sun, Ce(Overlord) <ce....@amd.com>
Sent: Monday, August 18, 2025 22:58
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao <tao.zh...@amd.com>; Yang, Stanley <stanley.y...@amd.com>; 
Zhang, Hawking <hawking.zh...@amd.com>; Wang, Yang(Kevin) 
<kevinyang.w...@amd.com>; Chai, Thomas <yipeng.c...@amd.com>; Sun, Ce(Overlord) 
<ce....@amd.com>
Subject: [PATCH 4/4 v5] drm/amdgpu: Correct the loss of aca bank reg info

Poll the ACA bank count to ensure that valid ACA bank reg info can be
obtained.

v2: add a corresponding delay before sending the message to the SMU to query
MCA bank info. (Stanley)

v3: fix the case where the loop cannot exit. (Thomas)

v4: remove amdgpu_aca_clear_bank_count. (Kevin)

v5: when CEs are injected continuously, a creation interrupt that occurs at
that time causes bank reg info to be lost. (Thomas)
v5: delay each polling cycle by 5 ms. (Tao)

Signed-off-by: Ce Sun <cesun...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 74 ++++++++++++++-----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c |  1 +
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  |  9 ++-
 4 files changed, 51 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 31850a47a41f..a779336e2fd3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -122,7 +122,7 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
 #define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL)

-#define MAX_UMC_POISON_POLLING_TIME_ASYNC  300  //ms
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC  200

 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms

@@ -131,6 +131,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
 #define BYPASS_ALLOCATED_ADDRESS        0x0
 #define BYPASS_INITIALIZATION_ADDRESS   0x1

+#define MAX_BANK_COUNT 12
+
 enum amdgpu_ras_retire_page_reservation {
        AMDGPU_RAS_RETIRE_PAGE_RESERVED,
        AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -3306,8 +3308,8 @@ static void amdgpu_ras_ecc_log_init(struct 
ras_ecc_log_info *ecc_log)
        mutex_init(&ecc_log->lock);

        INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
-       ecc_log->de_queried_count = 0;
-       ecc_log->prev_de_queried_count = 0;
+       ecc_log->consumption_q_count = 0;
+       ecc_log->creation_de_count = 0;
 }

 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
@@ -3326,8 +3328,8 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
        mutex_unlock(&ecc_log->lock);

        mutex_destroy(&ecc_log->lock);
-       ecc_log->de_queried_count = 0;
-       ecc_log->prev_de_queried_count = 0;
+       ecc_log->consumption_q_count = 0;
+       ecc_log->creation_de_count = 0;
 }
 #endif

@@ -3381,49 +3383,50 @@ static int amdgpu_ras_poison_creation_handler(struct 
amdgpu_device *adev,
                                uint32_t poison_creation_count)
 {
        int ret = 0;
-       struct ras_ecc_log_info *ecc_log;
        struct ras_query_if info;
-       uint32_t timeout = 0;
+       struct ras_ecc_log_info *ecc_log;
+       uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-       uint64_t de_queried_count;
-       uint32_t new_detect_count, total_detect_count;
-       uint32_t need_query_count = poison_creation_count;
+       uint64_t creation_de_count = 0;
+       uint64_t consumption_q_count = 0;
        enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
+       uint64_t bank_count = 0;
+       uint32_t new_detect_count, total_detect_count;
+       uint64_t pre_bank_count = 0;

        memset(&info, 0, sizeof(info));
        info.head.block = AMDGPU_RAS_BLOCK__UMC;
-
        ecc_log = &ras->umc_ecc_log;
-       total_detect_count = 0;
+
        do {
                ret = amdgpu_ras_query_error_status_with_event(adev, &info, 
type);
                if (ret)
                        return ret;
-
-               de_queried_count = ecc_log->de_queried_count;
-               if (de_queried_count > ecc_log->prev_de_queried_count) {
-                       new_detect_count = de_queried_count - 
ecc_log->prev_de_queried_count;
-                       ecc_log->prev_de_queried_count = de_queried_count;
-                       timeout = 0;
+               creation_de_count = ecc_log->creation_de_count;
+               consumption_q_count = ecc_log->consumption_q_count;
+
+               bank_count = amdgpu_aca_get_bank_count(adev);
+               if (bank_count > pre_bank_count) {
+                       new_detect_count = bank_count - pre_bank_count;
+                       pre_bank_count = bank_count;
+                       total_detect_count += bank_count;
+                       timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
                } else {
-                       new_detect_count = 0;
+                       --timeout;
+                       msleep(5);
                }

-               if (new_detect_count) {
-                       total_detect_count += new_detect_count;
-               } else {
-                       if (!timeout && need_query_count)
-                               timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
+               if (creation_de_count && consumption_q_count)
+                       break;

-                       if (timeout) {
-                               if (!--timeout)
-                                       break;
-                               msleep(1);
-                       }
-               }
-       } while (total_detect_count < need_query_count);
+               if (total_detect_count >= MAX_BANK_COUNT && consumption_q_count)
+                       break;
+       } while (timeout);

-       if (total_detect_count)
+       ecc_log->creation_de_count = 0;
+       ecc_log->consumption_q_count = 0;
+
+       if (creation_de_count)
                schedule_delayed_work(&ras->page_retirement_dwork, 0);

        return 0;
@@ -3516,7 +3519,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
                                atomic_sub(poison_creation_count, 
&con->poison_creation_count);
                                atomic_sub(poison_creation_count, 
&con->page_retirement_req_cnt);
                        }
-               } while (atomic_read(&con->poison_creation_count));
+               } while (atomic_read(&con->poison_creation_count) && !atomic_read(&con->poison_consumption_count));

 #ifdef HAVE_KFIFO_PUT_NON_POINTER
                if (ret != -EIO) {
@@ -3534,6 +3537,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
                        /* gpu mode-1 reset is ongoing or just completed ras 
mode-1 reset */
                        /* Clear poison creation request */
                        atomic_set(&con->poison_creation_count, 0);
+                       atomic_set(&con->poison_consumption_count, 0);

                        /* Clear poison fifo */
                        amdgpu_ras_clear_poison_fifo(adev);
@@ -3558,6 +3562,8 @@ static int amdgpu_ras_page_retirement_thread(void *param)
                                atomic_sub(msg_count, 
&con->page_retirement_req_cnt);
                        }

+                       atomic_set(&con->poison_consumption_count, 0);
+
                        /* Wake up work to save bad pages to eeprom */
                        schedule_delayed_work(&con->page_retirement_dwork, 0);
                }
@@ -3663,6 +3669,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, 
bool init_bp_info)
        init_waitqueue_head(&con->page_retirement_wq);
        atomic_set(&con->page_retirement_req_cnt, 0);
        atomic_set(&con->poison_creation_count, 0);
+       atomic_set(&con->poison_consumption_count, 0);
        con->page_retirement_thread =
                kthread_run(amdgpu_ras_page_retirement_thread, adev, 
"umc_page_retirement");
        if (IS_ERR(con->page_retirement_thread)) {
@@ -3723,6 +3730,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)

        atomic_set(&con->page_retirement_req_cnt, 0);
        atomic_set(&con->poison_creation_count, 0);
+       atomic_set(&con->poison_consumption_count, 0);

        mutex_destroy(&con->page_rsv_lock);

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 6265dac0e1c0..f5797e53f966 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -492,8 +492,8 @@ struct ras_ecc_err {
 struct ras_ecc_log_info {
        struct mutex lock;
        struct radix_tree_root de_page_tree;
-       uint64_t        de_queried_count;
-       uint64_t        prev_de_queried_count;
+       uint64_t consumption_q_count;
+       uint64_t creation_de_count;
 };

 struct amdgpu_ras {
@@ -557,6 +557,7 @@ struct amdgpu_ras {
        struct mutex page_retirement_lock;
        atomic_t page_retirement_req_cnt;
        atomic_t poison_creation_count;
+       atomic_t poison_consumption_count;
        struct mutex page_rsv_lock;
        DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
        struct ras_ecc_log_info  umc_ecc_log;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index bfc86f1e84e5..983a428eddd4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -254,6 +254,7 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device 
*adev,
                        if (!ret) {
 #endif
                                atomic_inc(&con->page_retirement_req_cnt);
+                               atomic_inc(&con->poison_consumption_count);
                                wake_up(&con->page_retirement_wq);
 #ifdef HAVE_KFIFO_PUT_NON_POINTER
                        }
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index e590cbdd8de9..69c22bdaed3b 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -536,8 +536,11 @@ static int umc_v12_0_update_ecc_status(struct 
amdgpu_device *adev,
        hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
        mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);

-       if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0))
+       /* The IP block decode of consumption is SMU */
+       if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0)) {
+               con->umc_ecc_log.consumption_q_count++;
                return 0;
+       }

        if (!status)
                return 0;
@@ -582,7 +585,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device 
*adev,
        ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, 
ecc_err);
        if (ret) {
                if (ret == -EEXIST)
-                       con->umc_ecc_log.de_queried_count++;
+                       con->umc_ecc_log.creation_de_count++;
                else
                        dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", 
ret);

@@ -590,7 +593,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device 
*adev,
                return ret;
        }

-       con->umc_ecc_log.de_queried_count++;
+       con->umc_ecc_log.creation_de_count++;

        memset(page_pfn, 0, sizeof(page_pfn));
        count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
--
2.34.1

RE: [PATCH 4/4 v5] drm/amdgpu: Correct the loss of aca bank reg info

Reply via email to