[AMD Official Use Only - AMD Internal Distribution Only]
Series is Reviewed-by: Hawking Zhang <hawking.zh...@amd.com<mailto:hawking.zh...@amd.com>> Copying @Li, Candice<mailto:candice...@amd.com> for awareness. Regards, Hawking -----Original Message----- From: Sun, Ce(Overlord) <ce....@amd.com> Sent: Monday, August 18, 2025 22:58 To: amd-gfx@lists.freedesktop.org Cc: Zhou1, Tao <tao.zh...@amd.com>; Yang, Stanley <stanley.y...@amd.com>; Zhang, Hawking <hawking.zh...@amd.com>; Wang, Yang(Kevin) <kevinyang.w...@amd.com>; Chai, Thomas <yipeng.c...@amd.com>; Sun, Ce(Overlord) <ce....@amd.com> Subject: [PATCH 4/4 v5] drm/amdgpu: Correct the loss of aca bank reg info Poll the ACA bank count to ensure that valid ACA bank reg info can be obtained. v2: add corresponding delay before sending msg to SMU to query mca bank info. (Stanley) v3: the loop cannot exit. (Thomas) v4: remove amdgpu_aca_clear_bank_count. (Kevin) v5: continuously inject ce. If a creation interruption occurs at this time, bank reg info will be lost. (Thomas) v5: each cycle is delayed by 5ms. 
(Tao) Signed-off-by: Ce Sun <cesun...@amd.com<mailto:cesun...@amd.com>> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 74 ++++++++++++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 +- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 1 + drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 9 ++- 4 files changed, 51 insertions(+), 38 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 31850a47a41f..a779336e2fd3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -122,7 +122,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) -#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms +#define MAX_UMC_POISON_POLLING_TIME_ASYNC 200 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms @@ -131,6 +131,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) #define BYPASS_ALLOCATED_ADDRESS 0x0 #define BYPASS_INITIALIZATION_ADDRESS 0x1 +#define MAX_BANK_COUNT 12 + enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -3306,8 +3308,8 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) mutex_init(&ecc_log->lock); INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); - ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; + ecc_log->consumption_q_count = 0; + ecc_log->creation_de_count = 0; } static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -3326,8 +3328,8 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) mutex_unlock(&ecc_log->lock); mutex_destroy(&ecc_log->lock); - ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; + ecc_log->consumption_q_count = 0; + ecc_log->creation_de_count = 0; } #endif @@ -3381,49 +3383,50 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, 
uint32_t poison_creation_count) { int ret = 0; - struct ras_ecc_log_info *ecc_log; struct ras_query_if info; - uint32_t timeout = 0; + struct ras_ecc_log_info *ecc_log; + uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - uint64_t de_queried_count; - uint32_t new_detect_count, total_detect_count; - uint32_t need_query_count = poison_creation_count; + uint64_t creation_de_count = 0; + uint64_t consumption_q_count = 0; enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; + uint64_t bank_count = 0; + uint32_t new_detect_count, total_detect_count; + uint64_t pre_bank_count = 0; memset(&info, 0, sizeof(info)); info.head.block = AMDGPU_RAS_BLOCK__UMC; - ecc_log = &ras->umc_ecc_log; - total_detect_count = 0; + do { ret = amdgpu_ras_query_error_status_with_event(adev, &info, type); if (ret) return ret; - - de_queried_count = ecc_log->de_queried_count; - if (de_queried_count > ecc_log->prev_de_queried_count) { - new_detect_count = de_queried_count - ecc_log->prev_de_queried_count; - ecc_log->prev_de_queried_count = de_queried_count; - timeout = 0; + creation_de_count = ecc_log->creation_de_count; + consumption_q_count = ecc_log->consumption_q_count; + + bank_count = amdgpu_aca_get_bank_count(adev); + if (bank_count > pre_bank_count) { + new_detect_count = bank_count - pre_bank_count; + pre_bank_count = bank_count; + total_detect_count += bank_count; + timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; } else { - new_detect_count = 0; + --timeout; + msleep(5); } - if (new_detect_count) { - total_detect_count += new_detect_count; - } else { - if (!timeout && need_query_count) - timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; + if (creation_de_count && consumption_q_count) + break; - if (timeout) { - if (!--timeout) - break; - msleep(1); - } - } - } while (total_detect_count < need_query_count); + if (total_detect_count >= MAX_BANK_COUNT && consumption_q_count) + break; + } while (timeout); - if (total_detect_count) + 
ecc_log->creation_de_count = 0; + ecc_log->consumption_q_count = 0; + + if (creation_de_count) schedule_delayed_work(&ras->page_retirement_dwork, 0); return 0; @@ -3516,7 +3519,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_sub(poison_creation_count, &con->poison_creation_count); atomic_sub(poison_creation_count, &con->page_retirement_req_cnt); } - } while (atomic_read(&con->poison_creation_count)); + } while (atomic_read(&con->poison_creation_count) && +!atomic_read(&con->poison_consumption_count)); #ifdef HAVE_KFIFO_PUT_NON_POINTER if (ret != -EIO) { @@ -3534,6 +3537,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */ /* Clear poison creation request */ atomic_set(&con->poison_creation_count, 0); + atomic_set(&con->poison_consumption_count, 0); /* Clear poison fifo */ amdgpu_ras_clear_poison_fifo(adev); @@ -3558,6 +3562,8 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_sub(msg_count, &con->page_retirement_req_cnt); } + atomic_set(&con->poison_consumption_count, 0); + /* Wake up work to save bad pages to eeprom */ schedule_delayed_work(&con->page_retirement_dwork, 0); } @@ -3663,6 +3669,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) init_waitqueue_head(&con->page_retirement_wq); atomic_set(&con->page_retirement_req_cnt, 0); atomic_set(&con->poison_creation_count, 0); + atomic_set(&con->poison_consumption_count, 0); con->page_retirement_thread = kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement"); if (IS_ERR(con->page_retirement_thread)) { @@ -3723,6 +3730,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) atomic_set(&con->page_retirement_req_cnt, 0); atomic_set(&con->poison_creation_count, 0); + atomic_set(&con->poison_consumption_count, 0); mutex_destroy(&con->page_rsv_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 6265dac0e1c0..f5797e53f966 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -492,8 +492,8 @@ struct ras_ecc_err { struct ras_ecc_log_info { struct mutex lock; struct radix_tree_root de_page_tree; - uint64_t de_queried_count; - uint64_t prev_de_queried_count; + uint64_t consumption_q_count; + uint64_t creation_de_count; }; struct amdgpu_ras { @@ -557,6 +557,7 @@ struct amdgpu_ras { struct mutex page_retirement_lock; atomic_t page_retirement_req_cnt; atomic_t poison_creation_count; + atomic_t poison_consumption_count; struct mutex page_rsv_lock; DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128); struct ras_ecc_log_info umc_ecc_log; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index bfc86f1e84e5..983a428eddd4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -254,6 +254,7 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev, if (!ret) { #endif atomic_inc(&con->page_retirement_req_cnt); + atomic_inc(&con->poison_consumption_count); wake_up(&con->page_retirement_wq); #ifdef HAVE_KFIFO_PUT_NON_POINTER } diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index e590cbdd8de9..69c22bdaed3b 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -536,8 +536,11 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); - if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0)) + /* The IP block decode of consumption is SMU */ + if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0)) { + con->umc_ecc_log.consumption_q_count++; return 0; + } if (!status) return 0; @@ -582,7 +585,7 @@ static int umc_v12_0_update_ecc_status(struct 
amdgpu_device *adev, ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err); if (ret) { if (ret == -EEXIST) - con->umc_ecc_log.de_queried_count++; + con->umc_ecc_log.creation_de_count++; else dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret); @@ -590,7 +593,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, return ret; } - con->umc_ecc_log.de_queried_count++; + con->umc_ecc_log.creation_de_count++; memset(page_pfn, 0, sizeof(page_pfn)); count = amdgpu_umc_lookup_bad_pages_in_a_row(adev, -- 2.34.1