Poll ACA bank count to ensure that valid ACA bank register info can be obtained
v2: add corresponding delay before send msg to SMU to query mca bank info. (Stanley) v3: the loop cannot exit. (Thomas) Signed-off-by: Ce Sun <cesun...@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 65 +++++++++++++------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 12 +++-- 3 files changed, 44 insertions(+), 37 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 31850a47a41f..9ccc1fbca14f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -122,7 +122,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) -#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms +#define MAX_UMC_POISON_POLLING_TIME_ASYNC 50 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms @@ -131,6 +131,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) #define BYPASS_ALLOCATED_ADDRESS 0x0 #define BYPASS_INITIALIZATION_ADDRESS 0x1 +#define MAX_BANK_COUNT 12 + enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -3306,8 +3308,8 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) mutex_init(&ecc_log->lock); INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); - ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; + ecc_log->consumption_de_count = 0; + ecc_log->creation_de_count = 0; } static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -3326,8 +3328,8 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) mutex_unlock(&ecc_log->lock); mutex_destroy(&ecc_log->lock); - ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; + ecc_log->consumption_de_count = 0; + ecc_log->creation_de_count = 0; } #endif @@ -3381,49 +3383,48 @@ static int 
amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, uint32_t poison_creation_count) { int ret = 0; - struct ras_ecc_log_info *ecc_log; struct ras_query_if info; - uint32_t timeout = 0; + struct ras_ecc_log_info *ecc_log; + uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - uint64_t de_queried_count; - uint32_t new_detect_count, total_detect_count; - uint32_t need_query_count = poison_creation_count; + uint64_t creation_de_count = 0; + uint64_t consumption_de_count = 0; enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; + uint64_t bank_count = 0; + uint64_t total_bank_count = 0; memset(&info, 0, sizeof(info)); info.head.block = AMDGPU_RAS_BLOCK__UMC; - ecc_log = &ras->umc_ecc_log; - total_detect_count = 0; + do { ret = amdgpu_ras_query_error_status_with_event(adev, &info, type); if (ret) return ret; - - de_queried_count = ecc_log->de_queried_count; - if (de_queried_count > ecc_log->prev_de_queried_count) { - new_detect_count = de_queried_count - ecc_log->prev_de_queried_count; - ecc_log->prev_de_queried_count = de_queried_count; - timeout = 0; + creation_de_count = ecc_log->creation_de_count; + consumption_de_count = ecc_log->consumption_de_count; + + bank_count = amdgpu_aca_get_bank_count(adev); + if (bank_count) { + total_bank_count += bank_count; + amdgpu_aca_clear_bank_count(adev); + timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; } else { - new_detect_count = 0; + --timeout; + msleep(20); } - if (new_detect_count) { - total_detect_count += new_detect_count; - } else { - if (!timeout && need_query_count) - timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; + if (creation_de_count && consumption_de_count) + break; - if (timeout) { - if (!--timeout) - break; - msleep(1); - } - } - } while (total_detect_count < need_query_count); + if (total_bank_count >= MAX_BANK_COUNT) + break; + } while (timeout); + + ecc_log->creation_de_count = 0; + ecc_log->consumption_de_count = 0; - if 
(total_detect_count) + if (consumption_de_count) schedule_delayed_work(&ras->page_retirement_dwork, 0); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 6265dac0e1c0..b4eb427409ae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -492,8 +492,8 @@ struct ras_ecc_err { struct ras_ecc_log_info { struct mutex lock; struct radix_tree_root de_page_tree; - uint64_t de_queried_count; - uint64_t prev_de_queried_count; + uint64_t consumption_de_count; + uint64_t creation_de_count; }; struct amdgpu_ras { diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index e590cbdd8de9..11b99095efd3 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -536,8 +536,14 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); - if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0)) + /* only creation/consumption defer error can access here. + * MCA_UMC_HWID_V12_0/MCA_UMC_MCATYPE_V12_0. + * It is the hwid/mactype of the consumption defer error + * */ + if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0)) { + con->umc_ecc_log.creation_de_count++; return 0; + } if (!status) return 0; @@ -582,7 +588,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err); if (ret) { if (ret == -EEXIST) - con->umc_ecc_log.de_queried_count++; + con->umc_ecc_log.consumption_de_count++; else dev_err(adev->dev, "Fail to log ecc error! 
ret:%d\n", ret); @@ -590,7 +596,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, return ret; } - con->umc_ecc_log.de_queried_count++; + con->umc_ecc_log.consumption_de_count++; memset(page_pfn, 0, sizeof(page_pfn)); count = amdgpu_umc_lookup_bad_pages_in_a_row(adev, -- 2.34.1