Poll the ACA bank count to ensure that valid ACA bank register info can be obtained.
v2: add corresponding delay before send msg to SMU to query mca bank info. (Stanley) Signed-off-by: Ce Sun <cesun...@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 44 +++++++------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 -- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 7 +--- 4 files changed, 14 insertions(+), 41 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 92c2370831b3..2beaf30ccb96 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -877,7 +877,7 @@ size_t amdgpu_aca_get_bank_count(struct amdgpu_device *adev) void amdgpu_aca_clear_bank_count(struct amdgpu_device *adev) { - atomic64_set(&aca->bank_count, 0); + atomic64_set(&adev->aca.bank_count, 0); } #if defined(CONFIG_DEBUG_FS) static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 185b9e538f98..23f583492bfa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3306,8 +3306,6 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) mutex_init(&ecc_log->lock); INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); - ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; } static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -3326,8 +3324,6 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) mutex_unlock(&ecc_log->lock); mutex_destroy(&ecc_log->lock); - ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; } #endif @@ -3381,49 +3377,33 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, uint32_t poison_creation_count) { int ret = 0; - struct ras_ecc_log_info *ecc_log; struct ras_query_if info; - uint32_t timeout = 0; + uint32_t timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; 
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - uint64_t de_queried_count; - uint32_t new_detect_count, total_detect_count; - uint32_t need_query_count = poison_creation_count; enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; + uint64_t prev_de_queried_count = 0; + uint64_t bank_count = 0; memset(&info, 0, sizeof(info)); info.head.block = AMDGPU_RAS_BLOCK__UMC; - ecc_log = &ras->umc_ecc_log; - total_detect_count = 0; do { ret = amdgpu_ras_query_error_status_with_event(adev, &info, type); if (ret) return ret; - de_queried_count = ecc_log->de_queried_count; - if (de_queried_count > ecc_log->prev_de_queried_count) { - new_detect_count = de_queried_count - ecc_log->prev_de_queried_count; - ecc_log->prev_de_queried_count = de_queried_count; - timeout = 0; + bank_count = amdgpu_aca_get_bank_count(adev); + if (bank_count) { + prev_de_queried_count = bank_count; + amdgpu_aca_clear_bank_count(adev); + timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; } else { - new_detect_count = 0; - } - - if (new_detect_count) { - total_detect_count += new_detect_count; - } else { - if (!timeout && need_query_count) - timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; - - if (timeout) { - if (!--timeout) - break; - msleep(1); - } + --timeout; + msleep(1); } - } while (total_detect_count < need_query_count); + } while (timeout); - if (total_detect_count) + if (prev_de_queried_count) schedule_delayed_work(&ras->page_retirement_dwork, 0); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 7f10a7402160..df93791eb645 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -492,8 +492,6 @@ struct ras_ecc_err { struct ras_ecc_log_info { struct mutex lock; struct radix_tree_root de_page_tree; - uint64_t de_queried_count; - uint64_t prev_de_queried_count; }; struct amdgpu_ras { diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 
e590cbdd8de9..b3bdcf70df2f 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -581,17 +581,12 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err); if (ret) { - if (ret == -EEXIST) - con->umc_ecc_log.de_queried_count++; - else + if (ret != -EEXIST) dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret); - kfree(ecc_err); return ret; } - con->umc_ecc_log.de_queried_count++; - memset(page_pfn, 0, sizeof(page_pfn)); count = amdgpu_umc_lookup_bad_pages_in_a_row(adev, pa_addr, -- 2.34.1