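Add delayed work to retire bad pages.

When the UMC ECC status query in the poison creation handler returns 0,
schedule page_retirement_dwork immediately. The work handler calls
amdgpu_umc_handle_bad_pages() and then reschedules itself every 100 ms
(AMDGPU_RAS_RETIRE_PAGE_INTERVAL) for as long as the ECC log radix tree
still holds entries tagged UMC_ECC_NEW_DETECTED_TAG. The handler returns
without doing anything while the device is in reset or RAS recovery, and
the work is cancelled synchronously in amdgpu_ras_recovery_fini().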

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 36 ++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  3 +++
 4 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 702229abe7ee..c1f146d3e28d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 
 #define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
 
+#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
+
 enum amdgpu_ras_retire_page_reservation {
        AMDGPU_RAS_RETIRE_PAGE_RESERVED,
        AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2766,6 +2768,30 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
        ecc_log->de_updated = false;
 }
 
+static void amdgpu_ras_do_page_retirement(struct work_struct *work)
+{
+       struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
+                                             page_retirement_dwork.work);
+       struct amdgpu_device *adev = con->adev;
+       struct ras_err_data err_data;
+
+       if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery))
+               return;
+
+       amdgpu_ras_error_data_init(&err_data);
+
+       amdgpu_umc_handle_bad_pages(adev, &err_data);
+
+       amdgpu_ras_error_data_fini(&err_data);
+
+       mutex_lock(&con->umc_ecc_log.lock);
+       if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
+                               UMC_ECC_NEW_DETECTED_TAG))
+               schedule_delayed_work(&con->page_retirement_dwork,
+                       msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
+       mutex_unlock(&con->umc_ecc_log.lock);
+}
+
 static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
                        enum amdgpu_ras_block ras_block, uint32_t timeout_ms)
 {
@@ -2804,7 +2830,12 @@ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
 static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
                                        uint32_t timeout)
 {
-       amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       int ret;
+
+       ret = amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
+       if (!ret)
+               schedule_delayed_work(&con->page_retirement_dwork, 0);
 }
 
 static int amdgpu_ras_page_retirement_thread(void *param)
@@ -2919,6 +2950,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
                dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");
        }
 
+       INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement);
        amdgpu_ras_ecc_log_init(&con->umc_ecc_log);
 #ifdef CONFIG_X86_MCE_AMD
 #ifdef HAVE_SMCA_UMC_V2
@@ -2967,6 +2999,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 
        cancel_work_sync(&con->recovery_work);
 
+       cancel_delayed_work_sync(&con->page_retirement_dwork);
+
        amdgpu_ras_ecc_log_fini(&con->umc_ecc_log);
 
        mutex_lock(&con->recovery_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 634654cf2634..cb5a0f31d201 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -537,6 +537,7 @@ struct amdgpu_ras {
        struct mutex page_rsv_lock;
        DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
        struct ras_ecc_log_info  umc_ecc_log;
+       struct delayed_work page_retirement_dwork;
 
        /* Fatal error detected flag */
        atomic_t fed;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 0f2d765c4e2d..2bd88218c20e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -89,7 +89,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
        return ret;
 }
 
-static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
                        void *ras_error_status)
 {
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index c83d24097c5c..2d08d076f7c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -133,4 +133,7 @@ int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
                uint64_t *pfns, int len, uint64_t *val);
 int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
                struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
+
+void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+                       void *ras_error_status);
 #endif
-- 
2.34.1

Reply via email to