Retire bad pages for umc v12_0.

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57 +++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6c2b61ef5b57..bd917eb6ea24 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -28,6 +28,8 @@
 #include "umc/umc_12_0_0_sh_mask.h"
 #include "mp/mp_13_0_6_sh_mask.h"
 
+#define MAX_ECC_NUM_PER_RETIREMENT  16 /* size of the per-call ECC entry lookup array */
+
 static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
                                            uint32_t node_inst,
                                            uint32_t umc_inst,
@@ -633,6 +635,58 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
        return 0;
 }
 
+/*
+ * umc_v12_0_fill_error_record - record every bad page of one logged ECC error
+ * @adev: amdgpu device pointer (not referenced by this helper)
+ * @ecc_err: logged ECC error whose err_pages list is turned into records
+ * @ras_error_status: opaque pointer, interpreted as struct ras_err_data
+ *
+ * Calls amdgpu_umc_fill_error_record() once for each page frame number in
+ * @ecc_err->err_pages and stops at the first call that fails.
+ *
+ * Return: -EINVAL if @ecc_err or @ras_error_status is NULL; otherwise the
+ * result of the last amdgpu_umc_fill_error_record() call, or 0 when the page
+ * list is empty.
+ */
+static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
+                               struct ras_ecc_err *ecc_err, void *ras_error_status)
+{
+       struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+       uint32_t i = 0;
+       int ret = 0;
+
+       if (!err_data || !ecc_err)
+               return -EINVAL;
+
+       for (i = 0; i < ecc_err->err_pages.count; i++) {
+               /*
+                * The pfn is shifted by AMDGPU_GPU_PAGE_SHIFT before being
+                * passed on (presumably yielding the page's byte address);
+                * channel and UMC instance are decoded from the MCA IPID.
+                */
+               ret = amdgpu_umc_fill_error_record(err_data,
+                               ecc_err->addr,
+                               ecc_err->err_pages.pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
+                               MCA_IPID_2_UMC_CH(ecc_err->ipid),
+                               MCA_IPID_2_UMC_INST(ecc_err->ipid));
+               if (ret)
+                       break;
+       }
+
+       /* Incremented once per ecc_err entry, even if the loop stopped early. */
+       err_data->de_count++;
+
+       return ret;
+}
+
+/*
+ * umc_v12_0_query_ras_ecc_err_addr - collect newly detected ECC error pages
+ * @adev: amdgpu device pointer
+ * @ras_error_status: opaque pointer forwarded to
+ *                    umc_v12_0_fill_error_record()
+ *
+ * Looks up at most MAX_ECC_NUM_PER_RETIREMENT entries carrying
+ * UMC_ECC_NEW_DETECTED_TAG in the RAS context's umc_ecc_log.de_page_tree,
+ * fills error records for each of them and clears the tag of every entry
+ * that was recorded successfully. The lookup and the updates run with
+ * umc_ecc_log.lock held.
+ *
+ * On the first failure the error is logged and processing stops; the failing
+ * entry and any entries after it keep their tag.
+ */
+static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
+                                       void *ras_error_status)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
+       struct radix_tree_root *ecc_tree;
+       int new_detected, ret, i;
+
+       ecc_tree = &con->umc_ecc_log.de_page_tree;
+
+       mutex_lock(&con->umc_ecc_log.lock);
+       /* Tagged lookup starting at index 0, bounded by the entries[] size. */
+       new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
+                       0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);
+       for (i = 0; i < new_detected; i++) {
+               if (!entries[i])
+                       continue;
+
+               ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status);
+               if (ret) {
+                       dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret);
+                       break;
+               }
+               /* hash_index is the entry's index in the tree; drop its tag. */
+               radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, UMC_ECC_NEW_DETECTED_TAG);
+       }
+       mutex_unlock(&con->umc_ecc_log.lock);
+}
+
 struct amdgpu_umc_ras umc_v12_0_ras = {
        .ras_block = {
                .hw_ops = &umc_v12_0_ras_hw_ops,
@@ -640,8 +694,7 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
        },
        .err_cnt_init = umc_v12_0_err_cnt_init,
        .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
-       .ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
-       .ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
+       .ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr,
        .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
        .update_ecc_status = umc_v12_0_update_ecc_status,
 };
-- 
2.34.1

Reply via email to