Record UMC error address information (the MCA status, IPID and address
registers) when logging MCA errors for the UMC block.

V2:
  Move the err_addr variable from struct ras_err_node to
  struct ras_err_info.

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c  | 13 +++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 22 +++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  | 13 +++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c   |  4 ++--
 8 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index 210aea590a52..8911310f98df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -218,6 +218,7 @@ static void amdgpu_mca_smu_mca_bank_dump(struct 
amdgpu_device *adev, int idx, st
 int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum 
amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data 
*err_data)
 {
        struct amdgpu_smuio_mcm_config_info mcm_info;
+       struct ras_err_addr err_addr = {0};
        struct mca_bank_set mca_set;
        struct mca_bank_node *node;
        struct mca_bank_entry *entry;
@@ -246,10 +247,18 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device 
*adev, enum amdgpu_ras_blo
                mcm_info.socket_id = entry->info.socket_id;
                mcm_info.die_id = entry->info.aid;
 
+               if (blk == AMDGPU_RAS_BLOCK__UMC) {
+                       err_addr.err_status = entry->regs[MCA_REG_IDX_STATUS];
+                       err_addr.err_ipid = entry->regs[MCA_REG_IDX_IPID];
+                       err_addr.err_addr = entry->regs[MCA_REG_IDX_ADDR];
+               }
+
                if (type == AMDGPU_MCA_ERROR_TYPE_UE)
-                       amdgpu_ras_error_statistic_ue_count(err_data, 
&mcm_info, (uint64_t)count);
+                       amdgpu_ras_error_statistic_ue_count(err_data,
+                               &mcm_info, &err_addr, (uint64_t)count);
                else
-                       amdgpu_ras_error_statistic_ce_count(err_data, 
&mcm_info, (uint64_t)count);
+                       amdgpu_ras_error_statistic_ce_count(err_data,
+                               &mcm_info, &err_addr, (uint64_t)count);
        }
 
 out_mca_release:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index bacb59d8b701..bad62141f708 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1156,8 +1156,10 @@ static void 
amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
                for_each_ras_error(err_node, err_data) {
                        err_info = &err_node->err_info;
 
-                       amdgpu_ras_error_statistic_ce_count(&obj->err_data, 
&err_info->mcm_info, err_info->ce_count);
-                       amdgpu_ras_error_statistic_ue_count(&obj->err_data, 
&err_info->mcm_info, err_info->ue_count);
+                       amdgpu_ras_error_statistic_ce_count(&obj->err_data,
+                                       &err_info->mcm_info, NULL, 
err_info->ce_count);
+                       amdgpu_ras_error_statistic_ue_count(&obj->err_data,
+                                       &err_info->mcm_info, NULL, 
err_info->ue_count);
                }
        } else {
                /* for legacy asic path which doesn't has error source info */
@@ -3691,7 +3693,8 @@ static int ras_err_info_cmp(void *priv, const struct 
list_head *a, const struct
 }
 
 static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data 
*err_data,
-                                                     struct 
amdgpu_smuio_mcm_config_info *mcm_info)
+                               struct amdgpu_smuio_mcm_config_info *mcm_info,
+                               struct ras_err_addr *err_addr)
 {
        struct ras_err_node *err_node;
 
@@ -3705,6 +3708,9 @@ static struct ras_err_info 
*amdgpu_ras_error_get_info(struct ras_err_data *err_d
 
        memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
 
+       if (err_addr)
+               memcpy(&err_node->err_info.err_addr, err_addr, 
sizeof(*err_addr));
+
        err_data->err_list_count++;
        list_add_tail(&err_node->node, &err_data->err_node_list);
        list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp);
@@ -3713,7 +3719,8 @@ static struct ras_err_info 
*amdgpu_ras_error_get_info(struct ras_err_data *err_d
 }
 
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
-                                       struct amdgpu_smuio_mcm_config_info 
*mcm_info, u64 count)
+               struct amdgpu_smuio_mcm_config_info *mcm_info,
+               struct ras_err_addr *err_addr, u64 count)
 {
        struct ras_err_info *err_info;
 
@@ -3723,7 +3730,7 @@ int amdgpu_ras_error_statistic_ue_count(struct 
ras_err_data *err_data,
        if (!count)
                return 0;
 
-       err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+       err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
        if (!err_info)
                return -EINVAL;
 
@@ -3734,7 +3741,8 @@ int amdgpu_ras_error_statistic_ue_count(struct 
ras_err_data *err_data,
 }
 
 int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
-                                       struct amdgpu_smuio_mcm_config_info 
*mcm_info, u64 count)
+               struct amdgpu_smuio_mcm_config_info *mcm_info,
+               struct ras_err_addr *err_addr, u64 count)
 {
        struct ras_err_info *err_info;
 
@@ -3744,7 +3752,7 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,
        if (!count)
                return 0;
 
-       err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+       err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
        if (!err_info)
                return -EINVAL;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 6a941eb8fb8f..76fb85628716 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -452,10 +452,17 @@ struct ras_fs_data {
        char debugfs_name[32];
 };
 
+struct ras_err_addr {
+       uint64_t err_status;
+       uint64_t err_ipid;
+       uint64_t err_addr;
+};
+
 struct ras_err_info {
        struct amdgpu_smuio_mcm_config_info mcm_info;
        u64 ce_count;
        u64 ue_count;
+       struct ras_err_addr err_addr;
 };
 
 struct ras_err_node {
@@ -806,8 +813,10 @@ void amdgpu_ras_inst_reset_ras_error_count(struct 
amdgpu_device *adev,
 int amdgpu_ras_error_data_init(struct ras_err_data *err_data);
 void amdgpu_ras_error_data_fini(struct ras_err_data *err_data);
 int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
-                                       struct amdgpu_smuio_mcm_config_info 
*mcm_info, u64 count);
+               struct amdgpu_smuio_mcm_config_info *mcm_info,
+               struct ras_err_addr *err_addr, u64 count);
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
-                                       struct amdgpu_smuio_mcm_config_info 
*mcm_info, u64 count);
+               struct amdgpu_smuio_mcm_config_info *mcm_info,
+               struct ras_err_addr *err_addr, u64 count);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 9a95b9f226b8..a6c88f2fe6e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1313,10 +1313,10 @@ static void __xgmi_v6_4_0_query_error_count(struct 
amdgpu_device *adev, struct a
 
        switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) {
        case AMDGPU_MCA_ERROR_TYPE_UE:
-               amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL);
+               amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, NULL, 
1ULL);
                break;
        case AMDGPU_MCA_ERROR_TYPE_CE:
-               amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL);
+               amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, NULL, 
1ULL);
                break;
        default:
                break;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 00b21ece081f..131cddbdda0d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3828,8 +3828,8 @@ static void gfx_v9_4_3_inst_query_ras_err_count(struct 
amdgpu_device *adev,
        /* the caller should make sure initialize value of
         * err_data->ue_count and err_data->ce_count
         */
-       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
-       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
+       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, 
ue_count);
+       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, 
ce_count);
 }
 
 static void gfx_v9_4_3_inst_reset_ras_err_count(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index 9b0146732e13..fb53aacdcba2 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
@@ -652,8 +652,8 @@ static void mmhub_v1_8_inst_query_ras_error_count(struct 
amdgpu_device *adev,
                                        AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
                                        &ue_count);
 
-       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
-       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
+       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, 
ce_count);
+       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, 
ue_count);
 }
 
 static void mmhub_v1_8_query_ras_error_count(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 0f24af6f2810..2d688dca26be 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -2156,7 +2156,7 @@ static void sdma_v4_4_2_inst_query_ras_error_count(struct 
amdgpu_device *adev,
                                        AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
                                        &ue_count);
 
-       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
+       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, 
ue_count);
 }
 
 static void sdma_v4_4_2_query_ras_error_count(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index e9c2ff74f0bc..8d60c39ae1c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -166,8 +166,8 @@ static int umc_v12_0_query_error_count(struct amdgpu_device 
*adev,
        umc_v12_0_query_correctable_error_count(adev, umc_reg_offset, 
&ce_count);
        umc_v12_0_query_uncorrectable_error_count(adev, umc_reg_offset, 
&ue_count);
 
-       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, ue_count);
-       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, ce_count);
+       amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, 
ue_count);
+       amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, 
ce_count);
 
        return 0;
 }
-- 
2.34.1

Reply via email to