[AMD Official Use Only - General]

+       if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) ||
+           (type == ACA_ERROR_TYPE_CE && ext_error_code == 6)) {
+               report->type = type;
+               report->count = 
ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
+       }

Gentle reminder that we should be able to extend the error logging to all the 
PCS errors. Just read back the config registers so we know which errors are 
configured as UE and which are configured as CE.

Regards,
Hawking


-----Original Message-----
From: Wang, Yang(Kevin) <[email protected]>
Sent: Wednesday, January 3, 2024 16:02
To: [email protected]
Cc: Zhang, Hawking <[email protected]>; Zhou1, Tao <[email protected]>; 
Chai, Thomas <[email protected]>; Wang, Yang(Kevin) <[email protected]>
Subject: [PATCH 11/14] drm/amdgpu: add xgmi v6.4.0 ACA support

add xgmi v6.4.0 ACA driver support

Signed-off-by: Yang Wang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 63 +++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index a6c88f2fe6e5..61208ca94442 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1035,15 +1035,76 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device 
*adev)
        return 0;
 }

+static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, 
struct aca_bank *bank, enum aca_error_type type,
+                                               struct aca_bank_report *report, 
void *data) {
+       struct amdgpu_device *adev = handle->adev;
+       const char *error_str;
+       u64 status;
+       int ret, ext_error_code;
+
+       ret = aca_bank_info_decode(bank, &report->info);
+       if (ret)
+               return ret;
+
+       status = bank->regs[MCA_REG_IDX_STATUS];
+       ext_error_code = MCA_REG__STATUS__ERRORCODEEXT(status);
+
+       error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) 
?
+               xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
+       if (error_str)
+               dev_info(adev->dev, "%s detected\n", error_str);
+
+       if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) ||
+           (type == ACA_ERROR_TYPE_CE && ext_error_code == 6)) {
+               report->type = type;
+               report->count = 
ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
+       }
+
+       return 0;
+}
+
+static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = {
+       .aca_bank_generate_report = xgmi_v6_4_0_aca_bank_generate_report,
+};
+
+static const struct aca_info xgmi_v6_4_0_aca_info = {
+       .hwip = ACA_HWIP_TYPE_PCS_XGMI,
+       .mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK,
+       .bank_ops = &xgmi_v6_4_0_aca_bank_ops, };
+
 static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct 
ras_common_if *ras_block)  {
+       int r;
+
        if (!adev->gmc.xgmi.supported ||
            adev->gmc.xgmi.num_physical_nodes == 0)
                return 0;

        amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);

-       return amdgpu_ras_block_late_init(adev, ras_block);
+       r = amdgpu_ras_block_late_init(adev, ras_block);
+       if (r)
+               return r;
+
+       switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+       case IP_VERSION(6, 4, 0):
+               r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL,
+                                       &xgmi_v6_4_0_aca_info, NULL);
+               if (r)
+                       goto late_fini;
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+
+late_fini:
+       amdgpu_ras_block_late_fini(adev, ras_block);
+
+       return r;
 }

 uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
--
2.34.1

Reply via email to