Add ras aca parser v1.0. Signed-off-by: YiPeng Chai <yipeng.c...@amd.com> Reviewed-by: Tao Zhou <tao.zh...@amd.com> --- .../gpu/drm/amd/ras/rascore/ras_aca_v1_0.c | 379 ++++++++++++++++++ .../gpu/drm/amd/ras/rascore/ras_aca_v1_0.h | 71 ++++ 2 files changed, 450 insertions(+) create mode 100644 drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.c create mode 100644 drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.h
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.c new file mode 100644 index 000000000000..29df98948703 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_aca.h" +#include "ras_core_status.h" +#include "ras_aca_v1_0.h" + +struct ras_aca_hwip { + int hwid; + int mcatype; +}; + +static struct ras_aca_hwip aca_hwid_mcatypes[ACA_ECC_HWIP_COUNT] = { + [ACA_ECC_HWIP__SMU] = {0x01, 0x01}, + [ACA_ECC_HWIP__PCS_XGMI] = {0x50, 0x00}, + [ACA_ECC_HWIP__UMC] = {0x96, 0x00}, +}; + +static int aca_decode_bank_info(struct aca_block *aca_blk, + struct aca_bank_reg *bank, struct aca_ecc_info *info) +{ + u64 ipid; + u32 instidhi, instidlo; + + ipid = bank->regs[ACA_REG_IDX__IPID]; + info->hwid = ACA_REG_IPID_HARDWAREID(ipid); + info->mcatype = ACA_REG_IPID_MCATYPE(ipid); + /* + * Unified DieID Format: SAASS. A:AID, S:Socket. + * Unified DieID[4:4] = InstanceId[0:0] + * Unified DieID[0:3] = InstanceIdHi[0:3] + */ + instidhi = ACA_REG_IPID_INSTANCEIDHI(ipid); + instidlo = ACA_REG_IPID_INSTANCEIDLO(ipid); + info->die_id = ((instidhi >> 2) & 0x03); + info->socket_id = ((instidlo & 0x1) << 2) | (instidhi & 0x03); + + if ((aca_blk->blk_info->hwip == ACA_ECC_HWIP__SMU) && + (aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__GFX)) + info->xcd_id = + ((instidlo & GENMASK_ULL(31, 1)) == mmSMNAID_XCD0_MCA_SMU) ? 0 : 1; + + return 0; +} + +static bool aca_check_bank_hwip(struct aca_bank_reg *bank, enum aca_ecc_hwip type) +{ + struct ras_aca_hwip *hwip; + int hwid, mcatype; + u64 ipid; + + if (!bank || (type == ACA_ECC_HWIP__UNKNOWN)) + return false; + + hwip = &aca_hwid_mcatypes[type]; + if (!hwip->hwid) + return false; + + ipid = bank->regs[ACA_REG_IDX__IPID]; + hwid = ACA_REG_IPID_HARDWAREID(ipid); + mcatype = ACA_REG_IPID_MCATYPE(ipid); + + return hwip->hwid == hwid && hwip->mcatype == mcatype; +} + +static bool aca_match_bank_default(struct aca_block *aca_blk, void *data) +{ + return aca_check_bank_hwip((struct aca_bank_reg *)data, aca_blk->blk_info->hwip); +} + +static bool aca_match_gfx_bank(struct aca_block *aca_blk, void *data) +{ + struct aca_bank_reg *bank = (struct aca_bank_reg *)data; + u32 instlo; + + if (!aca_check_bank_hwip(bank, aca_blk->blk_info->hwip)) + return false; + + instlo = ACA_REG_IPID_INSTANCEIDLO(bank->regs[ACA_REG_IDX__IPID]); + instlo &= GENMASK_ULL(31, 1); + switch (instlo) { + case mmSMNAID_XCD0_MCA_SMU: + case mmSMNAID_XCD1_MCA_SMU: + case mmSMNXCD_XCD0_MCA_SMU: + return true; + default: + break; + } + + return false; +} + +static bool aca_match_sdma_bank(struct aca_block *aca_blk, void *data) +{ + struct aca_bank_reg *bank = (struct aca_bank_reg *)data; + /* CODE_SDMA0 - CODE_SDMA4, reference to smu driver if header file */ + static int sdma_err_codes[] = { 33, 34, 35, 36 }; + u32 instlo; + int errcode, i; + + if (!aca_check_bank_hwip(bank, aca_blk->blk_info->hwip)) + return false; + + instlo = ACA_REG_IPID_INSTANCEIDLO(bank->regs[ACA_REG_IDX__IPID]); + instlo &= GENMASK_ULL(31, 1); + if (instlo != mmSMNAID_AID0_MCA_SMU) + return false; + + errcode = ACA_REG_SYND_ERRORINFORMATION(bank->regs[ACA_REG_IDX__SYND]); + errcode &= 0xff; + + /* Check SDMA error codes */ + for (i = 0; i < ARRAY_SIZE(sdma_err_codes); i++) { + if (errcode == sdma_err_codes[i]) + return true; + } + + return false; +} + +static bool aca_match_mmhub_bank(struct aca_block *aca_blk, void *data) +{ + struct aca_bank_reg *bank = (struct aca_bank_reg *)data; + /* reference to smu driver if header file */ + const int mmhub_err_codes[] = { + 0, 1, 2, 3, 4, /* CODE_DAGB0 - 4 */ + 5, 6, 7, 8, 9, /* CODE_EA0 - 4 */ + 10, /* CODE_UTCL2_ROUTER */ + 11, /* CODE_VML2 */ + 12, /* CODE_VML2_WALKER */ + 13, /* CODE_MMCANE */ + }; + u32 instlo; + int errcode, i; + + if (!aca_check_bank_hwip(bank, aca_blk->blk_info->hwip)) + return false; + + instlo = ACA_REG_IPID_INSTANCEIDLO(bank->regs[ACA_REG_IDX__IPID]); + instlo &= GENMASK_ULL(31, 1); + if (instlo != mmSMNAID_AID0_MCA_SMU) + return false; + + errcode = ACA_REG_SYND_ERRORINFORMATION(bank->regs[ACA_REG_IDX__SYND]); + errcode &= 0xff; + + /* Check MMHUB error codes */ + for (i = 0; i < ARRAY_SIZE(mmhub_err_codes); i++) { + if (errcode == mmhub_err_codes[i]) + return true; + } + + return false; +} + +static bool aca_check_umc_de(struct ras_core_context *ras_core, uint64_t mc_umc_status) +{ + return (ras_core->poison_supported && + ACA_REG_STATUS_VAL(mc_umc_status) && + ACA_REG_STATUS_DEFERRED(mc_umc_status)); +} + +static bool aca_check_umc_ue(struct ras_core_context *ras_core, uint64_t mc_umc_status) +{ + if (aca_check_umc_de(ras_core, mc_umc_status)) + return false; + + return (ACA_REG_STATUS_VAL(mc_umc_status) && + (ACA_REG_STATUS_PCC(mc_umc_status) || + ACA_REG_STATUS_UC(mc_umc_status) || + ACA_REG_STATUS_TCC(mc_umc_status))); +} + +static bool aca_check_umc_ce(struct ras_core_context *ras_core, uint64_t mc_umc_status) +{ + if (aca_check_umc_de(ras_core, mc_umc_status)) + return false; + + return (ACA_REG_STATUS_VAL(mc_umc_status) && + (ACA_REG_STATUS_CECC(mc_umc_status) || + (ACA_REG_STATUS_UECC(mc_umc_status) && + ACA_REG_STATUS_UC(mc_umc_status) == 0) || + /* Identify data parity error in replay mode */ + ((ACA_REG_STATUS_ERRORCODEEXT(mc_umc_status) == 0x5 || + ACA_REG_STATUS_ERRORCODEEXT(mc_umc_status) == 0xb) && + !(aca_check_umc_ue(ras_core, mc_umc_status))))); +} + +static int aca_parse_umc_bank(struct ras_core_context *ras_core, + struct aca_block *ras_blk, void *data, void *buf) +{ + struct aca_bank_reg *bank = (struct aca_bank_reg *)data; + struct aca_bank_ecc *ecc = (struct aca_bank_ecc *)buf; + struct aca_ecc_info bank_info; + uint32_t ext_error_code; + uint64_t status0; + + status0 = bank->regs[ACA_REG_IDX__STATUS]; + if (!ACA_REG_STATUS_VAL(status0)) + return 0; + + memset(&bank_info, 0, sizeof(bank_info)); + aca_decode_bank_info(ras_blk, bank, &bank_info); + memcpy(&ecc->bank_info, &bank_info, sizeof(bank_info)); + ecc->bank_info.status = bank->regs[ACA_REG_IDX__STATUS]; + ecc->bank_info.ipid = bank->regs[ACA_REG_IDX__IPID]; + ecc->bank_info.addr = bank->regs[ACA_REG_IDX__ADDR]; + + ext_error_code = ACA_REG_STATUS_ERRORCODEEXT(status0); + + if (aca_check_umc_de(ras_core, status0)) + ecc->de_count = 1; + else if (aca_check_umc_ue(ras_core, status0)) + ecc->ue_count = ext_error_code ? + 1 : ACA_REG_MISC0_ERRCNT(bank->regs[ACA_REG_IDX__MISC0]); + else if (aca_check_umc_ce(ras_core, status0)) + ecc->ce_count = ext_error_code ? + 1 : ACA_REG_MISC0_ERRCNT(bank->regs[ACA_REG_IDX__MISC0]); + + return 0; +} + +static bool aca_check_bank_is_de(struct ras_core_context *ras_core, + uint64_t status) +{ + return (ACA_REG_STATUS_POISON(status) || + ACA_REG_STATUS_DEFERRED(status)); +} + +static int aca_parse_bank_default(struct ras_core_context *ras_core, + struct aca_block *ras_blk, + void *data, void *buf) +{ + struct aca_bank_reg *bank = (struct aca_bank_reg *)data; + struct aca_bank_ecc *ecc = (struct aca_bank_ecc *)buf; + struct aca_ecc_info bank_info; + u64 misc0 = bank->regs[ACA_REG_IDX__MISC0]; + u64 status = bank->regs[ACA_REG_IDX__STATUS]; + + memset(&bank_info, 0, sizeof(bank_info)); + aca_decode_bank_info(ras_blk, bank, &bank_info); + memcpy(&ecc->bank_info, &bank_info, sizeof(bank_info)); + ecc->bank_info.status = status; + ecc->bank_info.ipid = bank->regs[ACA_REG_IDX__IPID]; + ecc->bank_info.addr = bank->regs[ACA_REG_IDX__ADDR]; + + if (aca_check_bank_is_de(ras_core, status)) { + ecc->de_count = 1; + } else { + if (bank->ecc_type == RAS_ERR_TYPE__UE) + ecc->ue_count = 1; + else if (bank->ecc_type == RAS_ERR_TYPE__CE) + ecc->ce_count = ACA_REG_MISC0_ERRCNT(misc0); + } + + return 0; +} + +static int aca_parse_xgmi_bank(struct ras_core_context *ras_core, + struct aca_block *ras_blk, + void *data, void *buf) +{ + struct aca_bank_reg *bank = (struct aca_bank_reg *)data; + struct aca_bank_ecc *ecc = (struct aca_bank_ecc *)buf; + struct aca_ecc_info bank_info; + u64 status, count; + int ext_error_code; + + memset(&bank_info, 0, sizeof(bank_info)); + aca_decode_bank_info(ras_blk, bank, &bank_info); + memcpy(&ecc->bank_info, &bank_info, sizeof(bank_info)); + ecc->bank_info.status = bank->regs[ACA_REG_IDX__STATUS]; + ecc->bank_info.ipid = bank->regs[ACA_REG_IDX__IPID]; + ecc->bank_info.addr = bank->regs[ACA_REG_IDX__ADDR]; + + status = bank->regs[ACA_REG_IDX__STATUS]; + ext_error_code = ACA_REG_STATUS_ERRORCODEEXT(status); + + count = ACA_REG_MISC0_ERRCNT(bank->regs[ACA_REG_IDX__MISC0]); + if (bank->ecc_type == RAS_ERR_TYPE__UE) { + if (ext_error_code != 0 && ext_error_code != 9) + count = 0ULL; + ecc->ue_count = count; + } else if (bank->ecc_type == RAS_ERR_TYPE__CE) { + count = ext_error_code == 6 ? count : 0ULL; + ecc->ce_count = count; + } + + return 0; +} + +static const struct aca_block_info aca_v1_0_umc = { + .name = "umc", + .ras_block_id = RAS_BLOCK_ID__UMC, + .hwip = ACA_ECC_HWIP__UMC, + .mask = ACA_ERROR__UE_MASK | ACA_ERROR__CE_MASK | ACA_ERROR__DE_MASK, + .bank_ops = { + .bank_match = aca_match_bank_default, + .bank_parse = aca_parse_umc_bank, + }, +}; + +static const struct aca_block_info aca_v1_0_gfx = { + .name = "gfx", + .ras_block_id = RAS_BLOCK_ID__GFX, + .hwip = ACA_ECC_HWIP__SMU, + .mask = ACA_ERROR__UE_MASK | ACA_ERROR__CE_MASK, + .bank_ops = { + .bank_match = aca_match_gfx_bank, + .bank_parse = aca_parse_bank_default, + }, +}; + +static const struct aca_block_info aca_v1_0_sdma = { + .name = "sdma", + .ras_block_id = RAS_BLOCK_ID__SDMA, + .hwip = ACA_ECC_HWIP__SMU, + .mask = ACA_ERROR__UE_MASK, + .bank_ops = { + .bank_match = aca_match_sdma_bank, + .bank_parse = aca_parse_bank_default, + }, +}; + +static const struct aca_block_info aca_v1_0_mmhub = { + .name = "mmhub", + .ras_block_id = RAS_BLOCK_ID__MMHUB, + .hwip = ACA_ECC_HWIP__SMU, + .mask = ACA_ERROR__UE_MASK, + .bank_ops = { + .bank_match = aca_match_mmhub_bank, + .bank_parse = aca_parse_bank_default, + }, +}; + +static const struct aca_block_info aca_v1_0_xgmi = { + .name = "xgmi", + .ras_block_id = RAS_BLOCK_ID__XGMI_WAFL, + .hwip = ACA_ECC_HWIP__PCS_XGMI, + .mask = ACA_ERROR__UE_MASK | ACA_ERROR__CE_MASK, + .bank_ops = { + .bank_match = aca_match_bank_default, + .bank_parse = aca_parse_xgmi_bank, + }, +}; + +static const struct aca_block_info *aca_block_info_v1_0[] = { + &aca_v1_0_umc, + &aca_v1_0_gfx, + &aca_v1_0_sdma, + &aca_v1_0_mmhub, + &aca_v1_0_xgmi, +}; + +const struct ras_aca_ip_func ras_aca_func_v1_0 = { + .block_num = ARRAY_SIZE(aca_block_info_v1_0), + .block_info = aca_block_info_v1_0, +}; diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.h b/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.h new file mode 100644 index 000000000000..40e5d94b037f --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __RAS_ACA_V1_0_H__ +#define __RAS_ACA_V1_0_H__ +#include "ras.h" + +#define ACA__REG__FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> l) +#define ACA_REG_STATUS_VAL(x) ACA__REG__FIELD(x, 63, 63) +#define ACA_REG_STATUS_OVERFLOW(x) ACA__REG__FIELD(x, 62, 62) +#define ACA_REG_STATUS_UC(x) ACA__REG__FIELD(x, 61, 61) +#define ACA_REG_STATUS_EN(x) ACA__REG__FIELD(x, 60, 60) +#define ACA_REG_STATUS_MISCV(x) ACA__REG__FIELD(x, 59, 59) +#define ACA_REG_STATUS_ADDRV(x) ACA__REG__FIELD(x, 58, 58) +#define ACA_REG_STATUS_PCC(x) ACA__REG__FIELD(x, 57, 57) +#define ACA_REG_STATUS_ERRCOREIDVAL(x) ACA__REG__FIELD(x, 56, 56) +#define ACA_REG_STATUS_TCC(x) ACA__REG__FIELD(x, 55, 55) +#define ACA_REG_STATUS_SYNDV(x) ACA__REG__FIELD(x, 53, 53) +#define ACA_REG_STATUS_CECC(x) ACA__REG__FIELD(x, 46, 46) +#define ACA_REG_STATUS_UECC(x) ACA__REG__FIELD(x, 45, 45) +#define ACA_REG_STATUS_DEFERRED(x) ACA__REG__FIELD(x, 44, 44) +#define ACA_REG_STATUS_POISON(x) ACA__REG__FIELD(x, 43, 43) +#define ACA_REG_STATUS_SCRUB(x) ACA__REG__FIELD(x, 40, 40) +#define ACA_REG_STATUS_ERRCOREID(x) ACA__REG__FIELD(x, 37, 32) +#define ACA_REG_STATUS_ADDRLSB(x) ACA__REG__FIELD(x, 29, 24) +#define ACA_REG_STATUS_ERRORCODEEXT(x) ACA__REG__FIELD(x, 21, 16) +#define ACA_REG_STATUS_ERRORCODE(x) ACA__REG__FIELD(x, 15, 0) + +#define ACA_REG_IPID_MCATYPE(x) ACA__REG__FIELD(x, 63, 48) +#define ACA_REG_IPID_INSTANCEIDHI(x) ACA__REG__FIELD(x, 47, 44) +#define ACA_REG_IPID_HARDWAREID(x) ACA__REG__FIELD(x, 43, 32) +#define ACA_REG_IPID_INSTANCEIDLO(x) ACA__REG__FIELD(x, 31, 0) + +#define ACA_REG_MISC0_VALID(x) ACA__REG__FIELD(x, 63, 63) +#define ACA_REG_MISC0_OVRFLW(x) ACA__REG__FIELD(x, 48, 48) +#define ACA_REG_MISC0_ERRCNT(x) ACA__REG__FIELD(x, 43, 32) + +#define ACA_REG_SYND_ERRORINFORMATION(x) ACA__REG__FIELD(x, 17, 0) + +/* NOTE: The following codes refers to the smu header file */ +#define ACA_EXTERROR_CODE_CE 0x3a +#define ACA_EXTERROR_CODE_FAULT 0x3b + +#define mmSMNAID_XCD0_MCA_SMU 0x36430400 /* SMN AID XCD0 */ +#define mmSMNAID_XCD1_MCA_SMU 0x38430400 /* SMN AID XCD1 */ +#define mmSMNXCD_XCD0_MCA_SMU 0x40430400 /* SMN XCD XCD0 */ +#define mmSMNAID_AID0_MCA_SMU 0x03b30400 /* SMN AID AID0 */ + +extern const struct ras_aca_ip_func ras_aca_func_v1_0; +#endif -- 2.34.1