RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors
[AMD Official Use Only - General] Good point, Le, will switch to the existing helper for the cross die access in v2. Regards, Hawking -Original Message- From: Ma, Le Sent: Tuesday, January 2, 2024 14:45 To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, Stanley ; Wang, Yang(Kevin) ; Chai, Thomas ; Li, Candice Cc: Zhang, Hawking ; Deucher, Alexander ; Lazar, Lijo Subject: RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors [AMD Official Use Only - General] > -Original Message- > From: Hawking Zhang > Sent: Tuesday, January 2, 2024 11:44 AM > To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; > Yang, Stanley ; Wang, Yang(Kevin) > ; Chai, Thomas ; Li, > Candice > Cc: Zhang, Hawking ; Deucher, Alexander > ; Lazar, Lijo ; Ma, Le > > Subject: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors > > Add ras helper function to query boot time gpu errors. > > Signed-off-by: Hawking Zhang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 + > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 > + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | > 15 +++- > 3 files changed, 112 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 616b6c911767..db44ec857a31 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device > *adev); #define WREG32_FIELD_OFFSET(reg, offset, field, val) \ > WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & > ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field)) > > +#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32) #define > +AMDGPU_SMN_CROSS_AID (1ULL << 34) #define AMDGPU_GET_REG_FIELD(x, > h, l) > +(((x) & GENMASK_ULL(h, l)) >> (l)) > /* > * BIOS helpers. 
> */ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 39399d0f2ce5..5f302b7693b3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct > ras_err_data *err_data, > > return 0; > } > + > +#define mmMP0_SMN_C2PMSG_92 0x1609C > +#define mmMP0_SMN_C2PMSG_126 0x160BE > +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device > *adev, > + u32 instance, u32 > +boot_error) { > + u32 socket_id, aid_id, hbm_id; > + u32 reg_data; > + u64 reg_addr; > + > + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); > + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); > + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); > + > + if (instance) > + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + > +AMDGPU_SMN_TARGET_AID(instance) + > +AMDGPU_SMN_CROSS_AID; Hi Hawking, We have asic function "aqua_vanjaram_encode_ext_smn_addressing" for this stuff, maybe it could also be re-used here. Thanks. 
> + else > + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2); > + > + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); > + dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, > + fw > status is 0x%x\n", > + socket_id, aid_id, reg_data); > + > + if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, > + memory > training failed\n", > + socket_id, aid_id, hbm_id); > + > + if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, firmware load > + failed > at boot time\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, wafl link > + training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, xgmi link > + training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, usr cp link > + training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, &
RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors
[AMD Official Use Only - General] > -Original Message- > From: Hawking Zhang > Sent: Tuesday, January 2, 2024 11:44 AM > To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, > Stanley ; Wang, Yang(Kevin) > ; Chai, Thomas ; Li, > Candice > Cc: Zhang, Hawking ; Deucher, Alexander > ; Lazar, Lijo ; Ma, Le > > Subject: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors > > Add ras helper function to query boot time gpu errors. > > Signed-off-by: Hawking Zhang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 + > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 > + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | > 15 +++- > 3 files changed, 112 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 616b6c911767..db44ec857a31 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev); > #define WREG32_FIELD_OFFSET(reg, offset, field, val) \ > WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & > ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field)) > > +#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32) #define > +AMDGPU_SMN_CROSS_AID (1ULL << 34) #define AMDGPU_GET_REG_FIELD(x, > h, l) > +(((x) & GENMASK_ULL(h, l)) >> (l)) > /* > * BIOS helpers. 
> */ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 39399d0f2ce5..5f302b7693b3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct > ras_err_data *err_data, > > return 0; > } > + > +#define mmMP0_SMN_C2PMSG_92 0x1609C > +#define mmMP0_SMN_C2PMSG_126 0x160BE > +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device > *adev, > + u32 instance, u32 boot_error) > +{ > + u32 socket_id, aid_id, hbm_id; > + u32 reg_data; > + u64 reg_addr; > + > + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); > + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); > + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); > + > + if (instance) > + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + > +AMDGPU_SMN_TARGET_AID(instance) + > +AMDGPU_SMN_CROSS_AID; Hi Hawking, We have asic function "aqua_vanjaram_encode_ext_smn_addressing" for this stuff, maybe it could also be re-used here. Thanks. 
> + else > + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2); > + > + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); > + dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw > status is 0x%x\n", > + socket_id, aid_id, reg_data); > + > + if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory > training failed\n", > + socket_id, aid_id, hbm_id); > + > + if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed > at boot time\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, wafl link training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm > memory test failed\n", > + socket_id, aid_id, hbm_id); > + > + if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist > test failed\n", > + socket_id, aid_id, hbm_id); > +} > + > +static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, > + u32 instance, u32 *boot_error) { > + u32 reg_addr; > + u32 reg_data; > + int retry_loop; > + > + if (instance) > + reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + > +AMDGPU_SMN_TARGET_AID(instance) + > +AMDGPU_SMN_CROSS_AID; > + else > +
RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors
[AMD Official Use Only - General] I was hoping the macro could be used for 64-bit registers as well, i.e., the ACA regs. Regards, Hawking -----Original Message----- From: Wang, Yang(Kevin) Sent: Tuesday, January 2, 2024 13:24 To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, Stanley ; Chai, Thomas ; Li, Candice Cc: Zhang, Hawking ; Deucher, Alexander ; Lazar, Lijo ; Ma, Le Subject: RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors [AMD Official Use Only - General] -----Original Message----- From: Hawking Zhang Sent: Tuesday, January 2, 2024 11:44 AM To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, Stanley ; Wang, Yang(Kevin) ; Chai, Thomas ; Li, Candice Cc: Zhang, Hawking ; Deucher, Alexander ; Lazar, Lijo ; Ma, Le Subject: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors Add ras helper function to query boot time gpu errors. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++- 3 files changed, 112 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 616b6c911767..db44ec857a31 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev); #define WREG32_FIELD_OFFSET(reg, offset, field, val) \ WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field)) +#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32) +#define AMDGPU_SMN_CROSS_AID (1ULL << 34) +#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l)) [kevin]: The macro GENMASK_ULL() returns a 64-bit mask value, but the register is 32 bits (in this patch). Do we need to change it to GENMASK(), or do you want to cover 64-bit register cases? Thanks. Best Regards, Kevin /* * BIOS helpers. 
*/ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 39399d0f2ce5..5f302b7693b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, return 0; } + +#define mmMP0_SMN_C2PMSG_92 0x1609C +#define mmMP0_SMN_C2PMSG_126 0x160BE +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, +u32 instance, u32 boot_error) +{ + u32 socket_id, aid_id, hbm_id; + u32 reg_data; + u64 reg_addr; + + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); + + if (instance) + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + + AMDGPU_SMN_TARGET_AID(instance) + + AMDGPU_SMN_CROSS_AID; + else + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2); + + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); + dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n", + socket_id, aid_id, reg_data); + + if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n", +socket_id, aid_id, hbm_id); + + if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, usr dp link 
training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n", +socket_id, aid_id, hbm_id); + + if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_err
RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors
[AMD Official Use Only - General] -----Original Message----- From: Hawking Zhang Sent: Tuesday, January 2, 2024 11:44 AM To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, Stanley ; Wang, Yang(Kevin) ; Chai, Thomas ; Li, Candice Cc: Zhang, Hawking ; Deucher, Alexander ; Lazar, Lijo ; Ma, Le Subject: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors Add ras helper function to query boot time gpu errors. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++- 3 files changed, 112 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 616b6c911767..db44ec857a31 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev); #define WREG32_FIELD_OFFSET(reg, offset, field, val) \ WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field)) +#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32) +#define AMDGPU_SMN_CROSS_AID (1ULL << 34) +#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l)) [kevin]: The macro GENMASK_ULL() returns a 64-bit mask value, but the register is 32 bits (in this patch). Do we need to change it to GENMASK(), or do you want to cover 64-bit register cases? Thanks. Best Regards, Kevin /* * BIOS helpers. 
*/ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 39399d0f2ce5..5f302b7693b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, return 0; } + +#define mmMP0_SMN_C2PMSG_92 0x1609C +#define mmMP0_SMN_C2PMSG_126 0x160BE +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, +u32 instance, u32 boot_error) +{ + u32 socket_id, aid_id, hbm_id; + u32 reg_data; + u64 reg_addr; + + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); + + if (instance) + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + + AMDGPU_SMN_TARGET_AID(instance) + + AMDGPU_SMN_CROSS_AID; + else + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2); + + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); + dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n", + socket_id, aid_id, reg_data); + + if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n", +socket_id, aid_id, hbm_id); + + if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, usr dp link 
training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n", +socket_id, aid_id, hbm_id); + + if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n", +socket_id, aid_id, hbm_id); +} + +static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, +u32 instance, u32 *boot_error) { + u32 reg_addr; + u32 reg_data; + int retry_loop; + + if (instance) + reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + + AMDGPU_SMN_TARGET_AID(instance) + + AMDGPU_SMN_CROSS_AID; + else + reg_addr = (mmMP0_SMN_C2PMSG_126 <<