RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors
[AMD Official Use Only - General] Good point, Le, will switch to the existing helper for the cross die access in v2. Regards, Hawking -Original Message- From: Ma, Le Sent: Tuesday, January 2, 2024 14:45 To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, Stanley ; Wang, Yang(Kevin) ; Chai, Thomas ; Li, Candice Cc: Zhang, Hawking ; Deucher, Alexander ; Lazar, Lijo Subject: RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors [AMD Official Use Only - General] > -Original Message- > From: Hawking Zhang > Sent: Tuesday, January 2, 2024 11:44 AM > To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; > Yang, Stanley ; Wang, Yang(Kevin) > ; Chai, Thomas ; Li, > Candice > Cc: Zhang, Hawking ; Deucher, Alexander > ; Lazar, Lijo ; Ma, Le > > Subject: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors > > Add ras helper function to query boot time gpu errors. > > Signed-off-by: Hawking Zhang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 + > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 > + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | > 15 +++- > 3 files changed, 112 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 616b6c911767..db44ec857a31 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device > *adev); #define WREG32_FIELD_OFFSET(reg, offset, field, val) \ > WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & > ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field)) > > +#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32) #define > +AMDGPU_SMN_CROSS_AID (1ULL << 34) #define AMDGPU_GET_REG_FIELD(x, > h, l) > +(((x) & GENMASK_ULL(h, l)) >> (l)) > /* > * BIOS helpers. 
> */ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 39399d0f2ce5..5f302b7693b3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct > ras_err_data *err_data, > > return 0; > } > + > +#define mmMP0_SMN_C2PMSG_92 0x1609C > +#define mmMP0_SMN_C2PMSG_126 0x160BE > +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device > *adev, > + u32 instance, u32 > +boot_error) { > + u32 socket_id, aid_id, hbm_id; > + u32 reg_data; > + u64 reg_addr; > + > + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); > + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); > + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); > + > + if (instance) > + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + > +AMDGPU_SMN_TARGET_AID(instance) + > +AMDGPU_SMN_CROSS_AID; Hi Hawking, We have asic function "aqua_vanjaram_encode_ext_smn_addressing" for this stuff, maybe it could also be re-used here. Thanks. 
> + else > + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2); > + > + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); > + dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, > + fw > status is 0x%x\n", > + socket_id, aid_id, reg_data); > + > + if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, > + memory > training failed\n", > + socket_id, aid_id, hbm_id); > + > + if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, firmware load > + failed > at boot time\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, wafl link > + training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, xgmi link > + training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, usr cp link > + training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, usr dp link > + training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm > memory test failed\n", > + socket_id, aid_id, hbm_id); > + > + if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm > + bist > test faile
RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors
[AMD Official Use Only - General] > -Original Message- > From: Hawking Zhang > Sent: Tuesday, January 2, 2024 11:44 AM > To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, > Stanley ; Wang, Yang(Kevin) > ; Chai, Thomas ; Li, > Candice > Cc: Zhang, Hawking ; Deucher, Alexander > ; Lazar, Lijo ; Ma, Le > > Subject: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors > > Add ras helper function to query boot time gpu errors. > > Signed-off-by: Hawking Zhang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 + > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 > + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | > 15 +++- > 3 files changed, 112 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 616b6c911767..db44ec857a31 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev); > #define WREG32_FIELD_OFFSET(reg, offset, field, val) \ > WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & > ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field)) > > +#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32) #define > +AMDGPU_SMN_CROSS_AID (1ULL << 34) #define AMDGPU_GET_REG_FIELD(x, > h, l) > +(((x) & GENMASK_ULL(h, l)) >> (l)) > /* > * BIOS helpers. 
> */ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 39399d0f2ce5..5f302b7693b3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct > ras_err_data *err_data, > > return 0; > } > + > +#define mmMP0_SMN_C2PMSG_92 0x1609C > +#define mmMP0_SMN_C2PMSG_126 0x160BE > +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device > *adev, > + u32 instance, u32 boot_error) > +{ > + u32 socket_id, aid_id, hbm_id; > + u32 reg_data; > + u64 reg_addr; > + > + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); > + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); > + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); > + > + if (instance) > + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + > +AMDGPU_SMN_TARGET_AID(instance) + > +AMDGPU_SMN_CROSS_AID; Hi Hawking, We have asic function "aqua_vanjaram_encode_ext_smn_addressing" for this stuff, maybe it could also be re-used here. Thanks. 
> + else > + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2); > + > + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); > + dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw > status is 0x%x\n", > + socket_id, aid_id, reg_data); > + > + if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory > training failed\n", > + socket_id, aid_id, hbm_id); > + > + if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed > at boot time\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, wafl link training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training > failed\n", > + socket_id, aid_id); > + > + if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm > memory test failed\n", > + socket_id, aid_id, hbm_id); > + > + if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) > + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist > test failed\n", > + socket_id, aid_id, hbm_id); > +} > + > +static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, > + u32 instance, u32 *boot_error) { > + u32 reg_addr; > + u32 reg_data; > + int retry_loop; > + > + if (instance) > + reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + > +AMDGPU_SMN_TARGET_AID(instance) + > +AMDGPU_SMN_CROSS_AID; > + else > +
Re: [PATCH] drm/amd/pm: Add mem_busy_percent for GCv9.4.3 apu
On 12/22/2023 10:52 PM, Asad Kamal wrote: Expose sysfs entry mem_busy_percent for GC version 9.4.3 APU system Signed-off-by: Asad Kamal Reviewed-by: Lijo Lazar Thanks, Lijo --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 2cd995b0ceba..f3cb490fe79b 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -2168,7 +2168,9 @@ static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_ if (amdgpu_dpm_is_overdrive_supported(adev)) *states = ATTR_STATE_SUPPORTED; } else if (DEVICE_ATTR_IS(mem_busy_percent)) { - if (adev->flags & AMD_IS_APU || gc_ver == IP_VERSION(9, 0, 1)) + if ((adev->flags & AMD_IS_APU && +gc_ver != IP_VERSION(9, 4, 3)) || + gc_ver == IP_VERSION(9, 0, 1)) *states = ATTR_STATE_UNSUPPORTED; } else if (DEVICE_ATTR_IS(pcie_bw)) { /* PCIe Perf counters won't work on APU nodes */
RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors
[AMD Official Use Only - General] I was hoping the macro could be used for 64bit registers as well. i.e., the aca regs. Regards, Hawking -Original Message- From: Wang, Yang(Kevin) Sent: Tuesday, January 2, 2024 13:24 To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, Stanley ; Chai, Thomas ; Li, Candice Cc: Zhang, Hawking ; Deucher, Alexander ; Lazar, Lijo ; Ma, Le Subject: RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors [AMD Official Use Only - General] -Original Message- From: Hawking Zhang Sent: Tuesday, January 2, 2024 11:44 AM To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, Stanley ; Wang, Yang(Kevin) ; Chai, Thomas ; Li, Candice Cc: Zhang, Hawking ; Deucher, Alexander ; Lazar, Lijo ; Ma, Le Subject: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors Add ras helper function to query boot time gpu errors. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++- 3 files changed, 112 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 616b6c911767..db44ec857a31 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev); #define WREG32_FIELD_OFFSET(reg, offset, field, val) \ WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field)) +#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32) #define +AMDGPU_SMN_CROSS_AID (1ULL << 34) #define AMDGPU_GET_REG_FIELD(x, h, l) +(((x) & GENMASK_ULL(h, l)) >> (l)) [kevin]: The macro GENMASK_ULL() will return a 64bit mask value, but the register is 32 bits (in this patch), do we need to change it to GENMASK() ? or you want to cover 64bit register cases.. Thanks. Best Regards, Kevin /* * BIOS helpers. 
*/ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 39399d0f2ce5..5f302b7693b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, return 0; } + +#define mmMP0_SMN_C2PMSG_920x1609C +#define mmMP0_SMN_C2PMSG_126 0x160BE +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, +u32 instance, u32 +boot_error) { + u32 socket_id, aid_id, hbm_id; + u32 reg_data; + u64 reg_addr; + + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); + + if (instance) + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + + AMDGPU_SMN_TARGET_AID(instance) + + AMDGPU_SMN_CROSS_AID; + else + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2); + + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); + dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n", + socket_id, aid_id, reg_data); + + if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n", +socket_id, aid_id, hbm_id); + + if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, usr dp link 
training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n", +socket_id, aid_id, hbm_id); + + if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n", +socket_id, aid_id, hbm_id); }
RE: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp
[AMD Official Use Only - General] The ret gives us a chance to fallback to legacy query approach (from vbios). You might want to see patch #3 of the series for more details, go to the following lines in patch #3 + /* query ras capability from psp */ + if (amdgpu_psp_get_ras_capability(&adev->psp)) + goto init_ras_enabled_flag; Regards, Hawking -Original Message- From: Wang, Yang(Kevin) Sent: Tuesday, January 2, 2024 13:19 To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, Stanley ; Chai, Thomas ; Li, Candice Cc: Zhang, Hawking ; Deucher, Alexander ; Lazar, Lijo ; Ma, Le Subject: RE: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp [AMD Official Use Only - General] -Original Message- From: Hawking Zhang Sent: Tuesday, January 2, 2024 11:45 AM To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, Stanley ; Wang, Yang(Kevin) ; Chai, Thomas ; Li, Candice Cc: Zhang, Hawking ; Deucher, Alexander ; Lazar, Lijo ; Ma, Le Subject: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp Instead of traditional atomfirmware interfaces for RAS capability, host driver can query ras capability from psp starting from psp v13_0_6. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 13 + drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 2 ++ drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 26 + 3 files changed, 41 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 94b536e3cada..8a3847d3041f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -2125,6 +2125,19 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev) return ret; } +bool amdgpu_psp_get_ras_capability(struct psp_context *psp) { + bool ret; + + if (psp->funcs && + psp->funcs->get_ras_capability) { + ret = psp->funcs->get_ras_capability(psp); + return ret; [kevin]: This variable 'ret' seems to have no other purpose, can we remove it and return directly ? 
Best Regards, Kevin + } else { + return false; + } +} + static int psp_hw_start(struct psp_context *psp) { struct amdgpu_device *adev = psp->adev; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 09d1f8f72a9c..652b0a01854a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -134,6 +134,7 @@ struct psp_funcs { int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr); int (*vbflash_stat)(struct psp_context *psp); int (*fatal_error_recovery_quirk)(struct psp_context *psp); + bool (*get_ras_capability)(struct psp_context *psp); }; struct ta_funcs { @@ -537,4 +538,5 @@ int psp_spatial_partition(struct psp_context *psp, int mode); int is_psp_fw_valid(struct psp_bin_desc bin); int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev); +bool amdgpu_psp_get_ras_capability(struct psp_context *psp); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index 676bec2cc157..722b6066ce07 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -27,6 +27,7 @@ #include "amdgpu_ucode.h" #include "soc15_common.h" #include "psp_v13_0.h" +#include "amdgpu_ras.h" #include "mp/mp_13_0_2_offset.h" #include "mp/mp_13_0_2_sh_mask.h" @@ -770,6 +771,30 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct psp_context *psp) return 0; } +static bool psp_v13_0_get_ras_capability(struct psp_context *psp) { + struct amdgpu_device *adev = psp->adev; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + u32 reg_data; + + /* query ras cap should be done from host side */ + if (amdgpu_sriov_vf(adev)) + return false; + + if (!con) + return false; + + if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) && + (!(adev->flags & AMD_IS_APU))) { + reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_127); + adev->ras_hw_enabled = (reg_data & GENMASK_ULL(23, 0)); + 
con->poison_supported = ((reg_data & GENMASK_ULL(24, 24)) >> 24) ? true : false; + return true; + } else { + return false; + } +} + static const struct psp_funcs psp_v13_0_funcs = { .init_microcode = psp_v13_0_init_microcode, .wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state, @@ -792,6 +817,7 @@ static const struct psp_funcs psp_v13_0_funcs = { .update_spirom = psp_v13_0_update_spirom, .vbflash_stat = psp_v13_0_vbflash_status, .fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk, + .get_ras_capability = psp_v13_0_get_ras
RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors
[AMD Official Use Only - General] -Original Message- From: Hawking Zhang Sent: Tuesday, January 2, 2024 11:44 AM To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, Stanley ; Wang, Yang(Kevin) ; Chai, Thomas ; Li, Candice Cc: Zhang, Hawking ; Deucher, Alexander ; Lazar, Lijo ; Ma, Le Subject: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors Add ras helper function to query boot time gpu errors. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++- 3 files changed, 112 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 616b6c911767..db44ec857a31 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev); #define WREG32_FIELD_OFFSET(reg, offset, field, val) \ WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field)) +#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32) #define +AMDGPU_SMN_CROSS_AID (1ULL << 34) #define AMDGPU_GET_REG_FIELD(x, h, l) +(((x) & GENMASK_ULL(h, l)) >> (l)) [kevin]: The macro GENMASK_ULL() will return a 64bit mask value, but the register is 32 bits (in this patch), do we need to change it to GENMASK() ? or you want to cover 64bit register cases.. Thanks. Best Regards, Kevin /* * BIOS helpers. 
*/ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 39399d0f2ce5..5f302b7693b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, return 0; } + +#define mmMP0_SMN_C2PMSG_920x1609C +#define mmMP0_SMN_C2PMSG_126 0x160BE +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, +u32 instance, u32 boot_error) +{ + u32 socket_id, aid_id, hbm_id; + u32 reg_data; + u64 reg_addr; + + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); + + if (instance) + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + + AMDGPU_SMN_TARGET_AID(instance) + + AMDGPU_SMN_CROSS_AID; + else + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2); + + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); + dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n", + socket_id, aid_id, reg_data); + + if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n", +socket_id, aid_id, hbm_id); + + if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, usr dp link 
training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n", +socket_id, aid_id, hbm_id); + + if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n", +socket_id, aid_id, hbm_id); +} + +static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, +u32 instance, u32 *boot_error) { + u32 reg_addr; + u32 reg_data; + int retry_loop; + + if (instance) + reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + + AMDGPU_SMN_TARGET_AID(instance) + + AMDGPU_SMN_CROSS_AID; + else + reg_addr = (mmMP0_SMN_C2PMSG_126 << 2
RE: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp
[AMD Official Use Only - General] -Original Message- From: Hawking Zhang Sent: Tuesday, January 2, 2024 11:45 AM To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, Stanley ; Wang, Yang(Kevin) ; Chai, Thomas ; Li, Candice Cc: Zhang, Hawking ; Deucher, Alexander ; Lazar, Lijo ; Ma, Le Subject: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp Instead of traditional atomfirmware interfaces for RAS capability, host driver can query ras capability from psp starting from psp v13_0_6. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 13 + drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 2 ++ drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 26 + 3 files changed, 41 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 94b536e3cada..8a3847d3041f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -2125,6 +2125,19 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev) return ret; } +bool amdgpu_psp_get_ras_capability(struct psp_context *psp) { + bool ret; + + if (psp->funcs && + psp->funcs->get_ras_capability) { + ret = psp->funcs->get_ras_capability(psp); + return ret; [kevin]: This variable 'ret' seems to have no other purpose, can we remove it and return directly ? 
Best Regards, Kevin + } else { + return false; + } +} + static int psp_hw_start(struct psp_context *psp) { struct amdgpu_device *adev = psp->adev; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 09d1f8f72a9c..652b0a01854a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -134,6 +134,7 @@ struct psp_funcs { int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr); int (*vbflash_stat)(struct psp_context *psp); int (*fatal_error_recovery_quirk)(struct psp_context *psp); + bool (*get_ras_capability)(struct psp_context *psp); }; struct ta_funcs { @@ -537,4 +538,5 @@ int psp_spatial_partition(struct psp_context *psp, int mode); int is_psp_fw_valid(struct psp_bin_desc bin); int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev); +bool amdgpu_psp_get_ras_capability(struct psp_context *psp); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index 676bec2cc157..722b6066ce07 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -27,6 +27,7 @@ #include "amdgpu_ucode.h" #include "soc15_common.h" #include "psp_v13_0.h" +#include "amdgpu_ras.h" #include "mp/mp_13_0_2_offset.h" #include "mp/mp_13_0_2_sh_mask.h" @@ -770,6 +771,30 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct psp_context *psp) return 0; } +static bool psp_v13_0_get_ras_capability(struct psp_context *psp) { + struct amdgpu_device *adev = psp->adev; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + u32 reg_data; + + /* query ras cap should be done from host side */ + if (amdgpu_sriov_vf(adev)) + return false; + + if (!con) + return false; + + if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) && + (!(adev->flags & AMD_IS_APU))) { + reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_127); + adev->ras_hw_enabled = (reg_data & GENMASK_ULL(23, 0)); + 
con->poison_supported = ((reg_data & GENMASK_ULL(24, 24)) >> 24) ? true : false; + return true; + } else { + return false; + } +} + static const struct psp_funcs psp_v13_0_funcs = { .init_microcode = psp_v13_0_init_microcode, .wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state, @@ -792,6 +817,7 @@ static const struct psp_funcs psp_v13_0_funcs = { .update_spirom = psp_v13_0_update_spirom, .vbflash_stat = psp_v13_0_vbflash_status, .fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk, + .get_ras_capability = psp_v13_0_get_ras_capability, }; void psp_v13_0_set_psp_funcs(struct psp_context *psp) -- 2.17.1
[PATCH 3/3] drm/amdgpu: Replace DRM_* with dev_* in amdgpu_psp.c
So that kernel messages carry the device PCIe BDF information, which helps with debugging, especially on multi-GPU systems. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 144 1 file changed, 75 insertions(+), 69 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 8a3847d3041f..0d871479ff34 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -291,21 +291,22 @@ static int psp_memory_training_init(struct psp_context *psp) struct psp_memory_training_context *ctx = &psp->mem_train_ctx; if (ctx->init != PSP_MEM_TRAIN_RESERVE_SUCCESS) { - DRM_DEBUG("memory training is not supported!\n"); + dev_dbg(psp->adev->dev, "memory training is not supported!\n"); return 0; } ctx->sys_cache = kzalloc(ctx->train_data_size, GFP_KERNEL); if (ctx->sys_cache == NULL) { - DRM_ERROR("alloc mem_train_ctx.sys_cache failed!\n"); + dev_err(psp->adev->dev, "alloc mem_train_ctx.sys_cache failed!\n"); ret = -ENOMEM; goto Err_out; } - DRM_DEBUG("train_data_size:%llx,p2c_train_data_offset:%llx,c2p_train_data_offset:%llx.\n", - ctx->train_data_size, - ctx->p2c_train_data_offset, - ctx->c2p_train_data_offset); + dev_dbg(psp->adev->dev, + "train_data_size:%llx,p2c_train_data_offset:%llx,c2p_train_data_offset:%llx.\n", + ctx->train_data_size, + ctx->p2c_train_data_offset, + ctx->c2p_train_data_offset); ctx->init = PSP_MEM_TRAIN_INIT_SUCCESS; return 0; @@ -407,7 +408,7 @@ static int psp_sw_init(void *handle) psp->cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL); if (!psp->cmd) { - DRM_ERROR("Failed to allocate memory to command buffer!\n"); + dev_err(adev->dev, "Failed to allocate memory to command buffer!\n"); ret = -ENOMEM; } @@ -454,13 +455,13 @@ static int psp_sw_init(void *handle) if (mem_training_ctx->enable_mem_training) { ret = psp_memory_training_init(psp); if (ret) { - DRM_ERROR("Failed to initialize memory training!\n"); + dev_err(adev->dev, "Failed to 
initialize memory training!\n"); return ret; } ret = psp_mem_training(psp, PSP_MEM_TRAIN_COLD_BOOT); if (ret) { - DRM_ERROR("Failed to process memory training!\n"); + dev_err(adev->dev, "Failed to process memory training!\n"); return ret; } } @@ -675,9 +676,11 @@ psp_cmd_submit_buf(struct psp_context *psp, */ if (!skip_unsupport && (psp->cmd_buf_mem->resp.status || !timeout) && !ras_intr) { if (ucode) - DRM_WARN("failed to load ucode %s(0x%X) ", - amdgpu_ucode_name(ucode->ucode_id), ucode->ucode_id); - DRM_WARN("psp gfx command %s(0x%X) failed and response status is (0x%X)\n", + dev_warn(psp->adev->dev, +"failed to load ucode %s(0x%X) ", +amdgpu_ucode_name(ucode->ucode_id), ucode->ucode_id); + dev_warn(psp->adev->dev, +"psp gfx command %s(0x%X) failed and response status is (0x%X)\n", psp_gfx_cmd_name(psp->cmd_buf_mem->cmd_id), psp->cmd_buf_mem->cmd_id, psp->cmd_buf_mem->resp.status); /* If any firmware (including CAP) load fails under SRIOV, it should @@ -807,7 +810,7 @@ static int psp_tmr_init(struct psp_context *psp) psp->fw_pri_buf) { ret = psp_load_toc(psp, &tmr_size); if (ret) { - DRM_ERROR("Failed to load toc\n"); + dev_err(psp->adev->dev, "Failed to load toc\n"); return ret; } } @@ -855,7 +858,7 @@ static int psp_tmr_load(struct psp_context *psp) psp_prep_tmr_cmd_buf(psp, cmd, psp->tmr_mc_addr, psp->tmr_bo); if (psp->tmr_bo) - DRM_INFO("reserve 0x%lx from 0x%llx for PSP TMR\n", + dev_info(psp->adev->dev, "reserve 0x%lx from 0x%llx for PSP TMR\n", amdgpu_bo_size(psp->tmr_bo), psp->tmr_mc_addr); ret = psp_cmd_submit_buf(psp, NULL, cmd, @@ -1113,7 +1116,7 @@ int psp_reg_program(struct psp_context *psp, enum psp_reg_prog_id reg, psp_prep_reg_prog_cmd_buf(cmd, reg, value); ret = psp_cmd_submit_buf(psp, NULL, cmd, psp->fence_buf_mc_addr); if (ret) - DRM_ERROR(
[PATCH 2/3] Revert "drm/amdgpu: enable mca debug mode on APU by default"
Not needed any more with firmware fixes Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 842405bb8995..d6e74b4dc6d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3159,8 +3159,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) return 0; - /* enable MCA debug on APU device */ - amdgpu_ras_set_mca_debug_mode(adev, !!(adev->flags & AMD_IS_APU)); + amdgpu_ras_set_mca_debug_mode(adev, false); list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { if (!node->ras_obj) { -- 2.17.1
[PATCH 1/3] drm/amdgpu: Packed socket_id to ras feature mask
Initialize RAS feature mask bits[31:29] with socket_id. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 72b6e41329b0..842405bb8995 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2936,6 +2936,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev) goto release_con; } + /* Packed socket_id to ras feature mask bits[31:29] */ + if (adev->smuio.funcs && + adev->smuio.funcs->get_socket_id) + con->features |= ((adev->smuio.funcs->get_socket_id(adev)) << 29); + /* Get RAS schema for particular SOC */ con->schema = amdgpu_get_ras_schema(adev); -- 2.17.1
[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported
Move the ras capability check to amdgpu_ras_check_supported. The driver will query ras capability through the psp interface, the vbios interface, or specific ip callbacks. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +--- 1 file changed, 93 insertions(+), 77 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 5f302b7693b3..72b6e41329b0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -39,6 +39,7 @@ #include "nbio_v7_9.h" #include "atom.h" #include "amdgpu_reset.h" +#include "amdgpu_psp.h" #ifdef CONFIG_X86_MCE_AMD #include @@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); } +/* Query ras capablity via atomfirmware interface */ +static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) +{ + /* mem_ecc cap */ + if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { + dev_info(adev->dev, "MEM ECC is active.\n"); + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | +1 << AMDGPU_RAS_BLOCK__DF); + } else { + dev_info(adev->dev, "MEM ECC is not presented.\n"); + } + + /* sram_ecc cap */ + if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { + dev_info(adev->dev, "SRAM ECC is active.\n"); + if (!amdgpu_sriov_vf(adev)) + adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | + 1 << AMDGPU_RAS_BLOCK__DF); + else + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | +1 << AMDGPU_RAS_BLOCK__SDMA | +1 << AMDGPU_RAS_BLOCK__GFX); + + /* +* VCN/JPEG RAS can be supported on both bare metal and +* SRIOV environment +*/ + if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) || + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) || + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3)) + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | +1 << AMDGPU_RAS_BLOCK__JPEG); + else + adev->ras_hw_enabled &= ~(1 << 
AMDGPU_RAS_BLOCK__VCN | + 1 << AMDGPU_RAS_BLOCK__JPEG); + + /* +* XGMI RAS is not supported if xgmi num physical nodes +* is zero +*/ + if (!adev->gmc.xgmi.num_physical_nodes) + adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); + } else { + dev_info(adev->dev, "SRAM ECC is not presented.\n"); + } +} + +/* Query poison mode from umc/df IP callbacks */ +static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + bool df_poison, umc_poison; + + /* poison setting is useless on SRIOV guest */ + if (amdgpu_sriov_vf(adev) || !con) + return; + + /* Init poison supported flag, the default value is false */ + if (adev->gmc.xgmi.connected_to_cpu || + adev->gmc.is_app_apu) { + /* enabled by default when GPU is connected to CPU */ + con->poison_supported = true; + } else if (adev->df.funcs && + adev->df.funcs->query_ras_poison_mode && + adev->umc.ras && + adev->umc.ras->query_ras_poison_mode) { + df_poison = + adev->df.funcs->query_ras_poison_mode(adev); + umc_poison = + adev->umc.ras->query_ras_poison_mode(adev); + + /* Only poison is set in both DF and UMC, we can support it */ + if (df_poison && umc_poison) + con->poison_supported = true; + else if (df_poison != umc_poison) + dev_warn(adev->dev, + "Poison setting is inconsistent in DF/UMC(%d:%d)!\n", + df_poison, umc_poison); + } +} + /* * check hardware's ras ability which will be saved in hw_supported. * if hardware does not support ras, we can skip some ras initializtion and @@ -2696,49 +2778,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) if (!amdgpu_ras_asic_supported(adev)) return; - if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { - if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { -
[PATCH 2/3] drm/amdgpu: Query ras capability from psp
Instead of traditional atomfirmware interfaces for RAS capability, host driver can query ras capability from psp starting from psp v13_0_6. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 13 + drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 2 ++ drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 26 + 3 files changed, 41 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 94b536e3cada..8a3847d3041f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -2125,6 +2125,19 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev) return ret; } +bool amdgpu_psp_get_ras_capability(struct psp_context *psp) +{ + bool ret; + + if (psp->funcs && + psp->funcs->get_ras_capability) { + ret = psp->funcs->get_ras_capability(psp); + return ret; + } else { + return false; + } +} + static int psp_hw_start(struct psp_context *psp) { struct amdgpu_device *adev = psp->adev; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 09d1f8f72a9c..652b0a01854a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -134,6 +134,7 @@ struct psp_funcs { int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr); int (*vbflash_stat)(struct psp_context *psp); int (*fatal_error_recovery_quirk)(struct psp_context *psp); + bool (*get_ras_capability)(struct psp_context *psp); }; struct ta_funcs { @@ -537,4 +538,5 @@ int psp_spatial_partition(struct psp_context *psp, int mode); int is_psp_fw_valid(struct psp_bin_desc bin); int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev); +bool amdgpu_psp_get_ras_capability(struct psp_context *psp); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index 676bec2cc157..722b6066ce07 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
@@ -27,6 +27,7 @@ #include "amdgpu_ucode.h" #include "soc15_common.h" #include "psp_v13_0.h" +#include "amdgpu_ras.h" #include "mp/mp_13_0_2_offset.h" #include "mp/mp_13_0_2_sh_mask.h" @@ -770,6 +771,30 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct psp_context *psp) return 0; } +static bool psp_v13_0_get_ras_capability(struct psp_context *psp) +{ + struct amdgpu_device *adev = psp->adev; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + u32 reg_data; + + /* query ras cap should be done from host side */ + if (amdgpu_sriov_vf(adev)) + return false; + + if (!con) + return false; + + if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) && + (!(adev->flags & AMD_IS_APU))) { + reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_127); + adev->ras_hw_enabled = (reg_data & GENMASK_ULL(23, 0)); + con->poison_supported = ((reg_data & GENMASK_ULL(24, 24)) >> 24) ? true : false; + return true; + } else { + return false; + } +} + static const struct psp_funcs psp_v13_0_funcs = { .init_microcode = psp_v13_0_init_microcode, .wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state, @@ -792,6 +817,7 @@ static const struct psp_funcs psp_v13_0_funcs = { .update_spirom = psp_v13_0_update_spirom, .vbflash_stat = psp_v13_0_vbflash_status, .fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk, + .get_ras_capability = psp_v13_0_get_ras_capability, }; void psp_v13_0_set_psp_funcs(struct psp_context *psp) -- 2.17.1
[PATCH 1/3] drm/amdgpu: Align ras block enum with firmware
Driver and firmware share the same ras block enum. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 5785b705c692..8b053602c5ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -70,6 +70,8 @@ enum amdgpu_ras_block { AMDGPU_RAS_BLOCK__MCA, AMDGPU_RAS_BLOCK__VCN, AMDGPU_RAS_BLOCK__JPEG, + AMDGPU_RAS_BLOCK__IH, + AMDGPU_RAS_BLOCK__MPIO, AMDGPU_RAS_BLOCK__LAST }; -- 2.17.1
[PATCH 0/3] Add ras cap query from psp
Driver can query RAS capability through psp or bios. Hawking Zhang (3): drm/amdgpu: Align ras block enum with firmware drm/amdgpu: Query ras capability from psp drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 13 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 2 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 + drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 26 5 files changed, 136 insertions(+), 77 deletions(-) -- 2.17.1
[PATCH 5/5] drm/amdgpu: Query boot status if boot failed
Check and report firmware boot status if it doesn't reach steady status. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index 6fad451a85be..676bec2cc157 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -187,11 +187,18 @@ static int psp_v13_0_wait_for_bootloader(struct psp_context *psp) static int psp_v13_0_wait_for_bootloader_steady_state(struct psp_context *psp) { struct amdgpu_device *adev = psp->adev; + int ret; if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) { - psp_v13_0_wait_for_vmbx_ready(psp); + ret = psp_v13_0_wait_for_vmbx_ready(psp); + if (ret) + amdgpu_ras_query_boot_status(adev, 4); + + ret = psp_v13_0_wait_for_bootloader(psp); + if (ret) + amdgpu_ras_query_boot_status(adev, 4); - return psp_v13_0_wait_for_bootloader(psp); + return ret; } return 0; -- 2.17.1
[PATCH 4/5] drm/amdgpu: Query boot status if discovery failed
Check and report boot status if discovery failed. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index b8fde08aec8e..302b71e9f1e2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -27,6 +27,7 @@ #include "amdgpu_discovery.h" #include "soc15_hw_ip.h" #include "discovery.h" +#include "amdgpu_ras.h" #include "soc15.h" #include "gfx_v9_0.h" @@ -98,6 +99,7 @@ #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin" MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY); +#define mmIP_DISCOVERY_VERSION 0x16A00 #define mmRCC_CONFIG_MEMSIZE 0xde3 #define mmMP0_SMN_C2PMSG_330x16061 #define mmMM_INDEX 0x0 @@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev) out: kfree(adev->mman.discovery_bin); adev->mman.discovery_bin = NULL; - + if ((amdgpu_discovery != 2) && + (RREG32(mmIP_DISCOVERY_VERSION) == 4)) + amdgpu_ras_query_boot_status(adev, 4); return r; } -- 2.17.1
[PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors
Add ras helper function to query boot time gpu errors. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++- 3 files changed, 112 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 616b6c911767..db44ec857a31 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev); #define WREG32_FIELD_OFFSET(reg, offset, field, val) \ WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field)) +#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32) +#define AMDGPU_SMN_CROSS_AID (1ULL << 34) +#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l)) /* * BIOS helpers. */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 39399d0f2ce5..5f302b7693b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, return 0; } + +#define mmMP0_SMN_C2PMSG_920x1609C +#define mmMP0_SMN_C2PMSG_126 0x160BE +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, +u32 instance, u32 boot_error) +{ + u32 socket_id, aid_id, hbm_id; + u32 reg_data; + u64 reg_addr; + + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); + + if (instance) + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + + AMDGPU_SMN_TARGET_AID(instance) + + AMDGPU_SMN_CROSS_AID; + else + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2); + + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); + dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n", + 
socket_id, aid_id, reg_data); + + if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n", +socket_id, aid_id, hbm_id); + + if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training failed\n", +socket_id, aid_id); + + if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n", +socket_id, aid_id, hbm_id); + + if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n", +socket_id, aid_id, hbm_id); +} + +static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, +u32 instance, u32 *boot_error) +{ + u32 reg_addr; + u32 reg_data; + int retry_loop; + + if (instance) + reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + + AMDGPU_SMN_TARGET_AID(instance) + + AMDGPU_SMN_CROSS_AID; + else + reg_addr = (mmMP0_SMN_C2PMSG_126 << 2); + + for (retry_loop = 0; retry_loop < 1000; retry_loop++) { + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); + if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) { + *boot_error = reg_data; + return 0; + } + msleep(1); + } + + *boot_error = reg_data; + return -ETIME; +} + +void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances) +{ + u32 
boot_error = 0; + u32 i; + + for (i = 0; i < num_instances; i++) { +
[PATCH 2/5] drm/amdgpu: Init pcie_index/data address as fallback
To allow using this helper for indirect access when nbio funcs is not available. For instance, in ip discovery phase. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 001a35fa0f19..873419a5b9aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -781,12 +781,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, void __iomem *pcie_index_hi_offset; void __iomem *pcie_data_offset; - pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); - pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); - if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) - pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); - else + if (unlikely(!adev->nbio.funcs)) { + pcie_index = (0x38 >> 2); + pcie_data = (0x3C >> 2); + } else { + pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); + pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); + } + + if (reg_addr >> 32) { + if (unlikely(!adev->nbio.funcs)) + pcie_index_hi = (0x44 >> 2); + else + pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); + } else { pcie_index_hi = 0; + } spin_lock_irqsave(&adev->pcie_idx_lock, flags); pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; -- 2.17.1
[PATCH 1/5] drm/amdgpu: drop psp v13 query_boot_status implementation
Will replace it with new implementation to cover boot fails in ip discovery phase. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 - drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 15 - drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h| 4 -- drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 78 -- 4 files changed, 99 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 4b694696930e..001a35fa0f19 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1218,8 +1218,6 @@ static int amdgpu_device_asic_init(struct amdgpu_device *adev) amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { amdgpu_psp_wait_for_bootloader(adev); ret = amdgpu_atomfirmware_asic_init(adev, true); - /* TODO: check the return val and stop device initialization if boot fails */ - amdgpu_psp_query_boot_status(adev); return ret; } else { return amdgpu_atom_asic_init(adev->mode_info.atom_context); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 1bf975b8d083..94b536e3cada 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -2125,21 +2125,6 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev) return ret; } -int amdgpu_psp_query_boot_status(struct amdgpu_device *adev) -{ - struct psp_context *psp = &adev->psp; - int ret = 0; - - if (amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU)) - return 0; - - if (psp->funcs && - psp->funcs->query_boot_status) - ret = psp->funcs->query_boot_status(psp); - - return ret; -} - static int psp_hw_start(struct psp_context *psp) { struct amdgpu_device *adev = psp->adev; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index c4d9cbde55b9..09d1f8f72a9c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -134,7 +134,6 @@ struct 
psp_funcs { int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr); int (*vbflash_stat)(struct psp_context *psp); int (*fatal_error_recovery_quirk)(struct psp_context *psp); - int (*query_boot_status)(struct psp_context *psp); }; struct ta_funcs { @@ -538,7 +537,4 @@ int psp_spatial_partition(struct psp_context *psp, int mode); int is_psp_fw_valid(struct psp_bin_desc bin); int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev); - -int amdgpu_psp_query_boot_status(struct amdgpu_device *adev); - #endif diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index df1844d0800f..6fad451a85be 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -763,83 +763,6 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct psp_context *psp) return 0; } - -static void psp_v13_0_boot_error_reporting(struct amdgpu_device *adev, - uint32_t inst, - uint32_t boot_error) -{ - uint32_t socket_id; - uint32_t aid_id; - uint32_t hbm_id; - uint32_t reg_data; - - socket_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, SOCKET_ID); - aid_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, AID_ID); - hbm_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, HBM_ID); - - reg_data = RREG32_SOC15(MP0, inst, regMP0_SMN_C2PMSG_109); - dev_info(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n", -socket_id, aid_id, reg_data); - - if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_MEM_TRAINING)) - dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n", -socket_id, aid_id, hbm_id); - - if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_FW_LOAD)) - dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n", -socket_id, aid_id); - - if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_WAFL_LINK_TRAINING)) - dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n", -socket_id, aid_id); - - if 
(REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_XGMI_LINK_TRAINING)) - dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n", -socket_id, aid_id); - - if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_USR_CP_LINK_TRAINING)) - dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n", -
[PATCH 0/5] Add boot time error reporting
For ASICs that support boot time error reporting, poll all the boot time errors cached in registers and make them available in the kernel log. Hawking Zhang (5): drm/amdgpu: drop psp v13 query_boot_status implementation drm/amdgpu: Init pcie_index/data address as fallback drm/amdgpu: Add ras helper to query boot errors drm/amdgpu: Query boot status if discovery failed drm/amdgpu: Query boot status if boot failed drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 22 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 15 --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 4 - drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 ++- drivers/gpu/drm/amd/amdgpu/psp_v13_0.c| 89 ++--- 8 files changed, 141 insertions(+), 108 deletions(-) -- 2.17.1
Re: [PATCH] drm/amd/display: Fix sending VSC (+ colorimetry) packets for DP/eDP displays without PSR
From the issue: ``` Thank you for fixing this! I built a custom kernel with this patch on the fedora rawhide kernel (6.7.0-0.rc8.61.fc40.x86_64) and now the colors look correct. SDR content is now displayed as sRGB and HDR/WCG content can use the full capabilities of the display. I currently don't have a desktop mail client installed to comment on the mailing list directly, so I'll post it here (not sure if it counts or matters) Tested-By: Simon Berz ``` - Joshie 🐸✨ On 1/1/24 18:28, Joshua Ashton wrote: The check for sending the vsc infopacket to the display was gated behind PSR (Panel Self Refresh) being enabled. The vsc infopacket also contains the colorimetry (specifically the container color gamut) information for the stream on modern DP. PSR is typically only supported on mobile phone eDP displays, thus this was not getting sent for typical desktop monitors or TV screens. This functionality is needed for proper HDR10 functionality on DP as it wants BT2020 RGB/YCbCr for the container color space. 
Signed-off-by: Joshua Ashton Cc: Harry Wentland Cc: Xaver Hugl Cc: Melissa Wen --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 8 +--- .../amd/display/modules/info_packet/info_packet.c | 13 - 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 2845c884398e..6dff56408bf4 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6233,8 +6233,9 @@ create_stream_for_sink(struct drm_connector *connector, if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A) mod_build_hf_vsif_infopacket(stream, &stream->vsp_infopacket); - - if (stream->link->psr_settings.psr_feature_enabled || stream->link->replay_settings.replay_feature_enabled) { + else if (stream->signal == SIGNAL_TYPE_DISPLAY_PORT || +stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST || +stream->signal == SIGNAL_TYPE_EDP) { // // should decide stream support vsc sdp colorimetry capability // before building vsc info packet @@ -6250,8 +6251,9 @@ create_stream_for_sink(struct drm_connector *connector, if (stream->out_transfer_func->tf == TRANSFER_FUNCTION_GAMMA22) tf = TRANSFER_FUNC_GAMMA_22; mod_build_vsc_infopacket(stream, &stream->vsc_infopacket, stream->output_color_space, tf); - aconnector->psr_skip_count = AMDGPU_DM_PSR_ENTRY_DELAY; + if (stream->link->psr_settings.psr_feature_enabled) + aconnector->psr_skip_count = AMDGPU_DM_PSR_ENTRY_DELAY; } finish: dc_sink_release(sink); diff --git a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c index 84f9b412a4f1..738ee763f24a 100644 --- a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c +++ b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c @@ -147,12 +147,15 @@ void mod_build_vsc_infopacket(const struct dc_stream_state *stream, } /* VSC packet set to 4 for PSR-SU, or 2 for PSR1 */ - if 
(stream->link->psr_settings.psr_version == DC_PSR_VERSION_SU_1) - vsc_packet_revision = vsc_packet_rev4; - else if (stream->link->replay_settings.config.replay_supported) + if (stream->link->psr_settings.psr_feature_enabled) { + if (stream->link->psr_settings.psr_version == DC_PSR_VERSION_SU_1) + vsc_packet_revision = vsc_packet_rev4; + else if (stream->link->psr_settings.psr_version == DC_PSR_VERSION_1) + vsc_packet_revision = vsc_packet_rev2; + } + + if (stream->link->replay_settings.config.replay_supported) vsc_packet_revision = vsc_packet_rev4; - else if (stream->link->psr_settings.psr_version == DC_PSR_VERSION_1) - vsc_packet_revision = vsc_packet_rev2; /* Update to revision 5 for extended colorimetry support */ if (stream->use_vsc_sdp_for_colorimetry)
[PATCH] drm/amd/display: Fix sending VSC (+ colorimetry) packets for DP/eDP displays without PSR
The check for sending the vsc infopacket to the display was gated behind PSR (Panel Self Refresh) being enabled. The vsc infopacket also contains the colorimetry (specifically the container color gamut) information for the stream on modern DP. PSR is typically only supported on mobile phone eDP displays, thus this was not getting sent for typical desktop monitors or TV screens. This functionality is needed for proper HDR10 functionality on DP as it wants BT2020 RGB/YCbCr for the container color space. Signed-off-by: Joshua Ashton Cc: Harry Wentland Cc: Xaver Hugl Cc: Melissa Wen --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 8 +--- .../amd/display/modules/info_packet/info_packet.c | 13 - 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 2845c884398e..6dff56408bf4 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6233,8 +6233,9 @@ create_stream_for_sink(struct drm_connector *connector, if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A) mod_build_hf_vsif_infopacket(stream, &stream->vsp_infopacket); - - if (stream->link->psr_settings.psr_feature_enabled || stream->link->replay_settings.replay_feature_enabled) { + else if (stream->signal == SIGNAL_TYPE_DISPLAY_PORT || +stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST || +stream->signal == SIGNAL_TYPE_EDP) { // // should decide stream support vsc sdp colorimetry capability // before building vsc info packet @@ -6250,8 +6251,9 @@ create_stream_for_sink(struct drm_connector *connector, if (stream->out_transfer_func->tf == TRANSFER_FUNCTION_GAMMA22) tf = TRANSFER_FUNC_GAMMA_22; mod_build_vsc_infopacket(stream, &stream->vsc_infopacket, stream->output_color_space, tf); - aconnector->psr_skip_count = AMDGPU_DM_PSR_ENTRY_DELAY; + if (stream->link->psr_settings.psr_feature_enabled) + aconnector->psr_skip_count = 
AMDGPU_DM_PSR_ENTRY_DELAY; } finish: dc_sink_release(sink); diff --git a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c index 84f9b412a4f1..738ee763f24a 100644 --- a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c +++ b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c @@ -147,12 +147,15 @@ void mod_build_vsc_infopacket(const struct dc_stream_state *stream, } /* VSC packet set to 4 for PSR-SU, or 2 for PSR1 */ - if (stream->link->psr_settings.psr_version == DC_PSR_VERSION_SU_1) - vsc_packet_revision = vsc_packet_rev4; - else if (stream->link->replay_settings.config.replay_supported) + if (stream->link->psr_settings.psr_feature_enabled) { + if (stream->link->psr_settings.psr_version == DC_PSR_VERSION_SU_1) + vsc_packet_revision = vsc_packet_rev4; + else if (stream->link->psr_settings.psr_version == DC_PSR_VERSION_1) + vsc_packet_revision = vsc_packet_rev2; + } + + if (stream->link->replay_settings.config.replay_supported) vsc_packet_revision = vsc_packet_rev4; - else if (stream->link->psr_settings.psr_version == DC_PSR_VERSION_1) - vsc_packet_revision = vsc_packet_rev2; /* Update to revision 5 for extended colorimetry support */ if (stream->use_vsc_sdp_for_colorimetry) -- 2.43.0