RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors

2024-01-01 Thread Zhang, Hawking
[AMD Official Use Only - General]

Good point, Le. I will switch to the existing helper for the cross-die access in
v2.

Regards,
Hawking

-Original Message-
From: Ma, Le 
Sent: Tuesday, January 2, 2024 14:45
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org; 
Zhou1, Tao ; Yang, Stanley ; Wang, 
Yang(Kevin) ; Chai, Thomas ; Li, 
Candice 
Cc: Zhang, Hawking ; Deucher, Alexander 
; Lazar, Lijo 
Subject: RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors

[AMD Official Use Only - General]

> -Original Message-
> From: Hawking Zhang 
> Sent: Tuesday, January 2, 2024 11:44 AM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ;
> Yang, Stanley ; Wang, Yang(Kevin)
> ; Chai, Thomas ; Li,
> Candice 
> Cc: Zhang, Hawking ; Deucher, Alexander
> ; Lazar, Lijo ; Ma, Le
> 
> Subject: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors
>
> Add ras helper function to query boot time gpu errors.
>
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95
> +  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |
> 15 +++-
>  3 files changed, 112 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 616b6c911767..db44ec857a31 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device
> *adev);  #define WREG32_FIELD_OFFSET(reg, offset, field, val) \
>   WREG32(mm##reg + offset, (RREG32(mm##reg + offset) &
> ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
>
> +#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32)
> +#define AMDGPU_SMN_CROSS_AID (1ULL << 34)
> +#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))
>  /*
>   * BIOS helpers.
>   */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 39399d0f2ce5..5f302b7693b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct
> ras_err_data *err_data,
>
>   return 0;
>  }
> +
> +#define mmMP0_SMN_C2PMSG_92  0x1609C
> +#define mmMP0_SMN_C2PMSG_126 0x160BE
> +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device
> *adev,
> +  u32 instance, u32
> +boot_error) {
> + u32 socket_id, aid_id, hbm_id;
> + u32 reg_data;
> + u64 reg_addr;
> +
> + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
> + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
> + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
> +
> + if (instance)
> + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
> +AMDGPU_SMN_TARGET_AID(instance) +
> +AMDGPU_SMN_CROSS_AID;
Hi Hawking,

We already have the ASIC function "aqua_vanjaram_encode_ext_smn_addressing" for
this kind of address encoding; maybe it could be reused here.

Thanks.
> + else
> + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2);
> +
> + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
> + dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed,
> + fw
> status is 0x%x\n",
> + socket_id, aid_id, reg_data);
> +
> + if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d,
> + memory
> training failed\n",
> +  socket_id, aid_id, hbm_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, firmware load
> + failed
> at boot time\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, wafl link
> + training
> failed\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, xgmi link
> + training
> failed\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, usr cp link
> + training
> failed\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, usr dp link
> + training
> failed\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm
> memory test failed\n",
> +  socket_id, aid_id, hbm_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm
> + bist
> test faile

RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors

2024-01-01 Thread Ma, Le
[AMD Official Use Only - General]

> -Original Message-
> From: Hawking Zhang 
> Sent: Tuesday, January 2, 2024 11:44 AM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang,
> Stanley ; Wang, Yang(Kevin)
> ; Chai, Thomas ; Li,
> Candice 
> Cc: Zhang, Hawking ; Deucher, Alexander
> ; Lazar, Lijo ; Ma, Le
> 
> Subject: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors
>
> Add ras helper function to query boot time gpu errors.
>
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95
> +  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |
> 15 +++-
>  3 files changed, 112 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 616b6c911767..db44ec857a31 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
>  #define WREG32_FIELD_OFFSET(reg, offset, field, val) \
>   WREG32(mm##reg + offset, (RREG32(mm##reg + offset) &
> ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
>
> +#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32)
> +#define AMDGPU_SMN_CROSS_AID (1ULL << 34)
> +#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))
>  /*
>   * BIOS helpers.
>   */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 39399d0f2ce5..5f302b7693b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct
> ras_err_data *err_data,
>
>   return 0;
>  }
> +
> +#define mmMP0_SMN_C2PMSG_92  0x1609C
> +#define mmMP0_SMN_C2PMSG_126 0x160BE
> +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device
> *adev,
> +  u32 instance, u32 boot_error)
> +{
> + u32 socket_id, aid_id, hbm_id;
> + u32 reg_data;
> + u64 reg_addr;
> +
> + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
> + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
> + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
> +
> + if (instance)
> + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
> +AMDGPU_SMN_TARGET_AID(instance) +
> +AMDGPU_SMN_CROSS_AID;
Hi Hawking,

We already have the ASIC function "aqua_vanjaram_encode_ext_smn_addressing" for
this kind of address encoding; maybe it could be reused here.

Thanks.
> + else
> + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2);
> +
> + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
> + dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw
> status is 0x%x\n",
> + socket_id, aid_id, reg_data);
> +
> + if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory
> training failed\n",
> +  socket_id, aid_id, hbm_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed
> at boot time\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, wafl link training
> failed\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training
> failed\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training
> failed\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training
> failed\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm
> memory test failed\n",
> +  socket_id, aid_id, hbm_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist
> test failed\n",
> +  socket_id, aid_id, hbm_id);
> +}
> +
> +static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
> +  u32 instance, u32 *boot_error) {
> + u32 reg_addr;
> + u32 reg_data;
> + int retry_loop;
> +
> + if (instance)
> + reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
> +AMDGPU_SMN_TARGET_AID(instance) +
> +AMDGPU_SMN_CROSS_AID;
> + else
> +   

Re: [PATCH] drm/amd/pm: Add mem_busy_percent for GCv9.4.3 apu

2024-01-01 Thread Lazar, Lijo

On 12/22/2023 10:52 PM, Asad Kamal wrote:

Expose the sysfs entry mem_busy_percent on GC version
9.4.3 APU systems.

Signed-off-by: Asad Kamal 


Reviewed-by: Lijo Lazar 

Thanks,
Lijo

---
  drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 2cd995b0ceba..f3cb490fe79b 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -2168,7 +2168,9 @@ static int default_attr_update(struct amdgpu_device 
*adev, struct amdgpu_device_
if (amdgpu_dpm_is_overdrive_supported(adev))
*states = ATTR_STATE_SUPPORTED;
} else if (DEVICE_ATTR_IS(mem_busy_percent)) {
-   if (adev->flags & AMD_IS_APU || gc_ver == IP_VERSION(9, 0, 1))
+   if ((adev->flags & AMD_IS_APU &&
+gc_ver != IP_VERSION(9, 4, 3)) ||
+   gc_ver == IP_VERSION(9, 0, 1))
*states = ATTR_STATE_UNSUPPORTED;
} else if (DEVICE_ATTR_IS(pcie_bw)) {
/* PCIe Perf counters won't work on APU nodes */




RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors

2024-01-01 Thread Zhang, Hawking
[AMD Official Use Only - General]

I was hoping the macro could be used for 64-bit registers as well, i.e., the ACA
registers.

Regards,
Hawking

-Original Message-
From: Wang, Yang(Kevin) 
Sent: Tuesday, January 2, 2024 13:24
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org; 
Zhou1, Tao ; Yang, Stanley ; Chai, 
Thomas ; Li, Candice 
Cc: Zhang, Hawking ; Deucher, Alexander 
; Lazar, Lijo ; Ma, Le 

Subject: RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors

[AMD Official Use Only - General]

-Original Message-
From: Hawking Zhang 
Sent: Tuesday, January 2, 2024 11:44 AM
To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, 
Stanley ; Wang, Yang(Kevin) ; 
Chai, Thomas ; Li, Candice 
Cc: Zhang, Hawking ; Deucher, Alexander 
; Lazar, Lijo ; Ma, Le 

Subject: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors

Add ras helper function to query boot time gpu errors.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 +  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++-
 3 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 616b6c911767..db44ec857a31 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define WREG32_FIELD_OFFSET(reg, offset, field, val)   \
WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & 
~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))

+#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32)
+#define AMDGPU_SMN_CROSS_AID (1ULL << 34)
+#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))

[kevin]:
The macro GENMASK_ULL() returns a 64-bit mask value, but the register is 32 bits
(in this patch). Do we need to change it to GENMASK(), or do you want to cover
64-bit register cases as well?
Thanks.

Best Regards,
Kevin
 /*
  * BIOS helpers.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 39399d0f2ce5..5f302b7693b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,

return 0;
 }
+
+#define mmMP0_SMN_C2PMSG_92    0x1609C
+#define mmMP0_SMN_C2PMSG_126   0x160BE
+static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
+u32 instance, u32
+boot_error) {
+   u32 socket_id, aid_id, hbm_id;
+   u32 reg_data;
+   u64 reg_addr;
+
+   socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+   aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+
+   if (instance)
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  AMDGPU_SMN_TARGET_AID(instance) +
+  AMDGPU_SMN_CROSS_AID;
+   else
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2);
+
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+   socket_id, aid_id, reg_data);
+
+   if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+socket_id, aid_id, hbm_id); }

RE: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp

2024-01-01 Thread Zhang, Hawking
[AMD Official Use Only - General]

The 'ret' value gives us a chance to fall back to the legacy query approach (from the vbios).


You might want to see patch #3 of the series for more details; look at the
following lines in that patch:

+   /* query ras capability from psp */
+   if (amdgpu_psp_get_ras_capability(&adev->psp))
+   goto init_ras_enabled_flag;


Regards,
Hawking

-Original Message-
From: Wang, Yang(Kevin) 
Sent: Tuesday, January 2, 2024 13:19
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org; 
Zhou1, Tao ; Yang, Stanley ; Chai, 
Thomas ; Li, Candice 
Cc: Zhang, Hawking ; Deucher, Alexander 
; Lazar, Lijo ; Ma, Le 

Subject: RE: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp

[AMD Official Use Only - General]

-Original Message-
From: Hawking Zhang 
Sent: Tuesday, January 2, 2024 11:45 AM
To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, 
Stanley ; Wang, Yang(Kevin) ; 
Chai, Thomas ; Li, Candice 
Cc: Zhang, Hawking ; Deucher, Alexander 
; Lazar, Lijo ; Ma, Le 

Subject: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp

Instead of traditional atomfirmware interfaces for RAS capability, host driver 
can query ras capability from psp starting from psp v13_0_6.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 13 +  
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  2 ++  
drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  | 26 +
 3 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 94b536e3cada..8a3847d3041f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2125,6 +2125,19 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }

+bool amdgpu_psp_get_ras_capability(struct psp_context *psp) {
+   bool ret;
+
+   if (psp->funcs &&
+   psp->funcs->get_ras_capability) {
+   ret = psp->funcs->get_ras_capability(psp);
+   return ret;
[kevin]:
This variable 'ret' seems to have no other purpose; can we remove it and return
the result directly?

Best Regards,
Kevin
+   } else {
+   return false;
+   }
+}
+
 static int psp_hw_start(struct psp_context *psp)  {
struct amdgpu_device *adev = psp->adev; diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 09d1f8f72a9c..652b0a01854a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,6 +134,7 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
+   bool (*get_ras_capability)(struct psp_context *psp);
 };

 struct ta_funcs {
@@ -537,4 +538,5 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);  int is_psp_fw_valid(struct psp_bin_desc bin);

 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 676bec2cc157..722b6066ce07 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -27,6 +27,7 @@
 #include "amdgpu_ucode.h"
 #include "soc15_common.h"
 #include "psp_v13_0.h"
+#include "amdgpu_ras.h"

 #include "mp/mp_13_0_2_offset.h"
 #include "mp/mp_13_0_2_sh_mask.h"
@@ -770,6 +771,30 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }

+static bool psp_v13_0_get_ras_capability(struct psp_context *psp) {
+   struct amdgpu_device *adev = psp->adev;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   u32 reg_data;
+
+   /* query ras cap should be done from host side */
+   if (amdgpu_sriov_vf(adev))
+   return false;
+
+   if (!con)
+   return false;
+
+   if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) &&
+   (!(adev->flags & AMD_IS_APU))) {
+   reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_127);
+   adev->ras_hw_enabled = (reg_data & GENMASK_ULL(23, 0));
+   con->poison_supported = ((reg_data & GENMASK_ULL(24, 24)) >> 
24) ? true : false;
+   return true;
+   } else {
+   return false;
+   }
+}
+
 static const struct psp_funcs psp_v13_0_funcs = {
.init_microcode = psp_v13_0_init_microcode,
.wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state,
@@ -792,6 +817,7 @@ static const struct psp_funcs psp_v13_0_funcs = {
.update_spirom = psp_v13_0_update_spirom,
.vbflash_stat = psp_v13_0_vbflash_status,
.fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk,
+   .get_ras_capability = psp_v13_0_get_ras

RE: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors

2024-01-01 Thread Wang, Yang(Kevin)
[AMD Official Use Only - General]

-Original Message-
From: Hawking Zhang 
Sent: Tuesday, January 2, 2024 11:44 AM
To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, 
Stanley ; Wang, Yang(Kevin) ; 
Chai, Thomas ; Li, Candice 
Cc: Zhang, Hawking ; Deucher, Alexander 
; Lazar, Lijo ; Ma, Le 

Subject: [PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors

Add ras helper function to query boot time gpu errors.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 +  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++-
 3 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 616b6c911767..db44ec857a31 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define WREG32_FIELD_OFFSET(reg, offset, field, val)   \
WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & 
~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))

+#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32)
+#define AMDGPU_SMN_CROSS_AID (1ULL << 34)
+#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))

[kevin]:
The macro GENMASK_ULL() returns a 64-bit mask value, but the register is 32 bits
(in this patch). Do we need to change it to GENMASK(), or do you want to cover
64-bit register cases as well?
Thanks.

Best Regards,
Kevin
 /*
  * BIOS helpers.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 39399d0f2ce5..5f302b7693b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,

return 0;
 }
+
+#define mmMP0_SMN_C2PMSG_92    0x1609C
+#define mmMP0_SMN_C2PMSG_126   0x160BE
+static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
+u32 instance, u32 boot_error)
+{
+   u32 socket_id, aid_id, hbm_id;
+   u32 reg_data;
+   u64 reg_addr;
+
+   socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+   aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+
+   if (instance)
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  AMDGPU_SMN_TARGET_AID(instance) +
+  AMDGPU_SMN_CROSS_AID;
+   else
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2);
+
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+   socket_id, aid_id, reg_data);
+
+   if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+socket_id, aid_id, hbm_id);
+}
+
+static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
+u32 instance, u32 *boot_error) {
+   u32 reg_addr;
+   u32 reg_data;
+   int retry_loop;
+
+   if (instance)
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+  AMDGPU_SMN_TARGET_AID(instance) +
+  AMDGPU_SMN_CROSS_AID;
+   else
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2

RE: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp

2024-01-01 Thread Wang, Yang(Kevin)
[AMD Official Use Only - General]

-Original Message-
From: Hawking Zhang 
Sent: Tuesday, January 2, 2024 11:45 AM
To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, 
Stanley ; Wang, Yang(Kevin) ; 
Chai, Thomas ; Li, Candice 
Cc: Zhang, Hawking ; Deucher, Alexander 
; Lazar, Lijo ; Ma, Le 

Subject: [PATCH 2/3] drm/amdgpu: Query ras capablity from psp

Instead of traditional atomfirmware interfaces for RAS capability, host driver 
can query ras capability from psp starting from psp v13_0_6.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 13 +  
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  2 ++  
drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  | 26 +
 3 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 94b536e3cada..8a3847d3041f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2125,6 +2125,19 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }

+bool amdgpu_psp_get_ras_capability(struct psp_context *psp) {
+   bool ret;
+
+   if (psp->funcs &&
+   psp->funcs->get_ras_capability) {
+   ret = psp->funcs->get_ras_capability(psp);
+   return ret;
[kevin]:
This variable 'ret' seems to have no other purpose; can we remove it and return
the result directly?

Best Regards,
Kevin
+   } else {
+   return false;
+   }
+}
+
 static int psp_hw_start(struct psp_context *psp)  {
struct amdgpu_device *adev = psp->adev; diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 09d1f8f72a9c..652b0a01854a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,6 +134,7 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
+   bool (*get_ras_capability)(struct psp_context *psp);
 };

 struct ta_funcs {
@@ -537,4 +538,5 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);  int is_psp_fw_valid(struct psp_bin_desc bin);

 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 676bec2cc157..722b6066ce07 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -27,6 +27,7 @@
 #include "amdgpu_ucode.h"
 #include "soc15_common.h"
 #include "psp_v13_0.h"
+#include "amdgpu_ras.h"

 #include "mp/mp_13_0_2_offset.h"
 #include "mp/mp_13_0_2_sh_mask.h"
@@ -770,6 +771,30 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }

+static bool psp_v13_0_get_ras_capability(struct psp_context *psp) {
+   struct amdgpu_device *adev = psp->adev;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   u32 reg_data;
+
+   /* query ras cap should be done from host side */
+   if (amdgpu_sriov_vf(adev))
+   return false;
+
+   if (!con)
+   return false;
+
+   if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) &&
+   (!(adev->flags & AMD_IS_APU))) {
+   reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_127);
+   adev->ras_hw_enabled = (reg_data & GENMASK_ULL(23, 0));
+   con->poison_supported = ((reg_data & GENMASK_ULL(24, 24)) >> 
24) ? true : false;
+   return true;
+   } else {
+   return false;
+   }
+}
+
 static const struct psp_funcs psp_v13_0_funcs = {
.init_microcode = psp_v13_0_init_microcode,
.wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state,
@@ -792,6 +817,7 @@ static const struct psp_funcs psp_v13_0_funcs = {
.update_spirom = psp_v13_0_update_spirom,
.vbflash_stat = psp_v13_0_vbflash_status,
.fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk,
+   .get_ras_capability = psp_v13_0_get_ras_capability,
 };

 void psp_v13_0_set_psp_funcs(struct psp_context *psp)
--
2.17.1



[PATCH 3/3] drm/amdgpu: Replace DRM_* with dev_* in amdgpu_psp.c

2024-01-01 Thread Hawking Zhang
So that kernel messages carry the device PCIe BDF information,
which helps with debugging, especially on multi-GPU
systems.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 144 
 1 file changed, 75 insertions(+), 69 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 8a3847d3041f..0d871479ff34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -291,21 +291,22 @@ static int psp_memory_training_init(struct psp_context 
*psp)
struct psp_memory_training_context *ctx = &psp->mem_train_ctx;
 
if (ctx->init != PSP_MEM_TRAIN_RESERVE_SUCCESS) {
-   DRM_DEBUG("memory training is not supported!\n");
+   dev_dbg(psp->adev->dev, "memory training is not supported!\n");
return 0;
}
 
ctx->sys_cache = kzalloc(ctx->train_data_size, GFP_KERNEL);
if (ctx->sys_cache == NULL) {
-   DRM_ERROR("alloc mem_train_ctx.sys_cache failed!\n");
+   dev_err(psp->adev->dev, "alloc mem_train_ctx.sys_cache 
failed!\n");
ret = -ENOMEM;
goto Err_out;
}
 
-   
DRM_DEBUG("train_data_size:%llx,p2c_train_data_offset:%llx,c2p_train_data_offset:%llx.\n",
- ctx->train_data_size,
- ctx->p2c_train_data_offset,
- ctx->c2p_train_data_offset);
+   dev_dbg(psp->adev->dev,
+   
"train_data_size:%llx,p2c_train_data_offset:%llx,c2p_train_data_offset:%llx.\n",
+   ctx->train_data_size,
+   ctx->p2c_train_data_offset,
+   ctx->c2p_train_data_offset);
ctx->init = PSP_MEM_TRAIN_INIT_SUCCESS;
return 0;
 
@@ -407,7 +408,7 @@ static int psp_sw_init(void *handle)
 
psp->cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
if (!psp->cmd) {
-   DRM_ERROR("Failed to allocate memory to command buffer!\n");
+   dev_err(adev->dev, "Failed to allocate memory to command 
buffer!\n");
ret = -ENOMEM;
}
 
@@ -454,13 +455,13 @@ static int psp_sw_init(void *handle)
if (mem_training_ctx->enable_mem_training) {
ret = psp_memory_training_init(psp);
if (ret) {
-   DRM_ERROR("Failed to initialize memory training!\n");
+   dev_err(adev->dev, "Failed to initialize memory 
training!\n");
return ret;
}
 
ret = psp_mem_training(psp, PSP_MEM_TRAIN_COLD_BOOT);
if (ret) {
-   DRM_ERROR("Failed to process memory training!\n");
+   dev_err(adev->dev, "Failed to process memory 
training!\n");
return ret;
}
}
@@ -675,9 +676,11 @@ psp_cmd_submit_buf(struct psp_context *psp,
 */
if (!skip_unsupport && (psp->cmd_buf_mem->resp.status || !timeout) && 
!ras_intr) {
if (ucode)
-   DRM_WARN("failed to load ucode %s(0x%X) ",
- amdgpu_ucode_name(ucode->ucode_id), 
ucode->ucode_id);
-   DRM_WARN("psp gfx command %s(0x%X) failed and response status 
is (0x%X)\n",
+   dev_warn(psp->adev->dev,
+"failed to load ucode %s(0x%X) ",
+amdgpu_ucode_name(ucode->ucode_id), 
ucode->ucode_id);
+   dev_warn(psp->adev->dev,
+"psp gfx command %s(0x%X) failed and response status 
is (0x%X)\n",
 psp_gfx_cmd_name(psp->cmd_buf_mem->cmd_id), 
psp->cmd_buf_mem->cmd_id,
 psp->cmd_buf_mem->resp.status);
/* If any firmware (including CAP) load fails under SRIOV, it 
should
@@ -807,7 +810,7 @@ static int psp_tmr_init(struct psp_context *psp)
psp->fw_pri_buf) {
ret = psp_load_toc(psp, &tmr_size);
if (ret) {
-   DRM_ERROR("Failed to load toc\n");
+   dev_err(psp->adev->dev, "Failed to load toc\n");
return ret;
}
}
@@ -855,7 +858,7 @@ static int psp_tmr_load(struct psp_context *psp)
 
psp_prep_tmr_cmd_buf(psp, cmd, psp->tmr_mc_addr, psp->tmr_bo);
if (psp->tmr_bo)
-   DRM_INFO("reserve 0x%lx from 0x%llx for PSP TMR\n",
+   dev_info(psp->adev->dev, "reserve 0x%lx from 0x%llx for PSP 
TMR\n",
 amdgpu_bo_size(psp->tmr_bo), psp->tmr_mc_addr);
 
ret = psp_cmd_submit_buf(psp, NULL, cmd,
@@ -1113,7 +1116,7 @@ int psp_reg_program(struct psp_context *psp, enum 
psp_reg_prog_id reg,
psp_prep_reg_prog_cmd_buf(cmd, reg, value);
ret = psp_cmd_submit_buf(psp, NULL, cmd, psp->fence_buf_mc_addr);
if (ret)
-   DRM_ERROR(

[PATCH 2/3] Revert "drm/amdgpu: enable mca debug mode on APU by default"

2024-01-01 Thread Hawking Zhang
Not needed any more with firmware fixes

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 842405bb8995..d6e74b4dc6d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3159,8 +3159,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;
 
-   /* enable MCA debug on APU device */
-   amdgpu_ras_set_mca_debug_mode(adev, !!(adev->flags & AMD_IS_APU));
+   amdgpu_ras_set_mca_debug_mode(adev, false);
 
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) {
-- 
2.17.1



[PATCH 1/3] drm/amdgpu: Packed socket_id to ras feature mask

2024-01-01 Thread Hawking Zhang
Initialize RAS feature mask bit[31:29] with socket_id.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 72b6e41329b0..842405bb8995 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2936,6 +2936,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
 
+   /* Packed socket_id to ras feature mask bits[31:29] */
+   if (adev->smuio.funcs &&
+   adev->smuio.funcs->get_socket_id)
+   con->features |= ((adev->smuio.funcs->get_socket_id(adev)) << 
29);
+
/* Get RAS schema for particular SOC */
con->schema = amdgpu_get_ras_schema(adev);
 
-- 
2.17.1



[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

2024-01-01 Thread Hawking Zhang
Move ras capability check to amdgpu_ras_check_supported.
Driver will query ras capability through psp interface, or
vbios interface, or specific ip callbacks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
 1 file changed, 93 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5f302b7693b3..72b6e41329b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -39,6 +39,7 @@
 #include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
+#include "amdgpu_psp.h"
 
 #ifdef CONFIG_X86_MCE_AMD
 #include 
@@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device 
*adev)
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
 }
 
+/* Query ras capablity via atomfirmware interface */
+static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device 
*adev)
+{
+   /* mem_ecc cap */
+   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+   dev_info(adev->dev, "MEM ECC is active.\n");
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
+1 << AMDGPU_RAS_BLOCK__DF);
+   } else {
+   dev_info(adev->dev, "MEM ECC is not presented.\n");
+   }
+
+   /* sram_ecc cap */
+   if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+   dev_info(adev->dev, "SRAM ECC is active.\n");
+   if (!amdgpu_sriov_vf(adev))
+   adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+   else
+   adev->ras_hw_enabled |= (1 << 
AMDGPU_RAS_BLOCK__PCIE_BIF |
+1 << AMDGPU_RAS_BLOCK__SDMA |
+1 << AMDGPU_RAS_BLOCK__GFX);
+
+   /*
+* VCN/JPEG RAS can be supported on both bare metal and
+* SRIOV environment
+*/
+   if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3))
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+1 << AMDGPU_RAS_BLOCK__JPEG);
+   else
+   adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
+
+   /*
+* XGMI RAS is not supported if xgmi num physical nodes
+* is zero
+*/
+   if (!adev->gmc.xgmi.num_physical_nodes)
+   adev->ras_hw_enabled &= ~(1 << 
AMDGPU_RAS_BLOCK__XGMI_WAFL);
+   } else {
+   dev_info(adev->dev, "SRAM ECC is not presented.\n");
+   }
+}
+
+/* Query poison mode from umc/df IP callbacks */
+static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   bool df_poison, umc_poison;
+
+   /* poison setting is useless on SRIOV guest */
+   if (amdgpu_sriov_vf(adev) || !con)
+   return;
+
+   /* Init poison supported flag, the default value is false */
+   if (adev->gmc.xgmi.connected_to_cpu ||
+   adev->gmc.is_app_apu) {
+   /* enabled by default when GPU is connected to CPU */
+   con->poison_supported = true;
+   } else if (adev->df.funcs &&
+   adev->df.funcs->query_ras_poison_mode &&
+   adev->umc.ras &&
+   adev->umc.ras->query_ras_poison_mode) {
+   df_poison =
+   adev->df.funcs->query_ras_poison_mode(adev);
+   umc_poison =
+   adev->umc.ras->query_ras_poison_mode(adev);
+
+   /* Only poison is set in both DF and UMC, we can support it */
+   if (df_poison && umc_poison)
+   con->poison_supported = true;
+   else if (df_poison != umc_poison)
+   dev_warn(adev->dev,
+   "Poison setting is inconsistent in 
DF/UMC(%d:%d)!\n",
+   df_poison, umc_poison);
+   }
+}
+
 /*
  * check hardware's ras ability which will be saved in hw_supported.
  * if hardware does not support ras, we can skip some ras initializtion and
@@ -2696,49 +2778,13 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
if (!amdgpu_ras_asic_supported(adev))
return;
 
-   if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
-   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
-   

[PATCH 2/3] drm/amdgpu: Query ras capablity from psp

2024-01-01 Thread Hawking Zhang
Instead of traditional atomfirmware interfaces for RAS
capability, host driver can query ras capability from
psp starting from psp v13_0_6.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 13 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  | 26 +
 3 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 94b536e3cada..8a3847d3041f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2125,6 +2125,19 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp)
+{
+   bool ret;
+
+   if (psp->funcs &&
+   psp->funcs->get_ras_capability) {
+   ret = psp->funcs->get_ras_capability(psp);
+   return ret;
+   } else {
+   return false;
+   }
+}
+
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 09d1f8f72a9c..652b0a01854a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,6 +134,7 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
+   bool (*get_ras_capability)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -537,4 +538,5 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 676bec2cc157..722b6066ce07 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -27,6 +27,7 @@
 #include "amdgpu_ucode.h"
 #include "soc15_common.h"
 #include "psp_v13_0.h"
+#include "amdgpu_ras.h"
 
 #include "mp/mp_13_0_2_offset.h"
 #include "mp/mp_13_0_2_sh_mask.h"
@@ -770,6 +771,30 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
+static bool psp_v13_0_get_ras_capability(struct psp_context *psp)
+{
+   struct amdgpu_device *adev = psp->adev;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   u32 reg_data;
+
+   /* query ras cap should be done from host side */
+   if (amdgpu_sriov_vf(adev))
+   return false;
+
+   if (!con)
+   return false;
+
+   if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) &&
+   (!(adev->flags & AMD_IS_APU))) {
+   reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_127);
+   adev->ras_hw_enabled = (reg_data & GENMASK_ULL(23, 0));
+   con->poison_supported = ((reg_data & GENMASK_ULL(24, 24)) >> 
24) ? true : false;
+   return true;
+   } else {
+   return false;
+   }
+}
+
 static const struct psp_funcs psp_v13_0_funcs = {
.init_microcode = psp_v13_0_init_microcode,
.wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state,
@@ -792,6 +817,7 @@ static const struct psp_funcs psp_v13_0_funcs = {
.update_spirom = psp_v13_0_update_spirom,
.vbflash_stat = psp_v13_0_vbflash_status,
.fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk,
+   .get_ras_capability = psp_v13_0_get_ras_capability,
 };
 
 void psp_v13_0_set_psp_funcs(struct psp_context *psp)
-- 
2.17.1



[PATCH 1/3] drm/amdgpu: Align ras block enum with firmware

2024-01-01 Thread Hawking Zhang
Driver and firmware share the same ras block enum.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 5785b705c692..8b053602c5ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -70,6 +70,8 @@ enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__MCA,
AMDGPU_RAS_BLOCK__VCN,
AMDGPU_RAS_BLOCK__JPEG,
+   AMDGPU_RAS_BLOCK__IH,
+   AMDGPU_RAS_BLOCK__MPIO,
 
AMDGPU_RAS_BLOCK__LAST
 };
-- 
2.17.1



[PATCH 0/3] Add ras cap query from psp

2024-01-01 Thread Hawking Zhang
Driver can query RAS capability through psp or bios.

Hawking Zhang (3):
  drm/amdgpu: Align ras block enum with firmware
  drm/amdgpu: Query ras capablity from psp
  drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c |  13 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |   2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |   2 +
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  |  26 
 5 files changed, 136 insertions(+), 77 deletions(-)

-- 
2.17.1



[PATCH 5/5] drm/amdgpu: Query boot status if boot failed

2024-01-01 Thread Hawking Zhang
Check and report firmware boot status if it doesn't
reach steady status.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 6fad451a85be..676bec2cc157 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -187,11 +187,18 @@ static int psp_v13_0_wait_for_bootloader(struct 
psp_context *psp)
 static int psp_v13_0_wait_for_bootloader_steady_state(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
+   int ret;
 
if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) {
-   psp_v13_0_wait_for_vmbx_ready(psp);
+   ret = psp_v13_0_wait_for_vmbx_ready(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
+
+   ret = psp_v13_0_wait_for_bootloader(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
 
-   return psp_v13_0_wait_for_bootloader(psp);
+   return ret;
}
 
return 0;
-- 
2.17.1



[PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-01 Thread Hawking Zhang
Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"
 
 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
 
+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE   0xde3
 #define mmMP0_SMN_C2PMSG_33    0x16061
 #define mmMM_INDEX 0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
kfree(adev->mman.discovery_bin);
adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
return r;
 }
 
-- 
2.17.1



[PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors

2024-01-01 Thread Hawking Zhang
Add ras helper function to query boot time gpu
errors.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++-
 3 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 616b6c911767..db44ec857a31 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define WREG32_FIELD_OFFSET(reg, offset, field, val)   \
WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & 
~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
 
+#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32)
+#define AMDGPU_SMN_CROSS_AID (1ULL << 34)
+#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))
 /*
  * BIOS helpers.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 39399d0f2ce5..5f302b7693b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,
 
return 0;
 }
+
+#define mmMP0_SMN_C2PMSG_92    0x1609C
+#define mmMP0_SMN_C2PMSG_126   0x160BE
+static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
+u32 instance, u32 boot_error)
+{
+   u32 socket_id, aid_id, hbm_id;
+   u32 reg_data;
+   u64 reg_addr;
+
+   socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+   aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+
+   if (instance)
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  AMDGPU_SMN_TARGET_AID(instance) +
+  AMDGPU_SMN_CROSS_AID;
+   else
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2);
+
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+   socket_id, aid_id, reg_data);
+
+   if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+socket_id, aid_id, hbm_id);
+}
+
+static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
+u32 instance, u32 *boot_error)
+{
+   u32 reg_addr;
+   u32 reg_data;
+   int retry_loop;
+
+   if (instance)
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+  AMDGPU_SMN_TARGET_AID(instance) +
+  AMDGPU_SMN_CROSS_AID;
+   else
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2);
+
+   for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
+   *boot_error = reg_data;
+   return 0;
+   }
+   msleep(1);
+   }
+
+   *boot_error = reg_data;
+   return -ETIME;
+}
+
+void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 
num_instances)
+{
+   u32 boot_error = 0;
+   u32 i;
+
+   for (i = 0; i < num_instances; i++) {
+   

[PATCH 2/5] drm/amdgpu: Init pcie_index/data address as fallback

2024-01-01 Thread Hawking Zhang
Allow this helper to be used for indirect access when
nbio funcs are not available, for instance during the
ip discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 001a35fa0f19..873419a5b9aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -781,12 +781,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device 
*adev,
void __iomem *pcie_index_hi_offset;
void __iomem *pcie_data_offset;
 
-   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
-   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
-   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
-   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
-   else
+   if (unlikely(!adev->nbio.funcs)) {
+   pcie_index = (0x38 >> 2);
+   pcie_data = (0x3C >> 2);
+   } else {
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+   }
+
+   if (reg_addr >> 32) {
+   if (unlikely(!adev->nbio.funcs))
+   pcie_index_hi = (0x44 >> 2);
+   else
+   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+   } else {
pcie_index_hi = 0;
+   }
 
spin_lock_irqsave(&adev->pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
-- 
2.17.1



[PATCH 1/5] drm/amdgpu: drop psp v13 query_boot_status implementation

2024-01-01 Thread Hawking Zhang
It will be replaced with a new implementation that also
covers boot failures in the ip discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 15 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h|  4 --
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 78 --
 4 files changed, 99 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4b694696930e..001a35fa0f19 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1218,8 +1218,6 @@ static int amdgpu_device_asic_init(struct amdgpu_device 
*adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
amdgpu_psp_wait_for_bootloader(adev);
ret = amdgpu_atomfirmware_asic_init(adev, true);
-   /* TODO: check the return val and stop device initialization if 
boot fails */
-   amdgpu_psp_query_boot_status(adev);
return ret;
} else {
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 1bf975b8d083..94b536e3cada 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2125,21 +2125,6 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev)
-{
-   struct psp_context *psp = &adev->psp;
-   int ret = 0;
-
-   if (amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU))
-   return 0;
-
-   if (psp->funcs &&
-   psp->funcs->query_boot_status)
-   ret = psp->funcs->query_boot_status(psp);
-
-   return ret;
-}
-
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index c4d9cbde55b9..09d1f8f72a9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,7 +134,6 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
-   int (*query_boot_status)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -538,7 +537,4 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
-
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev);
-
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index df1844d0800f..6fad451a85be 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -763,83 +763,6 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
-
-static void psp_v13_0_boot_error_reporting(struct amdgpu_device *adev,
-  uint32_t inst,
-  uint32_t boot_error)
-{
-   uint32_t socket_id;
-   uint32_t aid_id;
-   uint32_t hbm_id;
-   uint32_t reg_data;
-
-   socket_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, SOCKET_ID);
-   aid_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, AID_ID);
-   hbm_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, HBM_ID);
-
-   reg_data = RREG32_SOC15(MP0, inst, regMP0_SMN_C2PMSG_109);
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
-socket_id, aid_id, reg_data);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_MEM_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
-socket_id, aid_id, hbm_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_FW_LOAD))
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_WAFL_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_XGMI_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_USR_CP_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
- 

[PATCH 0/5] Add boot time error reporting

2024-01-01 Thread Hawking Zhang
For ASICs that support boot time error reporting, poll all
the boot time errors cached in registers and make it available
in kernel log.

Hawking Zhang (5):
  drm/amdgpu: drop psp v13 query_boot_status implementation
  drm/amdgpu: Init pcie_index/data address as fallback
  drm/amdgpu: Add ras helper to query boot errors
  drm/amdgpu: Query boot status if discovery failed
  drm/amdgpu: Query boot status if boot failed

 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 22 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 15 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h   |  4 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 95 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   | 15 ++-
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c| 89 ++---
 8 files changed, 141 insertions(+), 108 deletions(-)

-- 
2.17.1



Re: [PATCH] drm/amd/display: Fix sending VSC (+ colorimetry) packets for DP/eDP displays without PSR

2024-01-01 Thread Joshua Ashton

From the issue:

```
Thank you for fixing this!
I built a custom kernel with this patch on the fedora rawhide kernel 
(6.7.0-0.rc8.61.fc40.x86_64) and now the colors look correct. SDR 
content is now displayed as sRGB and HDR/WCG content can use the full 
capabilities of the display.
I currently don't have a desktop mail client installed to comment on the 
mailing list directly, so I'll post it here (not sure if it counts or 
matters πŸ˜€ )


Tested-By: Simon Berz 
```

- Joshie 🐸✨

On 1/1/24 18:28, Joshua Ashton wrote:

The check for sending the vsc infopacket to the display was gated behind
PSR (Panel Self Refresh) being enabled.

The vsc infopacket also contains the colorimetry (specifically the
container color gamut) information for the stream on modern DP.

PSR is typically only supported on mobile phone eDP displays, thus this
was not getting sent for typical desktop monitors or TV screens.

This functionality is needed for proper HDR10 functionality on DP as it
wants BT2020 RGB/YCbCr for the container color space.

Signed-off-by: Joshua Ashton 

Cc: Harry Wentland 
Cc: Xaver Hugl 
Cc: Melissa Wen 
---
  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c   |  8 +---
  .../amd/display/modules/info_packet/info_packet.c   | 13 -
  2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 2845c884398e..6dff56408bf4 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -6233,8 +6233,9 @@ create_stream_for_sink(struct drm_connector *connector,
  
  	if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A)

mod_build_hf_vsif_infopacket(stream, &stream->vsp_infopacket);
-
-   if (stream->link->psr_settings.psr_feature_enabled || 
stream->link->replay_settings.replay_feature_enabled) {
+   else if (stream->signal == SIGNAL_TYPE_DISPLAY_PORT ||
+stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST ||
+stream->signal == SIGNAL_TYPE_EDP) {
//
// should decide stream support vsc sdp colorimetry capability
// before building vsc info packet
@@ -6250,8 +6251,9 @@ create_stream_for_sink(struct drm_connector *connector,
if (stream->out_transfer_func->tf == TRANSFER_FUNCTION_GAMMA22)
tf = TRANSFER_FUNC_GAMMA_22;
mod_build_vsc_infopacket(stream, &stream->vsc_infopacket, 
stream->output_color_space, tf);
-   aconnector->psr_skip_count = AMDGPU_DM_PSR_ENTRY_DELAY;
  
+		if (stream->link->psr_settings.psr_feature_enabled)

+   aconnector->psr_skip_count = AMDGPU_DM_PSR_ENTRY_DELAY;
}
  finish:
dc_sink_release(sink);
diff --git a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c 
b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c
index 84f9b412a4f1..738ee763f24a 100644
--- a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c
+++ b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c
@@ -147,12 +147,15 @@ void mod_build_vsc_infopacket(const struct 
dc_stream_state *stream,
}
  
  	/* VSC packet set to 4 for PSR-SU, or 2 for PSR1 */

-   if (stream->link->psr_settings.psr_version == DC_PSR_VERSION_SU_1)
-   vsc_packet_revision = vsc_packet_rev4;
-   else if (stream->link->replay_settings.config.replay_supported)
+   if (stream->link->psr_settings.psr_feature_enabled) {
+   if (stream->link->psr_settings.psr_version == 
DC_PSR_VERSION_SU_1)
+   vsc_packet_revision = vsc_packet_rev4;
+   else if (stream->link->psr_settings.psr_version == 
DC_PSR_VERSION_1)
+   vsc_packet_revision = vsc_packet_rev2;
+   }
+
+   if (stream->link->replay_settings.config.replay_supported)
vsc_packet_revision = vsc_packet_rev4;
-   else if (stream->link->psr_settings.psr_version == DC_PSR_VERSION_1)
-   vsc_packet_revision = vsc_packet_rev2;
  
  	/* Update to revision 5 for extended colorimetry support */

if (stream->use_vsc_sdp_for_colorimetry)




[PATCH] drm/amd/display: Fix sending VSC (+ colorimetry) packets for DP/eDP displays without PSR

2024-01-01 Thread Joshua Ashton
The check for sending the vsc infopacket to the display was gated behind
PSR (Panel Self Refresh) being enabled.

The vsc infopacket also contains the colorimetry (specifically the
container color gamut) information for the stream on modern DP.

PSR is typically only supported on mobile phone eDP displays, thus this
was not getting sent for typical desktop monitors or TV screens.

This functionality is needed for proper HDR10 functionality on DP as it
wants BT2020 RGB/YCbCr for the container color space.

Signed-off-by: Joshua Ashton 

Cc: Harry Wentland 
Cc: Xaver Hugl 
Cc: Melissa Wen 
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c   |  8 +---
 .../amd/display/modules/info_packet/info_packet.c   | 13 -
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 2845c884398e..6dff56408bf4 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -6233,8 +6233,9 @@ create_stream_for_sink(struct drm_connector *connector,
 
if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A)
mod_build_hf_vsif_infopacket(stream, &stream->vsp_infopacket);
-
-   if (stream->link->psr_settings.psr_feature_enabled || 
stream->link->replay_settings.replay_feature_enabled) {
+   else if (stream->signal == SIGNAL_TYPE_DISPLAY_PORT ||
+stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST ||
+stream->signal == SIGNAL_TYPE_EDP) {
//
// should decide stream support vsc sdp colorimetry capability
// before building vsc info packet
@@ -6250,8 +6251,9 @@ create_stream_for_sink(struct drm_connector *connector,
if (stream->out_transfer_func->tf == TRANSFER_FUNCTION_GAMMA22)
tf = TRANSFER_FUNC_GAMMA_22;
mod_build_vsc_infopacket(stream, &stream->vsc_infopacket, 
stream->output_color_space, tf);
-   aconnector->psr_skip_count = AMDGPU_DM_PSR_ENTRY_DELAY;
 
+   if (stream->link->psr_settings.psr_feature_enabled)
+   aconnector->psr_skip_count = AMDGPU_DM_PSR_ENTRY_DELAY;
}
 finish:
dc_sink_release(sink);
diff --git a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c 
b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c
index 84f9b412a4f1..738ee763f24a 100644
--- a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c
+++ b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c
@@ -147,12 +147,15 @@ void mod_build_vsc_infopacket(const struct 
dc_stream_state *stream,
}
 
/* VSC packet set to 4 for PSR-SU, or 2 for PSR1 */
-   if (stream->link->psr_settings.psr_version == DC_PSR_VERSION_SU_1)
-   vsc_packet_revision = vsc_packet_rev4;
-   else if (stream->link->replay_settings.config.replay_supported)
+   if (stream->link->psr_settings.psr_feature_enabled) {
+   if (stream->link->psr_settings.psr_version == 
DC_PSR_VERSION_SU_1)
+   vsc_packet_revision = vsc_packet_rev4;
+   else if (stream->link->psr_settings.psr_version == 
DC_PSR_VERSION_1)
+   vsc_packet_revision = vsc_packet_rev2;
+   }
+
+   if (stream->link->replay_settings.config.replay_supported)
vsc_packet_revision = vsc_packet_rev4;
-   else if (stream->link->psr_settings.psr_version == DC_PSR_VERSION_1)
-   vsc_packet_revision = vsc_packet_rev2;
 
/* Update to revision 5 for extended colorimetry support */
if (stream->use_vsc_sdp_for_colorimetry)
-- 
2.43.0