RE: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

2024-01-02 Thread Zhou1, Tao
[AMD Official Use Only - General]

The series is:

Reviewed-by: Tao Zhou 

> -----Original Message-----
> From: Hawking Zhang 
> Sent: Tuesday, January 2, 2024 10:16 PM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang,
> Stanley ; Wang, Yang(Kevin)
> ; Chai, Thomas ; Li,
> Candice 
> Cc: Zhang, Hawking ; Deucher, Alexander
> ; Lazar, Lijo ; Ma, Le
> 
> Subject: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to
> amdgpu_ras_check_supported
>
> Move ras capability check to amdgpu_ras_check_supported.
> Driver will query ras capability through psp interface, or vbios interface,
> or specific ip callbacks.
>
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
>  1 file changed, 93 insertions(+), 77 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index a901b00d4949..2ee82baaf7d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -39,6 +39,7 @@
>  #include "nbio_v7_9.h"
>  #include "atom.h"
>  #include "amdgpu_reset.h"
> +#include "amdgpu_psp.h"
>
>  #ifdef CONFIG_X86_MCE_AMD
>  #include 
> @@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct
> amdgpu_device *adev)
>   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);  }
>
> +/* Query ras capablity via atomfirmware interface */ static void
> +amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) {
> + /* mem_ecc cap */
> + if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
> + dev_info(adev->dev, "MEM ECC is active.\n");
> + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
> +  1 << AMDGPU_RAS_BLOCK__DF);
> + } else {
> + dev_info(adev->dev, "MEM ECC is not presented.\n");
> + }
> +
> + /* sram_ecc cap */
> + if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
> + dev_info(adev->dev, "SRAM ECC is active.\n");
> + if (!amdgpu_sriov_vf(adev))
> + adev->ras_hw_enabled |= ~(1 <<
> AMDGPU_RAS_BLOCK__UMC |
> +   1 <<
> AMDGPU_RAS_BLOCK__DF);
> + else
> + adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__PCIE_BIF |
> +  1 <<
> AMDGPU_RAS_BLOCK__SDMA |
> +  1 <<
> AMDGPU_RAS_BLOCK__GFX);
> +
> + /*
> +  * VCN/JPEG RAS can be supported on both bare metal and
> +  * SRIOV environment
> +  */
> + if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6,
> 0) ||
> + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0,
> 0) ||
> + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0,
> 3))
> + adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__VCN |
> +  1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> + else
> + adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__VCN |
> +   1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> +
> + /*
> +  * XGMI RAS is not supported if xgmi num physical nodes
> +  * is zero
> +  */
> + if (!adev->gmc.xgmi.num_physical_nodes)
> + adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__XGMI_WAFL);
> + } else {
> + dev_info(adev->dev, "SRAM ECC is not presented.\n");
> + }
> +}
> +
> +/* Query poison mode from umc/df IP callbacks */ static void
> +amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) {
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + bool df_poison, umc_poison;
> +
> + /* poison setting is useless on SRIOV guest */
> + if (amdgpu_sriov_vf(adev) || !con)
> + return;
> +
> + /* Init poison supported flag, the default value is false */
> + if (adev->gmc.xgmi.connected_to_cpu ||
> + adev->gmc.is_app_apu) {
> + /* enabled by default when GPU is connected to CPU */
> + con->poison_supported = true;
> + } else if (adev->df.funcs &&
> + adev->df.funcs->query_ras_poison_mode &&
> + adev->umc.ras &&
> +  

[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

2024-01-02 Thread Hawking Zhang
Move ras capability check to amdgpu_ras_check_supported.
Driver will query ras capability through psp interface, or
vbios interface, or specific ip callbacks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
 1 file changed, 93 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a901b00d4949..2ee82baaf7d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -39,6 +39,7 @@
 #include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
+#include "amdgpu_psp.h"
 
 #ifdef CONFIG_X86_MCE_AMD
 #include 
@@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device 
*adev)
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
 }
 
+/* Query ras capablity via atomfirmware interface */
+static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device 
*adev)
+{
+   /* mem_ecc cap */
+   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+   dev_info(adev->dev, "MEM ECC is active.\n");
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
+1 << AMDGPU_RAS_BLOCK__DF);
+   } else {
+   dev_info(adev->dev, "MEM ECC is not presented.\n");
+   }
+
+   /* sram_ecc cap */
+   if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+   dev_info(adev->dev, "SRAM ECC is active.\n");
+   if (!amdgpu_sriov_vf(adev))
+   adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+   else
+   adev->ras_hw_enabled |= (1 << 
AMDGPU_RAS_BLOCK__PCIE_BIF |
+1 << AMDGPU_RAS_BLOCK__SDMA |
+1 << AMDGPU_RAS_BLOCK__GFX);
+
+   /*
+* VCN/JPEG RAS can be supported on both bare metal and
+* SRIOV environment
+*/
+   if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3))
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+1 << AMDGPU_RAS_BLOCK__JPEG);
+   else
+   adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
+
+   /*
+* XGMI RAS is not supported if xgmi num physical nodes
+* is zero
+*/
+   if (!adev->gmc.xgmi.num_physical_nodes)
+   adev->ras_hw_enabled &= ~(1 << 
AMDGPU_RAS_BLOCK__XGMI_WAFL);
+   } else {
+   dev_info(adev->dev, "SRAM ECC is not presented.\n");
+   }
+}
+
+/* Query poison mode from umc/df IP callbacks */
+static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   bool df_poison, umc_poison;
+
+   /* poison setting is useless on SRIOV guest */
+   if (amdgpu_sriov_vf(adev) || !con)
+   return;
+
+   /* Init poison supported flag, the default value is false */
+   if (adev->gmc.xgmi.connected_to_cpu ||
+   adev->gmc.is_app_apu) {
+   /* enabled by default when GPU is connected to CPU */
+   con->poison_supported = true;
+   } else if (adev->df.funcs &&
+   adev->df.funcs->query_ras_poison_mode &&
+   adev->umc.ras &&
+   adev->umc.ras->query_ras_poison_mode) {
+   df_poison =
+   adev->df.funcs->query_ras_poison_mode(adev);
+   umc_poison =
+   adev->umc.ras->query_ras_poison_mode(adev);
+
+   /* Only poison is set in both DF and UMC, we can support it */
+   if (df_poison && umc_poison)
+   con->poison_supported = true;
+   else if (df_poison != umc_poison)
+   dev_warn(adev->dev,
+   "Poison setting is inconsistent in 
DF/UMC(%d:%d)!\n",
+   df_poison, umc_poison);
+   }
+}
+
 /*
  * check hardware's ras ability which will be saved in hw_supported.
  * if hardware does not support ras, we can skip some ras initializtion and
@@ -2696,49 +2778,13 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
if (!amdgpu_ras_asic_supported(adev))
return;
 
-   if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
-   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
-   

RE: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

2024-01-02 Thread Zhou1, Tao
[AMD Official Use Only - General]

The series is:

Reviewed-by: Tao Zhou 

> -----Original Message-----
> From: amd-gfx  On Behalf Of Hawking
> Zhang
> Sent: Tuesday, January 2, 2024 11:45 AM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang,
> Stanley ; Wang, Yang(Kevin)
> ; Chai, Thomas ; Li,
> Candice 
> Cc: Deucher, Alexander ; Ma, Le
> ; Lazar, Lijo ; Zhang, Hawking
> 
> Subject: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to
> amdgpu_ras_check_supported
>
> Move ras capability check to amdgpu_ras_check_supported.
> Driver will query ras capability through psp interface, or vbios interface,
> or specific ip callbacks.
>
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
>  1 file changed, 93 insertions(+), 77 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 5f302b7693b3..72b6e41329b0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -39,6 +39,7 @@
>  #include "nbio_v7_9.h"
>  #include "atom.h"
>  #include "amdgpu_reset.h"
> +#include "amdgpu_psp.h"
>
>  #ifdef CONFIG_X86_MCE_AMD
>  #include 
> @@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct
> amdgpu_device *adev)
>   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);  }
>
> +/* Query ras capablity via atomfirmware interface */ static void
> +amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) {
> + /* mem_ecc cap */
> + if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
> + dev_info(adev->dev, "MEM ECC is active.\n");
> + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
> +  1 << AMDGPU_RAS_BLOCK__DF);
> + } else {
> + dev_info(adev->dev, "MEM ECC is not presented.\n");
> + }
> +
> + /* sram_ecc cap */
> + if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
> + dev_info(adev->dev, "SRAM ECC is active.\n");
> + if (!amdgpu_sriov_vf(adev))
> + adev->ras_hw_enabled |= ~(1 <<
> AMDGPU_RAS_BLOCK__UMC |
> +   1 <<
> AMDGPU_RAS_BLOCK__DF);
> + else
> + adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__PCIE_BIF |
> +  1 <<
> AMDGPU_RAS_BLOCK__SDMA |
> +  1 <<
> AMDGPU_RAS_BLOCK__GFX);
> +
> + /*
> +  * VCN/JPEG RAS can be supported on both bare metal and
> +  * SRIOV environment
> +  */
> + if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6,
> 0) ||
> + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0,
> 0) ||
> + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0,
> 3))
> + adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__VCN |
> +  1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> + else
> + adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__VCN |
> +   1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> +
> + /*
> +  * XGMI RAS is not supported if xgmi num physical nodes
> +  * is zero
> +  */
> + if (!adev->gmc.xgmi.num_physical_nodes)
> + adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__XGMI_WAFL);
> + } else {
> + dev_info(adev->dev, "SRAM ECC is not presented.\n");
> + }
> +}
> +
> +/* Query poison mode from umc/df IP callbacks */ static void
> +amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) {
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + bool df_poison, umc_poison;
> +
> + /* poison setting is useless on SRIOV guest */
> + if (amdgpu_sriov_vf(adev) || !con)
> + return;
> +
> + /* Init poison supported flag, the default value is false */
> + if (adev->gmc.xgmi.connected_to_cpu ||
> + adev->gmc.is_app_apu) {
> + /* enabled by default when GPU is connected to CPU */
> + con->poison_supported = true;
> + } else if (adev->df.funcs &&
> + adev->df.funcs->query_ras_poison_mode &&
> + adev->umc.ras &

[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

2024-01-01 Thread Hawking Zhang
Move ras capability check to amdgpu_ras_check_supported.
Driver will query ras capability through psp interface, or
vbios interface, or specific ip callbacks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
 1 file changed, 93 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5f302b7693b3..72b6e41329b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -39,6 +39,7 @@
 #include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
+#include "amdgpu_psp.h"
 
 #ifdef CONFIG_X86_MCE_AMD
 #include 
@@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device 
*adev)
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
 }
 
+/* Query ras capablity via atomfirmware interface */
+static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device 
*adev)
+{
+   /* mem_ecc cap */
+   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+   dev_info(adev->dev, "MEM ECC is active.\n");
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
+1 << AMDGPU_RAS_BLOCK__DF);
+   } else {
+   dev_info(adev->dev, "MEM ECC is not presented.\n");
+   }
+
+   /* sram_ecc cap */
+   if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+   dev_info(adev->dev, "SRAM ECC is active.\n");
+   if (!amdgpu_sriov_vf(adev))
+   adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+   else
+   adev->ras_hw_enabled |= (1 << 
AMDGPU_RAS_BLOCK__PCIE_BIF |
+1 << AMDGPU_RAS_BLOCK__SDMA |
+1 << AMDGPU_RAS_BLOCK__GFX);
+
+   /*
+* VCN/JPEG RAS can be supported on both bare metal and
+* SRIOV environment
+*/
+   if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3))
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+1 << AMDGPU_RAS_BLOCK__JPEG);
+   else
+   adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
+
+   /*
+* XGMI RAS is not supported if xgmi num physical nodes
+* is zero
+*/
+   if (!adev->gmc.xgmi.num_physical_nodes)
+   adev->ras_hw_enabled &= ~(1 << 
AMDGPU_RAS_BLOCK__XGMI_WAFL);
+   } else {
+   dev_info(adev->dev, "SRAM ECC is not presented.\n");
+   }
+}
+
+/* Query poison mode from umc/df IP callbacks */
+static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   bool df_poison, umc_poison;
+
+   /* poison setting is useless on SRIOV guest */
+   if (amdgpu_sriov_vf(adev) || !con)
+   return;
+
+   /* Init poison supported flag, the default value is false */
+   if (adev->gmc.xgmi.connected_to_cpu ||
+   adev->gmc.is_app_apu) {
+   /* enabled by default when GPU is connected to CPU */
+   con->poison_supported = true;
+   } else if (adev->df.funcs &&
+   adev->df.funcs->query_ras_poison_mode &&
+   adev->umc.ras &&
+   adev->umc.ras->query_ras_poison_mode) {
+   df_poison =
+   adev->df.funcs->query_ras_poison_mode(adev);
+   umc_poison =
+   adev->umc.ras->query_ras_poison_mode(adev);
+
+   /* Only poison is set in both DF and UMC, we can support it */
+   if (df_poison && umc_poison)
+   con->poison_supported = true;
+   else if (df_poison != umc_poison)
+   dev_warn(adev->dev,
+   "Poison setting is inconsistent in 
DF/UMC(%d:%d)!\n",
+   df_poison, umc_poison);
+   }
+}
+
 /*
  * check hardware's ras ability which will be saved in hw_supported.
  * if hardware does not support ras, we can skip some ras initializtion and
@@ -2696,49 +2778,13 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
if (!amdgpu_ras_asic_supported(adev))
return;
 
-   if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
-   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
-