RE: [PATCH 09/10] drm/amdgpu: Rework pcie_bif ras sw_init

2023-03-13 Thread Zhang, Hawking
[AMD Official Use Only - General]

RE - The judgement condition should be changed to ip_versions[][].

Thanks for catching that. It was caused by code rebase. I'll fix it

Regards,
Hawking

-Original Message-
From: Yang, Stanley  
Sent: Monday, March 13, 2023 14:12
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org; 
Zhou1, Tao ; Li, Candice ; Chai, Thomas 

Subject: RE: [PATCH 09/10] drm/amdgpu: Rework pcie_bif ras sw_init

[AMD Official Use Only - General]

Without the inline comments, the series looks fine to me.

Reviewed-by: Stanley.Yang 

Regards,
Stanley
> -Original Message-
> From: Zhang, Hawking 
> Sent: Monday, March 13, 2023 9:44 AM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; 
> Yang, Stanley ; Li, Candice 
> ; Chai, Thomas 
> Cc: Zhang, Hawking 
> Subject: [PATCH 09/10] drm/amdgpu: Rework pcie_bif ras sw_init
> 
> pcie_bif ras blocks needs to be initialized as early as possible to 
> handle fatal error detected in hw_init phase. also align the pcie_bif 
> ras sw_init with other ras blocks
> 
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 23
> +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  1 + 
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 17 ++---
>  3 files changed, 34 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> index 37d779b8e4a6..a3bc00577a7c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> @@ -22,6 +22,29 @@
>  #include "amdgpu.h"
>  #include "amdgpu_ras.h"
> 
> +int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev) {
> + int err;
> + struct amdgpu_nbio_ras *ras;
> +
> + if (!adev->nbio.ras)
> + return 0;
> +
> + ras = adev->nbio.ras;
> + err = amdgpu_ras_register_ras_block(adev, >ras_block);
> + if (err) {
> + dev_err(adev->dev, "Failed to register pcie_bif ras block!\n");
> + return err;
> + }
> +
> + strcpy(ras->ras_block.ras_comm.name, "pcie_bif");
> + ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__PCIE_BIF;
> + ras->ras_block.ras_comm.type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> + adev->nbio.ras_if = >ras_block.ras_comm;
> +
> + return 0;
> +}
> +
>  int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct 
> ras_common_if *ras_block)  {
>   int r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> index a240336bbc6b..c686ff4bcc39 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> @@ -106,5 +106,6 @@ struct amdgpu_nbio {
>   struct amdgpu_nbio_ras  *ras;
>  };
> 
> +int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev);
>  int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct 
> ras_common_if *ras_block);  #endif diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 63dfcc98152d..834092099bff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2555,20 +2555,23 @@ int amdgpu_ras_init(struct amdgpu_device
> *adev)
>* ras functions so hardware fatal error interrupt
>* can be enabled as early as possible */
>   switch (adev->asic_type) {

[Stanley]: The judgement condition should be changed to ip_versions[][].

> - case CHIP_VEGA20:
> - case CHIP_ARCTURUS:
> - case CHIP_ALDEBARAN:
> - if (!adev->gmc.xgmi.connected_to_cpu) {
> + case IP_VERSION(7, 4, 0):
> + case IP_VERSION(7, 4, 1):
> + case IP_VERSION(7, 4, 4):
> + if (!adev->gmc.xgmi.connected_to_cpu)
>   adev->nbio.ras = _v7_4_ras;
> - amdgpu_ras_register_ras_block(adev, 
> >nbio.ras->ras_block);
> - adev->nbio.ras_if = >nbio.ras-
> >ras_block.ras_comm;
> - }
>   break;
>   default:
>   /* nbio ras is not available */
>   break;
>   }
> 
> + /* nbio ras block needs to be enabled ahead of other ras blocks
> +  * to handle fatal error */
> + r = amdgpu_nbio_ras_sw_init(adev);
> + if (r)
> + return r;
> +
>   if (adev->nbio.ras &&
>   adev->nbio.ras->init_ras_controller_interrupt) {
>   r = adev->nbio.ras->init_ras_controller_interrupt(adev);
> --
> 2.17.1


RE: [PATCH 09/10] drm/amdgpu: Rework pcie_bif ras sw_init

2023-03-13 Thread Yang, Stanley
[AMD Official Use Only - General]

Without the inline comments, the series looks fine to me.

Reviewed-by: Stanley.Yang 

Regards,
Stanley
> -Original Message-
> From: Zhang, Hawking 
> Sent: Monday, March 13, 2023 9:44 AM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ;
> Yang, Stanley ; Li, Candice ;
> Chai, Thomas 
> Cc: Zhang, Hawking 
> Subject: [PATCH 09/10] drm/amdgpu: Rework pcie_bif ras sw_init
> 
> pcie_bif ras blocks needs to be initialized as early as possible to handle 
> fatal
> error detected in hw_init phase. also align the pcie_bif ras sw_init with 
> other
> ras blocks
> 
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 23
> +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 17 ++---
>  3 files changed, 34 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> index 37d779b8e4a6..a3bc00577a7c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> @@ -22,6 +22,29 @@
>  #include "amdgpu.h"
>  #include "amdgpu_ras.h"
> 
> +int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev) {
> + int err;
> + struct amdgpu_nbio_ras *ras;
> +
> + if (!adev->nbio.ras)
> + return 0;
> +
> + ras = adev->nbio.ras;
> + err = amdgpu_ras_register_ras_block(adev, >ras_block);
> + if (err) {
> + dev_err(adev->dev, "Failed to register pcie_bif ras block!\n");
> + return err;
> + }
> +
> + strcpy(ras->ras_block.ras_comm.name, "pcie_bif");
> + ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__PCIE_BIF;
> + ras->ras_block.ras_comm.type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> + adev->nbio.ras_if = >ras_block.ras_comm;
> +
> + return 0;
> +}
> +
>  int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct
> ras_common_if *ras_block)  {
>   int r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> index a240336bbc6b..c686ff4bcc39 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
> @@ -106,5 +106,6 @@ struct amdgpu_nbio {
>   struct amdgpu_nbio_ras  *ras;
>  };
> 
> +int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev);
>  int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct
> ras_common_if *ras_block);  #endif diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 63dfcc98152d..834092099bff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2555,20 +2555,23 @@ int amdgpu_ras_init(struct amdgpu_device
> *adev)
>* ras functions so hardware fatal error interrupt
>* can be enabled as early as possible */
>   switch (adev->asic_type) {

[Stanley]: The judgement condition should be changed to ip_versions[][].

> - case CHIP_VEGA20:
> - case CHIP_ARCTURUS:
> - case CHIP_ALDEBARAN:
> - if (!adev->gmc.xgmi.connected_to_cpu) {
> + case IP_VERSION(7, 4, 0):
> + case IP_VERSION(7, 4, 1):
> + case IP_VERSION(7, 4, 4):
> + if (!adev->gmc.xgmi.connected_to_cpu)
>   adev->nbio.ras = _v7_4_ras;
> - amdgpu_ras_register_ras_block(adev, 
> >nbio.ras->ras_block);
> - adev->nbio.ras_if = >nbio.ras-
> >ras_block.ras_comm;
> - }
>   break;
>   default:
>   /* nbio ras is not available */
>   break;
>   }
> 
> + /* nbio ras block needs to be enabled ahead of other ras blocks
> +  * to handle fatal error */
> + r = amdgpu_nbio_ras_sw_init(adev);
> + if (r)
> + return r;
> +
>   if (adev->nbio.ras &&
>   adev->nbio.ras->init_ras_controller_interrupt) {
>   r = adev->nbio.ras->init_ras_controller_interrupt(adev);
> --
> 2.17.1


[PATCH 09/10] drm/amdgpu: Rework pcie_bif ras sw_init

2023-03-12 Thread Hawking Zhang
pcie_bif ras blocks needs to be initialized as early
as possible to handle fatal error detected in hw_init
phase. also align the pcie_bif ras sw_init with other
ras blocks

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 23 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 17 ++---
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
index 37d779b8e4a6..a3bc00577a7c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
@@ -22,6 +22,29 @@
 #include "amdgpu.h"
 #include "amdgpu_ras.h"
 
+int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_nbio_ras *ras;
+
+   if (!adev->nbio.ras)
+   return 0;
+
+   ras = adev->nbio.ras;
+   err = amdgpu_ras_register_ras_block(adev, >ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register pcie_bif ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "pcie_bif");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__PCIE_BIF;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->nbio.ras_if = >ras_block.ras_comm;
+
+   return 0;
+}
+
 int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block)
 {
int r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index a240336bbc6b..c686ff4bcc39 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -106,5 +106,6 @@ struct amdgpu_nbio {
struct amdgpu_nbio_ras  *ras;
 };
 
+int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 63dfcc98152d..834092099bff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2555,20 +2555,23 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 * ras functions so hardware fatal error interrupt
 * can be enabled as early as possible */
switch (adev->asic_type) {
-   case CHIP_VEGA20:
-   case CHIP_ARCTURUS:
-   case CHIP_ALDEBARAN:
-   if (!adev->gmc.xgmi.connected_to_cpu) {
+   case IP_VERSION(7, 4, 0):
+   case IP_VERSION(7, 4, 1):
+   case IP_VERSION(7, 4, 4):
+   if (!adev->gmc.xgmi.connected_to_cpu)
adev->nbio.ras = _v7_4_ras;
-   amdgpu_ras_register_ras_block(adev, 
>nbio.ras->ras_block);
-   adev->nbio.ras_if = >nbio.ras->ras_block.ras_comm;
-   }
break;
default:
/* nbio ras is not available */
break;
}
 
+   /* nbio ras block needs to be enabled ahead of other ras blocks
+* to handle fatal error */
+   r = amdgpu_nbio_ras_sw_init(adev);
+   if (r)
+   return r;
+
if (adev->nbio.ras &&
adev->nbio.ras->init_ras_controller_interrupt) {
r = adev->nbio.ras->init_ras_controller_interrupt(adev);
-- 
2.17.1