RE: [PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3

2023-03-22 Thread Zhou1, Tao
[AMD Official Use Only - General]

Reviewed-by: Tao Zhou 

> -Original Message-
> From: Zhang, Hawking 
> Sent: Thursday, March 23, 2023 10:24 AM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang,
> Stanley ; Li, Candice ; Chai,
> Thomas 
> Cc: Zhang, Hawking 
> Subject: [PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3
> 
> GPU will stop working once fatal error is detected.
> it will inform driver to do reset to recover from the fatal error.
> 
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 
> drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c  | 79 +
> drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h  |  1 +
>  drivers/gpu/drm/amd/amdgpu/soc21.c  | 15 -
>  4 files changed, 105 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index c6dc3cd2a9de..5b1779021881 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -34,6 +34,7 @@
>  #include "amdgpu_atomfirmware.h"
>  #include "amdgpu_xgmi.h"
>  #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
> +#include "nbio_v4_3.h"
>  #include "atom.h"
>  #include "amdgpu_reset.h"
> 
> @@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   if (!adev->gmc.xgmi.connected_to_cpu)
>   adev->nbio.ras = _v7_4_ras;
>   break;
> + case IP_VERSION(4, 3, 0):
> + if (adev->ras_hw_enabled | AMDGPU_RAS_BLOCK__DF)
> + /* unlike other generation of nbio ras,
> +  * nbio v4_3 only support fatal error interrupt
> +  * to inform software that DF is freezed due to
> +  * system fatal error event. driver should not
> +  * enable nbio ras in such case. Instead,
> +  * check DF RAS */
> + adev->nbio.ras = _v4_3_ras;
> + break;
>   default:
>   /* nbio ras is not available */
>   break;
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
> b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
> index 09fdcd20cb91..d5ed9e0e1a5f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
> @@ -26,6 +26,7 @@
> 
>  #include "nbio/nbio_4_3_0_offset.h"
>  #include "nbio/nbio_4_3_0_sh_mask.h"
> +#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
>  #include 
> 
>  static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev) @@ -
> 538,3 +539,81 @@ const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs = {
>   .remap_hdp_registers = nbio_v4_3_remap_hdp_registers,
>   .get_rom_offset = nbio_v4_3_get_rom_offset,  };
> +
> +static int nbio_v4_3_set_ras_err_event_athub_irq_state(struct amdgpu_device
> *adev,
> +struct amdgpu_irq_src 
> *src,
> +unsigned type,
> +enum
> amdgpu_interrupt_state state) {
> + /* The ras_controller_irq enablement should be done in psp bl when it
> +  * tries to enable ras feature. Driver only need to set the correct
> interrupt
> +  * vector for bare-metal and sriov use case respectively
> +  */
> + uint32_t bif_doorbell_int_cntl;
> +
> + bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0,
> regBIF_BX0_BIF_DOORBELL_INT_CNTL);
> + bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
> +   BIF_BX0_BIF_DOORBELL_INT_CNTL,
> +
> RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE,
> +   (state ==
> AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1);
> + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL,
> +bif_doorbell_int_cntl);
> +
> + return 0;
> +}
> +
> +static int nbio_v4_3_process_err_event_athub_irq(struct amdgpu_device
> *adev,
> +  struct amdgpu_irq_src
> *source,
> +  struct amdgpu_iv_entry *entry)
> +{
> + /* By design, the ih cookie for err_event_athub_irq should be written
> +  * to bif ring. since bif ring is not enabled, just leave process 
> callback
> +  * as a dummy one.
> +  */
> + return 0;
> +}
> +
> +static const struct amdgpu_irq_src_funcs
> nbio_v4_3_ras_err_event_athub_irq_funcs = {
> + .set = nbio_v4_3_set_ras_err_event_athub_irq_state,
> + .process = nbio

RE: [PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3

2023-03-22 Thread Li, Candice
[Public]

Reviewed-by: Candice Li 



Thanks,
Candice

-Original Message-
From: Zhang, Hawking  
Sent: Thursday, March 23, 2023 10:24 AM
To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, 
Stanley ; Li, Candice ; Chai, Thomas 

Cc: Zhang, Hawking 
Subject: [PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3

GPU will stop working once fatal error is detected.
it will inform driver to do reset to recover from
the fatal error.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 
 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c  | 79 +
 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/soc21.c  | 15 -
 4 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c6dc3cd2a9de..5b1779021881 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -34,6 +34,7 @@
 #include "amdgpu_atomfirmware.h"
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+#include "nbio_v4_3.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
 
@@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.connected_to_cpu)
adev->nbio.ras = _v7_4_ras;
break;
+   case IP_VERSION(4, 3, 0):
+   if (adev->ras_hw_enabled | AMDGPU_RAS_BLOCK__DF)
+   /* unlike other generation of nbio ras,
+* nbio v4_3 only support fatal error interrupt
+* to inform software that DF is freezed due to
+* system fatal error event. driver should not
+* enable nbio ras in such case. Instead,
+* check DF RAS */
+   adev->nbio.ras = _v4_3_ras;
+   break;
default:
/* nbio ras is not available */
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
index 09fdcd20cb91..d5ed9e0e1a5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
@@ -26,6 +26,7 @@
 
 #include "nbio/nbio_4_3_0_offset.h"
 #include "nbio/nbio_4_3_0_sh_mask.h"
+#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include 
 
 static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev)
@@ -538,3 +539,81 @@ const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs = {
.remap_hdp_registers = nbio_v4_3_remap_hdp_registers,
.get_rom_offset = nbio_v4_3_get_rom_offset,
 };
+
+static int nbio_v4_3_set_ras_err_event_athub_irq_state(struct amdgpu_device 
*adev,
+  struct amdgpu_irq_src 
*src,
+  unsigned type,
+  enum 
amdgpu_interrupt_state state)
+{
+   /* The ras_controller_irq enablement should be done in psp bl when it
+* tries to enable ras feature. Driver only need to set the correct 
interrupt
+* vector for bare-metal and sriov use case respectively
+*/
+   uint32_t bif_doorbell_int_cntl;
+
+   bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, 
regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+   bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
+ BIF_BX0_BIF_DOORBELL_INT_CNTL,
+ 
RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE,
+ (state == 
AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1);
+   WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, 
bif_doorbell_int_cntl);
+
+   return 0;
+}
+
+static int nbio_v4_3_process_err_event_athub_irq(struct amdgpu_device *adev,
+struct amdgpu_irq_src *source,
+struct amdgpu_iv_entry *entry)
+{
+   /* By design, the ih cookie for err_event_athub_irq should be written
+* to bif ring. since bif ring is not enabled, just leave process 
callback
+* as a dummy one.
+*/
+   return 0;
+}
+
+static const struct amdgpu_irq_src_funcs 
nbio_v4_3_ras_err_event_athub_irq_funcs = {
+   .set = nbio_v4_3_set_ras_err_event_athub_irq_state,
+   .process = nbio_v4_3_process_err_event_athub_irq,
+};
+
+static void nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring(struct 
amdgpu_device *adev)
+{
+   uint32_t bif_doorbell_int_cntl;
+
+   bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, 
regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+   if (REG_GET_FIELD(bif_doorbell_int_cntl,
+ BIF_DOORBELL_INT_CNTL,
+ RAS_ATHUB_ERR_EVENT_INTER

[PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3

2023-03-22 Thread Hawking Zhang
GPU will stop working once fatal error is detected.
it will inform driver to do reset to recover from
the fatal error.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 
 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c  | 79 +
 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/soc21.c  | 15 -
 4 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c6dc3cd2a9de..5b1779021881 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -34,6 +34,7 @@
 #include "amdgpu_atomfirmware.h"
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+#include "nbio_v4_3.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
 
@@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.connected_to_cpu)
adev->nbio.ras = _v7_4_ras;
break;
+   case IP_VERSION(4, 3, 0):
+   if (adev->ras_hw_enabled | AMDGPU_RAS_BLOCK__DF)
+   /* unlike other generation of nbio ras,
+* nbio v4_3 only support fatal error interrupt
+* to inform software that DF is freezed due to
+* system fatal error event. driver should not
+* enable nbio ras in such case. Instead,
+* check DF RAS */
+   adev->nbio.ras = _v4_3_ras;
+   break;
default:
/* nbio ras is not available */
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
index 09fdcd20cb91..d5ed9e0e1a5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
@@ -26,6 +26,7 @@
 
 #include "nbio/nbio_4_3_0_offset.h"
 #include "nbio/nbio_4_3_0_sh_mask.h"
+#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include 
 
 static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev)
@@ -538,3 +539,81 @@ const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs = {
.remap_hdp_registers = nbio_v4_3_remap_hdp_registers,
.get_rom_offset = nbio_v4_3_get_rom_offset,
 };
+
+static int nbio_v4_3_set_ras_err_event_athub_irq_state(struct amdgpu_device 
*adev,
+  struct amdgpu_irq_src 
*src,
+  unsigned type,
+  enum 
amdgpu_interrupt_state state)
+{
+   /* The ras_controller_irq enablement should be done in psp bl when it
+* tries to enable ras feature. Driver only need to set the correct 
interrupt
+* vector for bare-metal and sriov use case respectively
+*/
+   uint32_t bif_doorbell_int_cntl;
+
+   bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, 
regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+   bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
+ BIF_BX0_BIF_DOORBELL_INT_CNTL,
+ 
RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE,
+ (state == 
AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1);
+   WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, 
bif_doorbell_int_cntl);
+
+   return 0;
+}
+
+static int nbio_v4_3_process_err_event_athub_irq(struct amdgpu_device *adev,
+struct amdgpu_irq_src *source,
+struct amdgpu_iv_entry *entry)
+{
+   /* By design, the ih cookie for err_event_athub_irq should be written
+* to bif ring. since bif ring is not enabled, just leave process 
callback
+* as a dummy one.
+*/
+   return 0;
+}
+
+static const struct amdgpu_irq_src_funcs 
nbio_v4_3_ras_err_event_athub_irq_funcs = {
+   .set = nbio_v4_3_set_ras_err_event_athub_irq_state,
+   .process = nbio_v4_3_process_err_event_athub_irq,
+};
+
+static void nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring(struct 
amdgpu_device *adev)
+{
+   uint32_t bif_doorbell_int_cntl;
+
+   bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, 
regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+   if (REG_GET_FIELD(bif_doorbell_int_cntl,
+ BIF_DOORBELL_INT_CNTL,
+ RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) {
+   /* driver has to clear the interrupt status when bif ring is 
disabled */
+   bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
+   BIF_DOORBELL_INT_CNTL,
+   
RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
+   WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, 
bif_doorbell_int_cntl);
+