The GPU stops working once a fatal error is detected. It raises an
interrupt to inform the driver, which then performs a reset to recover
from the fatal error.

Signed-off-by: Hawking Zhang <hawking.zh...@amd.com>
---
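Recovery flow in this patch: when DF freezes on a system fatal error, the
ATHUB err_event interrupt fires. Since the BIF ring is not enabled on nbio
v4_3, nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring() checks and
clears RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS in BIF_BX0_BIF_DOORBELL_INT_CNTL
and then calls amdgpu_ras_global_ras_isr() to kick off the reset. The
interrupt is enabled in soc21_common_late_init() and released in
soc21_common_hw_fini().

The register handling is a plain read-modify-write of the doorbell interrupt
control register. Below is a minimal, self-contained userspace C sketch of
that pattern, for illustration only; the field masks/shifts and register
value are invented and do not come from nbio_4_3_0_sh_mask.h:

#include <stdint.h>
#include <stdio.h>

/* hypothetical field layout, for illustration only */
#define ATHUB_ERR_INT_STATUS_MASK   0x00000002u
#define ATHUB_ERR_INT_STATUS_SHIFT  1
#define ATHUB_ERR_INT_CLEAR_MASK    0x00000004u
#define ATHUB_ERR_INT_CLEAR_SHIFT   2

/* read a bit field out of a register value */
static uint32_t reg_get_field(uint32_t reg, uint32_t mask, unsigned int shift)
{
	return (reg & mask) >> shift;
}

/* replace a bit field in a register value */
static uint32_t reg_set_field(uint32_t reg, uint32_t mask, unsigned int shift,
			      uint32_t val)
{
	return (reg & ~mask) | ((val << shift) & mask);
}

int main(void)
{
	/* pretend the status bit was latched by a fatal error */
	uint32_t doorbell_int_cntl = 0x00000002u;

	if (reg_get_field(doorbell_int_cntl, ATHUB_ERR_INT_STATUS_MASK,
			  ATHUB_ERR_INT_STATUS_SHIFT)) {
		/* set the clear bit and write the value back, as the patch
		 * does when the bif ring is disabled */
		doorbell_int_cntl = reg_set_field(doorbell_int_cntl,
						  ATHUB_ERR_INT_CLEAR_MASK,
						  ATHUB_ERR_INT_CLEAR_SHIFT, 1);
		printf("doorbell_int_cntl after clear: 0x%08x\n",
		       (unsigned int)doorbell_int_cntl);
		/* at this point the real driver calls amdgpu_ras_global_ras_isr() */
	}

	return 0;
}
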
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 ++++
 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c  | 79 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/soc21.c      | 15 ++++-
 4 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c6dc3cd2a9de..5b1779021881 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -34,6 +34,7 @@
 #include "amdgpu_atomfirmware.h"
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+#include "nbio_v4_3.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
 
@@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
                if (!adev->gmc.xgmi.connected_to_cpu)
                        adev->nbio.ras = &nbio_v7_4_ras;
                break;
+       case IP_VERSION(4, 3, 0):
+               if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
+                       /* unlike other generations of nbio ras,
+                        * nbio v4_3 only supports the fatal error interrupt
+                        * to inform software that DF is frozen due to a
+                        * system fatal error event. The driver should not
+                        * enable nbio ras in such a case. Instead,
+                        * check DF RAS. */
+                       adev->nbio.ras = &nbio_v4_3_ras;
+               break;
        default:
                /* nbio ras is not available */
                break;
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
index 09fdcd20cb91..d5ed9e0e1a5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
@@ -26,6 +26,7 @@
 
 #include "nbio/nbio_4_3_0_offset.h"
 #include "nbio/nbio_4_3_0_sh_mask.h"
+#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include <uapi/linux/kfd_ioctl.h>
 
 static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev)
@@ -538,3 +539,81 @@ const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs = {
        .remap_hdp_registers = nbio_v4_3_remap_hdp_registers,
        .get_rom_offset = nbio_v4_3_get_rom_offset,
 };
+
+static int nbio_v4_3_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev,
+                                                       struct amdgpu_irq_src *src,
+                                                       unsigned type,
+                                                       enum amdgpu_interrupt_state state)
+{
+       /* The ras_controller_irq enablement should be done in psp bl when it
+        * tries to enable the ras feature. The driver only needs to set the correct
+        * interrupt vector for the bare-metal and sriov use cases respectively.
+        */
+       uint32_t bif_doorbell_int_cntl;
+
+       bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+       bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
+                                             BIF_BX0_BIF_DOORBELL_INT_CNTL,
+                                             RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE,
+                                             (state == AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1);
+       WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl);
+
+       return 0;
+}
+
+static int nbio_v4_3_process_err_event_athub_irq(struct amdgpu_device *adev,
+                                                struct amdgpu_irq_src *source,
+                                                struct amdgpu_iv_entry *entry)
+{
+       /* By design, the ih cookie for err_event_athub_irq should be written
+        * to the bif ring. Since the bif ring is not enabled, just leave the
+        * process callback as a dummy one.
+        */
+       return 0;
+}
+
+static const struct amdgpu_irq_src_funcs nbio_v4_3_ras_err_event_athub_irq_funcs = {
+       .set = nbio_v4_3_set_ras_err_event_athub_irq_state,
+       .process = nbio_v4_3_process_err_event_athub_irq,
+};
+
+static void nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev)
+{
+       uint32_t bif_doorbell_int_cntl;
+
+       bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+       if (REG_GET_FIELD(bif_doorbell_int_cntl,
+                         BIF_BX0_BIF_DOORBELL_INT_CNTL,
+                         RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) {
+               /* driver has to clear the interrupt status when bif ring is disabled */
+               bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
+                                               BIF_BX0_BIF_DOORBELL_INT_CNTL,
+                                               RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
+               WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl);
+               amdgpu_ras_global_ras_isr(adev);
+       }
+}
+
+static int nbio_v4_3_init_ras_err_event_athub_interrupt(struct amdgpu_device *adev)
+{
+
+       int r;
+
+       /* init the irq funcs */
+       adev->nbio.ras_err_event_athub_irq.funcs =
+               &nbio_v4_3_ras_err_event_athub_irq_funcs;
+       adev->nbio.ras_err_event_athub_irq.num_types = 1;
+
+       /* register ras err event athub interrupt
+        * nbio v4_3 uses the same irq source as nbio v7_4 */
+       r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_BIF,
+                             NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT,
+                             &adev->nbio.ras_err_event_athub_irq);
+
+       return r;
+}
+
+struct amdgpu_nbio_ras nbio_v4_3_ras = {
+       .handle_ras_err_event_athub_intr_no_bifring = nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring,
+       .init_ras_err_event_athub_interrupt = nbio_v4_3_init_ras_err_event_athub_interrupt,
+};
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h
index 711999ceedf4..399037cdf4fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h
@@ -29,5 +29,6 @@
 extern const struct nbio_hdp_flush_reg nbio_v4_3_hdp_flush_reg;
 extern const struct amdgpu_nbio_funcs nbio_v4_3_funcs;
 extern const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs;
+extern struct amdgpu_nbio_ras nbio_v4_3_ras;
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c
index 67580761b44d..514bfc705d5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -754,6 +754,14 @@ static int soc21_common_late_init(void *handle)
                                                             sriov_vcn_4_0_0_video_codecs_decode_array_vcn0,
                                                             ARRAY_SIZE(sriov_vcn_4_0_0_video_codecs_decode_array_vcn0));
                }
+       } else {
+               if (adev->nbio.ras &&
+                   adev->nbio.ras_err_event_athub_irq.funcs)
+                       /* don't need to fail gpu late init
+                        * if enabling the athub_err_event interrupt failed;
+                        * nbio v4_3 only supports fatal error handling,
+                        * so just enable the interrupt directly */
+                       amdgpu_irq_get(adev, &adev->nbio.ras_err_event_athub_irq, 0);
        }
 
        return 0;
@@ -801,8 +809,13 @@ static int soc21_common_hw_fini(void *handle)
        /* disable the doorbell aperture */
        soc21_enable_doorbell_aperture(adev, false);
 
-       if (amdgpu_sriov_vf(adev))
+       if (amdgpu_sriov_vf(adev)) {
                xgpu_nv_mailbox_put_irq(adev);
+       } else {
+               if (adev->nbio.ras &&
+                   adev->nbio.ras_err_event_athub_irq.funcs)
+                       amdgpu_irq_put(adev, 
&adev->nbio.ras_err_event_athub_irq, 0);
+       }
 
        return 0;
 }
-- 
2.17.1
