RE: [PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3
[AMD Official Use Only - General] Reviewed-by: Tao Zhou > -----Original Message----- > From: Zhang, Hawking > Sent: Thursday, March 23, 2023 10:24 AM > To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, > Stanley ; Li, Candice ; Chai, > Thomas > Cc: Zhang, Hawking > Subject: [PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3 > > GPU will stop working once fatal error is detected. > it will inform driver to do reset to recover from the fatal error. > > Signed-off-by: Hawking Zhang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 > drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c | 79 + > drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h | 1 + > drivers/gpu/drm/amd/amdgpu/soc21.c | 15 - > 4 files changed, 105 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index c6dc3cd2a9de..5b1779021881 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -34,6 +34,7 @@ > #include "amdgpu_atomfirmware.h" > #include "amdgpu_xgmi.h" > #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" > +#include "nbio_v4_3.h" > #include "atom.h" > #include "amdgpu_reset.h" > > @@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev) > if (!adev->gmc.xgmi.connected_to_cpu) > adev->nbio.ras = &nbio_v7_4_ras; > break; > + case IP_VERSION(4, 3, 0): > + if (adev->ras_hw_enabled | AMDGPU_RAS_BLOCK__DF) > + /* unlike other generation of nbio ras, > + * nbio v4_3 only support fatal error interrupt > + * to inform software that DF is freezed due to > + * system fatal error event. driver should not > + * enable nbio ras in such case. 
Instead, > + * check DF RAS */ > + adev->nbio.ras = &nbio_v4_3_ras; > + break; > default: > /* nbio ras is not available */ > break; > diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c > b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c > index 09fdcd20cb91..d5ed9e0e1a5f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c > +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c > @@ -26,6 +26,7 @@ > > #include "nbio/nbio_4_3_0_offset.h" > #include "nbio/nbio_4_3_0_sh_mask.h" > +#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" > #include <uapi/linux/kfd_ioctl.h> > > static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev) @@ - > 538,3 +539,81 @@ const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs = { > .remap_hdp_registers = nbio_v4_3_remap_hdp_registers, > .get_rom_offset = nbio_v4_3_get_rom_offset, }; > + > +static int nbio_v4_3_set_ras_err_event_athub_irq_state(struct amdgpu_device > *adev, > +struct amdgpu_irq_src > *src, > +unsigned type, > +enum > amdgpu_interrupt_state state) { > + /* The ras_controller_irq enablement should be done in psp bl when it > + * tries to enable ras feature. Driver only need to set the correct > interrupt > + * vector for bare-metal and sriov use case respectively > + */ > + uint32_t bif_doorbell_int_cntl; > + > + bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, > regBIF_BX0_BIF_DOORBELL_INT_CNTL); > + bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl, > + BIF_BX0_BIF_DOORBELL_INT_CNTL, > + > RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE, > + (state == > AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1); > + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, > +bif_doorbell_int_cntl); > + > + return 0; > +} > + > +static int nbio_v4_3_process_err_event_athub_irq(struct amdgpu_device > *adev, > + struct amdgpu_irq_src > *source, > + struct amdgpu_iv_entry *entry) > +{ > + /* By design, the ih cookie for err_event_athub_irq should be written > + * to bif ring. since bif ring is not enabled, just leave process > callback > + * as a dummy one. 
> + */ > + return 0; > +} > + > +static const struct amdgpu_irq_src_funcs > nbio_v4_3_ras_err_event_athub_irq_funcs = { > + .set = nbio_v4_3_set_ras_err_event_athub_irq_state, > + .process = nbio
RE: [PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3
[Public] Reviewed-by: Candice Li Thanks, Candice -----Original Message----- From: Zhang, Hawking Sent: Thursday, March 23, 2023 10:24 AM To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, Stanley ; Li, Candice ; Chai, Thomas Cc: Zhang, Hawking Subject: [PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3 GPU will stop working once fatal error is detected. it will inform driver to do reset to recover from the fatal error. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c | 79 + drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h | 1 + drivers/gpu/drm/amd/amdgpu/soc21.c | 15 - 4 files changed, 105 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c6dc3cd2a9de..5b1779021881 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -34,6 +34,7 @@ #include "amdgpu_atomfirmware.h" #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" +#include "nbio_v4_3.h" #include "atom.h" #include "amdgpu_reset.h" @@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev) if (!adev->gmc.xgmi.connected_to_cpu) adev->nbio.ras = &nbio_v7_4_ras; break; + case IP_VERSION(4, 3, 0): + if (adev->ras_hw_enabled | AMDGPU_RAS_BLOCK__DF) + /* unlike other generation of nbio ras, +* nbio v4_3 only support fatal error interrupt +* to inform software that DF is freezed due to +* system fatal error event. driver should not +* enable nbio ras in such case. 
Instead, +* check DF RAS */ + adev->nbio.ras = &nbio_v4_3_ras; + break; default: /* nbio ras is not available */ break; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c index 09fdcd20cb91..d5ed9e0e1a5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c @@ -26,6 +26,7 @@ #include "nbio/nbio_4_3_0_offset.h" #include "nbio/nbio_4_3_0_sh_mask.h" +#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" #include <uapi/linux/kfd_ioctl.h> static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev) @@ -538,3 +539,81 @@ const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs = { .remap_hdp_registers = nbio_v4_3_remap_hdp_registers, .get_rom_offset = nbio_v4_3_get_rom_offset, }; + +static int nbio_v4_3_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *src, + unsigned type, + enum amdgpu_interrupt_state state) +{ + /* The ras_controller_irq enablement should be done in psp bl when it +* tries to enable ras feature. Driver only need to set the correct interrupt +* vector for bare-metal and sriov use case respectively +*/ + uint32_t bif_doorbell_int_cntl; + + bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, + RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE, + (state == AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1); + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl); + + return 0; +} + +static int nbio_v4_3_process_err_event_athub_irq(struct amdgpu_device *adev, +struct amdgpu_irq_src *source, +struct amdgpu_iv_entry *entry) +{ + /* By design, the ih cookie for err_event_athub_irq should be written +* to bif ring. since bif ring is not enabled, just leave process callback +* as a dummy one. 
+*/ + return 0; +} + +static const struct amdgpu_irq_src_funcs nbio_v4_3_ras_err_event_athub_irq_funcs = { + .set = nbio_v4_3_set_ras_err_event_athub_irq_state, + .process = nbio_v4_3_process_err_event_athub_irq, +}; + +static void nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev) +{ + uint32_t bif_doorbell_int_cntl; + + bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + if (REG_GET_FIELD(bif_doorbell_int_cntl, + BIF_DOORBELL_INT_CNTL, + RAS_ATHUB_ERR_EVENT_INTER
[PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3
GPU will stop working once fatal error is detected. it will inform driver to do reset to recover from the fatal error. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c | 79 + drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h | 1 + drivers/gpu/drm/amd/amdgpu/soc21.c | 15 - 4 files changed, 105 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c6dc3cd2a9de..5b1779021881 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -34,6 +34,7 @@ #include "amdgpu_atomfirmware.h" #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" +#include "nbio_v4_3.h" #include "atom.h" #include "amdgpu_reset.h" @@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev) if (!adev->gmc.xgmi.connected_to_cpu) adev->nbio.ras = &nbio_v7_4_ras; break; + case IP_VERSION(4, 3, 0): + if (adev->ras_hw_enabled | AMDGPU_RAS_BLOCK__DF) + /* unlike other generation of nbio ras, +* nbio v4_3 only support fatal error interrupt +* to inform software that DF is freezed due to +* system fatal error event. driver should not +* enable nbio ras in such case. 
Instead, +* check DF RAS */ + adev->nbio.ras = &nbio_v4_3_ras; + break; default: /* nbio ras is not available */ break; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c index 09fdcd20cb91..d5ed9e0e1a5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c @@ -26,6 +26,7 @@ #include "nbio/nbio_4_3_0_offset.h" #include "nbio/nbio_4_3_0_sh_mask.h" +#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" #include <uapi/linux/kfd_ioctl.h> static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev) @@ -538,3 +539,81 @@ const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs = { .remap_hdp_registers = nbio_v4_3_remap_hdp_registers, .get_rom_offset = nbio_v4_3_get_rom_offset, }; + +static int nbio_v4_3_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *src, + unsigned type, + enum amdgpu_interrupt_state state) +{ + /* The ras_controller_irq enablement should be done in psp bl when it +* tries to enable ras feature. Driver only need to set the correct interrupt +* vector for bare-metal and sriov use case respectively +*/ + uint32_t bif_doorbell_int_cntl; + + bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, + RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE, + (state == AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1); + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl); + + return 0; +} + +static int nbio_v4_3_process_err_event_athub_irq(struct amdgpu_device *adev, +struct amdgpu_irq_src *source, +struct amdgpu_iv_entry *entry) +{ + /* By design, the ih cookie for err_event_athub_irq should be written +* to bif ring. since bif ring is not enabled, just leave process callback +* as a dummy one. 
+*/ + return 0; +} + +static const struct amdgpu_irq_src_funcs nbio_v4_3_ras_err_event_athub_irq_funcs = { + .set = nbio_v4_3_set_ras_err_event_athub_irq_state, + .process = nbio_v4_3_process_err_event_athub_irq, +}; + +static void nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_device *adev) +{ + uint32_t bif_doorbell_int_cntl; + + bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + if (REG_GET_FIELD(bif_doorbell_int_cntl, + BIF_DOORBELL_INT_CNTL, + RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) { + /* driver has to clear the interrupt status when bif ring is disabled */ + bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl, + BIF_DOORBELL_INT_CNTL, + RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1); + WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_int_cntl); +