From: YiPeng Chai <[email protected]>

Add poison consumption handling for gfx v12_1.

Signed-off-by: YiPeng Chai <[email protected]>
Reviewed-by: Hawking Zhang <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c        | 44 +++++++++++++++++++
 .../drm/amd/amdkfd/kfd_int_process_v12_1.c    | 12 ++++-
 3 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 720ed3a2c78c6..2785eda6fea52 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -463,6 +463,7 @@ struct amdgpu_gfx {
        struct amdgpu_irq_src           cp_ecc_error_irq;
        struct amdgpu_irq_src           sq_irq;
        struct amdgpu_irq_src           rlc_gc_fed_irq;
+       struct amdgpu_irq_src           rlc_poison_irq;
        struct sq_work                  sq_work;
 
        /* gfx status */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
index 1a7ccfed8f0ac..db49582a211f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c
@@ -45,6 +45,7 @@
 #include "v12_structs.h"
 #include "gfx_v12_1.h"
 #include "mes_v12_1.h"
+#include "amdgpu_ras_mgr.h"
 
 #define GFX12_MEC_HPD_SIZE     2048
 #define NUM_SIMD_PER_CU_GFX12_1        4
@@ -1184,6 +1185,13 @@ static int gfx_v12_1_sw_init(struct amdgpu_ip_block *ip_block)
        if (r)
                return r;
 
+       /* RLC POISON Error */
+       r = amdgpu_irq_add_id(adev, SOC_V1_0_IH_CLIENTID_RLC,
+                               GFX_12_1_0__SRCID__RLC_POISON_INTERRUPT,
+                               &adev->gfx.rlc_poison_irq);
+       if (r)
+               return r;
+
        adev->gfx.gfx_current_status = AMDGPU_GFX_NORMAL_MODE;
 
        r = gfx_v12_1_rlc_init(adev);
@@ -3778,6 +3786,35 @@ static int gfx_v12_1_priv_inst_irq(struct amdgpu_device *adev,
        return 0;
 }
 
+static int gfx_v12_1_rlc_poison_irq(struct amdgpu_device *adev,
+                                 struct amdgpu_irq_src *source,
+                                 struct amdgpu_iv_entry *entry)
+{
+       uint32_t rlc_fed_status = 0;
+       uint32_t ras_blk = RAS_BLOCK_ID__GFX;
+       struct ras_ih_info ih_info = {0};
+       int i, num_xcc = NUM_XCC(adev->gfx.xcc_mask);
+
+       /* OR together RLC_RLCS_FED_STATUS of every XCC in xcc_mask. */
+       for (i = 0; i < num_xcc; i++)
+               rlc_fed_status |= RREG32(SOC15_REG_OFFSET(GC,
+                                       GET_INST(GC, i), regRLC_RLCS_FED_STATUS));
+
+       if (!rlc_fed_status)
+               return 0;
+
+       /* Report against SDMA if either SDMA FED bit is set, GFX otherwise. */
+       if (REG_GET_FIELD(rlc_fed_status, RLC_RLCS_FED_STATUS, SDMA0_FED_ERR) ||
+           REG_GET_FIELD(rlc_fed_status, RLC_RLCS_FED_STATUS, SDMA1_FED_ERR))
+               ras_blk = RAS_BLOCK_ID__SDMA;
+
+       dev_warn(adev->dev, "RLC %u FED IRQ\n", ras_blk);
+       ih_info.block = ras_blk;
+       ih_info.reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+       amdgpu_ras_mgr_dispatch_interrupt(adev, &ih_info);
+       return 0;
+}
+
 static void gfx_v12_1_emit_mem_sync(struct amdgpu_ring *ring)
 {
        const unsigned int gcr_cntl =
@@ -3902,6 +3939,10 @@ static const struct amdgpu_irq_src_funcs gfx_v12_1_priv_inst_irq_funcs = {
        .process = gfx_v12_1_priv_inst_irq,
 };
 
+static const struct amdgpu_irq_src_funcs gfx_v12_1_rlc_poison_irq_funcs = {
+       .process = gfx_v12_1_rlc_poison_irq, /* handler for adev->gfx.rlc_poison_irq */
+};
+
 static void gfx_v12_1_set_irq_funcs(struct amdgpu_device *adev)
 {
        adev->gfx.eop_irq.num_types = AMDGPU_CP_IRQ_LAST;
@@ -3912,6 +3953,9 @@ static void gfx_v12_1_set_irq_funcs(struct amdgpu_device *adev)
 
        adev->gfx.priv_inst_irq.num_types = 1;
        adev->gfx.priv_inst_irq.funcs = &gfx_v12_1_priv_inst_irq_funcs;
+
+       adev->gfx.rlc_poison_irq.num_types = 1;
+       adev->gfx.rlc_poison_irq.funcs = &gfx_v12_1_rlc_poison_irq_funcs;
 }
 
 static void gfx_v12_1_set_imu_funcs(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c
index 5d8b5aa194da8..47947b94926ba 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c
@@ -28,6 +28,7 @@
 #include "ivsrcid/vmc/irqsrcs_vmc_1_0.h"
 #include "kfd_smi_events.h"
 #include "kfd_debug.h"
+#include "amdgpu_ras_mgr.h"
 
 /*
  * GFX12.1 SQ Interrupts
@@ -185,6 +186,7 @@ static void event_interrupt_poison_consumption_v12_1(struct kfd_node *node,
        enum amdgpu_ras_block block = 0;
        int ret = -EINVAL;
        uint32_t reset = 0;
+       u64 event_id = RAS_EVENT_INVALID_ID;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
 
        if (!p)
@@ -220,7 +222,15 @@ static void event_interrupt_poison_consumption_v12_1(struct kfd_node *node,
         * resetting queue passes, do page retirement without gpu reset
         * resetting queue fails, fallback to gpu reset solution
         */
-       amdgpu_amdkfd_ras_poison_consumption_handler(node->adev, block, reset);
+       if (amdgpu_uniras_enabled(node->adev))
+               event_id = amdgpu_ras_mgr_gen_ras_event_seqno(node->adev,
+                                       RAS_SEQNO_TYPE_POISON_CONSUMPTION);
+
+       RAS_EVENT_LOG(node->adev, event_id,
+                     "poison is consumed by source %d, kick off gpu reset flow\n", source_id);
+
+       amdgpu_amdkfd_ras_pasid_poison_consumption_handler(node->adev,
+                               block, pasid, NULL, NULL, reset);
 }
 
 static bool event_interrupt_isr_v12_1(struct kfd_node *node,
-- 
2.53.0

Reply via email to