From: YiPeng Chai <[email protected]> Add poison consumption handling for gfx v12_1.
Signed-off-by: YiPeng Chai <[email protected]> Reviewed-by: Hawking Zhang <[email protected]> Signed-off-by: Alex Deucher <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c | 44 +++++++++++++++++++ .../drm/amd/amdkfd/kfd_int_process_v12_1.c | 12 ++++- 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 720ed3a2c78c6..2785eda6fea52 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -463,6 +463,7 @@ struct amdgpu_gfx { struct amdgpu_irq_src cp_ecc_error_irq; struct amdgpu_irq_src sq_irq; struct amdgpu_irq_src rlc_gc_fed_irq; + struct amdgpu_irq_src rlc_poison_irq; struct sq_work sq_work; /* gfx status */ diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c index 1a7ccfed8f0ac..db49582a211f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c @@ -45,6 +45,7 @@ #include "v12_structs.h" #include "gfx_v12_1.h" #include "mes_v12_1.h" +#include "amdgpu_ras_mgr.h" #define GFX12_MEC_HPD_SIZE 2048 #define NUM_SIMD_PER_CU_GFX12_1 4 @@ -1184,6 +1185,13 @@ static int gfx_v12_1_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + /* RLC POISON Error */ + r = amdgpu_irq_add_id(adev, SOC_V1_0_IH_CLIENTID_RLC, + GFX_12_1_0__SRCID__RLC_POISON_INTERRUPT, + &adev->gfx.rlc_poison_irq); + if (r) + return r; + adev->gfx.gfx_current_status = AMDGPU_GFX_NORMAL_MODE; r = gfx_v12_1_rlc_init(adev); @@ -3778,6 +3786,35 @@ static int gfx_v12_1_priv_inst_irq(struct amdgpu_device *adev, return 0; } +static int gfx_v12_1_rlc_poison_irq(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +{ + uint32_t rlc_fed_status = 0; + uint32_t ras_blk = RAS_BLOCK_ID__GFX; + struct ras_ih_info ih_info = {0}; + int i, num_xcc; + + num_xcc = NUM_XCC(adev->gfx.xcc_mask); + for (i = 0; i < num_xcc; i++) + rlc_fed_status |= RREG32(SOC15_REG_OFFSET(GC, + GET_INST(GC, i), regRLC_RLCS_FED_STATUS)); + + if (!rlc_fed_status) + return 0; + + if (REG_GET_FIELD(rlc_fed_status, RLC_RLCS_FED_STATUS, SDMA0_FED_ERR) || + REG_GET_FIELD(rlc_fed_status, RLC_RLCS_FED_STATUS, SDMA1_FED_ERR)) + ras_blk = RAS_BLOCK_ID__SDMA; + + dev_warn(adev->dev, "RLC %d FED IRQ\n", ras_blk); + + ih_info.block = ras_blk; + ih_info.reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + amdgpu_ras_mgr_dispatch_interrupt(adev, &ih_info); + return 0; +} + static void gfx_v12_1_emit_mem_sync(struct amdgpu_ring *ring) { const unsigned int gcr_cntl = @@ -3902,6 +3939,10 @@ static const struct amdgpu_irq_src_funcs gfx_v12_1_priv_inst_irq_funcs = { .process = gfx_v12_1_priv_inst_irq, }; +static const struct amdgpu_irq_src_funcs gfx_v12_1_rlc_poison_irq_funcs = { + .process = gfx_v12_1_rlc_poison_irq, +}; + static void gfx_v12_1_set_irq_funcs(struct amdgpu_device *adev) { adev->gfx.eop_irq.num_types = AMDGPU_CP_IRQ_LAST; @@ -3912,6 +3953,9 @@ static void gfx_v12_1_set_irq_funcs(struct amdgpu_device *adev) adev->gfx.priv_inst_irq.num_types = 1; adev->gfx.priv_inst_irq.funcs = &gfx_v12_1_priv_inst_irq_funcs; + + adev->gfx.rlc_poison_irq.num_types = 1; + adev->gfx.rlc_poison_irq.funcs = &gfx_v12_1_rlc_poison_irq_funcs; } static void gfx_v12_1_set_imu_funcs(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c index 5d8b5aa194da8..47947b94926ba 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v12_1.c @@ -28,6 +28,7 @@ #include "ivsrcid/vmc/irqsrcs_vmc_1_0.h" #include "kfd_smi_events.h" #include "kfd_debug.h" +#include "amdgpu_ras_mgr.h" /* * GFX12.1 SQ Interrupts @@ -185,6 +186,7 @@ static void event_interrupt_poison_consumption_v12_1(struct kfd_node *node, enum amdgpu_ras_block block = 0; int ret = -EINVAL; uint32_t reset = 0; + u64 event_id = RAS_EVENT_INVALID_ID; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); if (!p) @@ -220,7 +222,15 @@ static void event_interrupt_poison_consumption_v12_1(struct kfd_node *node, * resetting queue passes, do page retirement without gpu reset * resetting queue fails, fallback to gpu reset solution */ - amdgpu_amdkfd_ras_poison_consumption_handler(node->adev, block, reset); + if (amdgpu_uniras_enabled(node->adev)) + event_id = amdgpu_ras_mgr_gen_ras_event_seqno(node->adev, + RAS_SEQNO_TYPE_POISON_CONSUMPTION); + + RAS_EVENT_LOG(node->adev, event_id, + "poison is consumed by source %d, kick off gpu reset flow\n", source_id); + + amdgpu_amdkfd_ras_pasid_poison_consumption_handler(node->adev, + block, pasid, NULL, NULL, reset); } static bool event_interrupt_isr_v12_1(struct kfd_node *node, -- 2.53.0
