cp_ecc_error_irq is acquired in amdgpu_gfx_ras_late_init() but released in gfx_v9_0_hw_fini(). Because the get is skipped on SR-IOV VF, the put site has to query amdgpu_irq_enabled() before releasing the IRQ.
ras_late_init / ras_fini have no suspend counterpart, so move the put to amdgpu_gfx_ras_suspend() / amdgpu_gfx_ras_fini() and add a matching ras_suspend callback that is invoked from amdgpu_ras_suspend() before disable_all_features(). The get and put now sit in the same place and check the same condition (not VF, funcs registered), no refcount querying needed. An active flag gates ras_fini so the suspend-then-unload-without-resume path falls into amdgpu_ras_block_late_fini_default() instead of double-releasing what ras_suspend already cleaned up. Drop the cp_ecc_error_irq put from gfx_v9_0_hw_fini(). gfx_v8_0 manages cp_ecc_error_irq locally and is unaffected; no other GFX generation has this IRQ. Signed-off-by: Yunxiang Li <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 26 ++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 32 +++++++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 -- 5 files changed, 53 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 515cc4a2aeb4d..1e190fb54a977 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -983,38 +983,50 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r if (!amdgpu_persistent_edc_harvesting_supported(adev)) { r = amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX); if (r) return r; } r = amdgpu_ras_block_late_init(adev, ras_block); if (r) return r; - if (amdgpu_sriov_vf(adev)) - return r; - - if (adev->gfx.cp_ecc_error_irq.funcs) { + if (!amdgpu_sriov_vf(adev) && adev->gfx.cp_ecc_error_irq.funcs) { r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0); if (r) goto late_fini; } } else { amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0); } return 0; late_fini: amdgpu_ras_block_late_fini(adev, ras_block); return 
r; } +void amdgpu_gfx_ras_suspend(struct amdgpu_device *adev, + struct ras_common_if *ras_block) +{ + if (!amdgpu_sriov_vf(adev) && adev->gfx.cp_ecc_error_irq.funcs) + amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0); +} + +void amdgpu_gfx_ras_fini(struct amdgpu_device *adev, + struct ras_common_if *ras_block) +{ + if (!amdgpu_sriov_vf(adev) && adev->gfx.cp_ecc_error_irq.funcs) + amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0); + amdgpu_ras_block_late_fini(adev, ras_block); +} + int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev) { int err = 0; struct amdgpu_gfx_ras *ras = NULL; /* adev->gfx.ras is NULL, which means gfx does not * support ras function, then do nothing here. */ if (!adev->gfx.ras) return 0; @@ -1029,20 +1041,26 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev) strcpy(ras->ras_block.ras_comm.name, "gfx"); ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__GFX; ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; adev->gfx.ras_if = &ras->ras_block.ras_comm; /* If not define special ras_late_init function, use gfx default ras_late_init */ if (!ras->ras_block.ras_late_init) ras->ras_block.ras_late_init = amdgpu_gfx_ras_late_init; + if (!ras->ras_block.ras_suspend) + ras->ras_block.ras_suspend = amdgpu_gfx_ras_suspend; + + if (!ras->ras_block.ras_fini) + ras->ras_block.ras_fini = amdgpu_gfx_ras_fini; + /* If not defined special ras_cb function, use default ras_cb */ if (!ras->ras_block.ras_cb) ras->ras_block.ras_cb = amdgpu_gfx_process_ras_data_cb; return 0; } int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 77050f9884f20..54c1eb9c499ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -608,21 +608,22 @@ bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev, int xcc_id, bool 
amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev, struct amdgpu_ring *ring); bool amdgpu_gfx_is_high_priority_graphics_queue(struct amdgpu_device *adev, struct amdgpu_ring *ring); bool amdgpu_gfx_is_me_queue_enabled(struct amdgpu_device *adev, int me, int pipe, int queue); void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable); void amdgpu_gfx_off_ctrl_immediate(struct amdgpu_device *adev, bool enable); int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value); int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -void amdgpu_gfx_ras_fini(struct amdgpu_device *adev); +void amdgpu_gfx_ras_suspend(struct amdgpu_device *adev, struct ras_common_if *ras_block); +void amdgpu_gfx_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block); int amdgpu_get_gfx_off_entrycount(struct amdgpu_device *adev, u64 *value); int amdgpu_get_gfx_off_residency(struct amdgpu_device *adev, u32 *residency); int amdgpu_set_gfx_off_residency(struct amdgpu_device *adev, bool value); int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev, void *err_data, struct amdgpu_iv_entry *entry); int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry); uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg, uint32_t xcc_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c38af6d3599ed..bc91a5d0b9075 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -88,20 +88,23 @@ const char *ras_mca_block_string[] = { "mca_mp1", "mca_mpio", "mca_iohc", }; struct amdgpu_ras_block_list { /* ras block link */ struct list_head node; struct amdgpu_ras_block_object *ras_obj; + + /* set by ras_late_init, cleared by ras_suspend/ras_fini */ + bool active; }; const char *get_ras_block_str(struct ras_common_if *ras_block) { if (!ras_block) return 
"NULL"; if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT || ras_block->block >= ARRAY_SIZE(ras_block_string)) return "OUT OF RANGE"; @@ -4607,24 +4610,37 @@ void amdgpu_ras_resume(struct amdgpu_device *adev) /* there should be no any reference. */ WARN_ON(alive_obj(obj)); } } } } void amdgpu_ras_suspend(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct amdgpu_ras_block_list *node; + struct amdgpu_ras_block_object *obj; if (!adev->ras_enabled || !con) return; + /* run per-block ras_suspend before tearing down the RAS context */ + list_for_each_entry(node, &adev->ras_list, node) { + if (!node->active) + continue; + + obj = node->ras_obj; + if (obj && obj->ras_suspend) + obj->ras_suspend(adev, &obj->ras_comm); + node->active = false; + } + amdgpu_ras_disable_all_features(adev, 0); /* Make sure all ras objects are disabled. */ if (AMDGPU_RAS_GET_FEATURES(con->features)) amdgpu_ras_disable_all_features(adev, 1); } int amdgpu_ras_late_init(struct amdgpu_device *adev) { struct amdgpu_ras_block_list *node, *tmp; struct amdgpu_ras_block_object *obj; @@ -4664,22 +4680,29 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block)) continue; if (obj->ras_late_init) { r = obj->ras_late_init(adev, &obj->ras_comm); if (r) { dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n", obj->ras_comm.name, r); return r; } - } else - amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); + } else { + r = amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); + if (r) { + dev_err(adev->dev, "%s failed to execute ras_block_late_init_default! 
ret:%d\n", + obj->ras_comm.name, r); + return r; + } + } + node->active = true; } amdgpu_ras_check_bad_page_status(adev); return 0; } /* do some fini work before IP fini as dependence */ int amdgpu_ras_pre_fini(struct amdgpu_device *adev) { @@ -4704,25 +4727,26 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) if (!adev->ras_enabled || !con) return 0; amdgpu_ras_critical_region_fini(adev); mutex_destroy(&con->critical_region_lock); list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { if (ras_node->ras_obj) { obj = ras_node->ras_obj; - if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) && - obj->ras_fini) + /* fall back to default cleanup if ras_suspend already ran */ + if (ras_node->active && obj->ras_fini) obj->ras_fini(adev, &obj->ras_comm); else amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); + ras_node->active = false; } /* Clear ras blocks from ras_list and free ras block list node */ list_del(&ras_node->node); kfree(ras_node); } amdgpu_ras_fs_fini(adev); amdgpu_ras_interrupt_remove_all(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index ff44190d7d98e..a86ab65aa2f07 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -755,20 +755,21 @@ struct ras_debug_if { }; int op; }; struct amdgpu_ras_block_object { struct ras_common_if ras_comm; int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index); int (*ras_late_init)(struct amdgpu_device *adev, struct ras_common_if *ras_block); + void (*ras_suspend)(struct amdgpu_device *adev, struct ras_common_if *ras_block); void (*ras_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block); ras_ih_cb ras_cb; const struct amdgpu_ras_block_hw_ops *hw_ops; }; struct amdgpu_ras_block_hw_ops { int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if, uint32_t instance_mask); void (*query_ras_error_count)(struct 
amdgpu_device *adev, void *ras_error_status); void (*query_ras_error_status)(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index e5a3735d98342..bec0720f70552 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4050,22 +4050,20 @@ static int gfx_v9_0_hw_init(struct amdgpu_ip_block *ip_block) !amdgpu_sriov_vf(adev)) gfx_v9_4_2_set_power_brake_sequence(adev); return r; } static int gfx_v9_0_hw_fini(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; - if (amdgpu_irq_enabled(adev, &adev->gfx.cp_ecc_error_irq, 0)) - amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0); if (amdgpu_irq_enabled(adev, &adev->gfx.priv_reg_irq, 0)) amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); if (amdgpu_irq_enabled(adev, &adev->gfx.priv_inst_irq, 0)) amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); if (amdgpu_irq_enabled(adev, &adev->gfx.bad_op_irq, 0)) amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); /* DF freeze and kcq disable will fail */ if (!amdgpu_ras_intr_triggered()) /* disable KCQ to avoid CPC touch memory not valid anymore */ base-commit: e08f39913bfa3e8c6a61fc30ee3870d7d156c19f prerequisite-patch-id: 61de2a725d655efda7db2b8f3e3c2f80f22c7e3e -- 2.51.2
