cp_ecc_error_irq is acquired in amdgpu_gfx_ras_late_init() but
released in gfx_v9_0_hw_fini(), so the put site has to query
amdgpu_irq_enabled() because the get is skipped on SR-IOV VF.

ras_late_init / ras_fini have no suspend counterpart, so add a
ras_suspend callback, invoked from amdgpu_ras_suspend() before
amdgpu_ras_disable_all_features(), and move the put into
amdgpu_gfx_ras_suspend() / amdgpu_gfx_ras_fini().  The get and put
now live in the same file and check the same condition (not VF,
funcs registered), so no refcount querying is needed.

A per-block active flag (set by ras_late_init, cleared by
ras_suspend / ras_fini) gates ras_fini, so that the
suspend-then-unload-without-resume path falls back to
amdgpu_ras_block_late_fini_default() instead of double-releasing
what ras_suspend already cleaned up.

Drop the cp_ecc_error_irq put from gfx_v9_0_hw_fini().  gfx_v8_0
manages cp_ecc_error_irq locally and is unaffected; no other GFX
generation has this IRQ.

Signed-off-by: Yunxiang Li <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 26 ++++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 32 +++++++++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  2 --
 5 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 515cc4a2aeb4d..1e190fb54a977 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -983,38 +983,50 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, 
struct ras_common_if *r
                if (!amdgpu_persistent_edc_harvesting_supported(adev)) {
                        r = amdgpu_ras_reset_error_status(adev, 
AMDGPU_RAS_BLOCK__GFX);
                        if (r)
                                return r;
                }
 
                r = amdgpu_ras_block_late_init(adev, ras_block);
                if (r)
                        return r;
 
-               if (amdgpu_sriov_vf(adev))
-                       return r;
-
-               if (adev->gfx.cp_ecc_error_irq.funcs) {
+               if (!amdgpu_sriov_vf(adev) && adev->gfx.cp_ecc_error_irq.funcs) 
{
                        r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 
0);
                        if (r)
                                goto late_fini;
                }
        } else {
                amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
        }
 
        return 0;
 late_fini:
        amdgpu_ras_block_late_fini(adev, ras_block);
        return r;
 }
 
+void amdgpu_gfx_ras_suspend(struct amdgpu_device *adev,
+                           struct ras_common_if *ras_block)
+{
+       if (!amdgpu_sriov_vf(adev) && adev->gfx.cp_ecc_error_irq.funcs)
+               amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
+}
+
+void amdgpu_gfx_ras_fini(struct amdgpu_device *adev,
+                        struct ras_common_if *ras_block)
+{
+       if (!amdgpu_sriov_vf(adev) && adev->gfx.cp_ecc_error_irq.funcs)
+               amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
+       amdgpu_ras_block_late_fini(adev, ras_block);
+}
+
 int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
 {
        int err = 0;
        struct amdgpu_gfx_ras *ras = NULL;
 
        /* adev->gfx.ras is NULL, which means gfx does not
         * support ras function, then do nothing here.
         */
        if (!adev->gfx.ras)
                return 0;
@@ -1029,20 +1041,26 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
 
        strcpy(ras->ras_block.ras_comm.name, "gfx");
        ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__GFX;
        ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
        adev->gfx.ras_if = &ras->ras_block.ras_comm;
 
        /* If not define special ras_late_init function, use gfx default 
ras_late_init */
        if (!ras->ras_block.ras_late_init)
                ras->ras_block.ras_late_init = amdgpu_gfx_ras_late_init;
 
+       if (!ras->ras_block.ras_suspend)
+               ras->ras_block.ras_suspend = amdgpu_gfx_ras_suspend;
+
+       if (!ras->ras_block.ras_fini)
+               ras->ras_block.ras_fini = amdgpu_gfx_ras_fini;
+
        /* If not defined special ras_cb function, use default ras_cb */
        if (!ras->ras_block.ras_cb)
                ras->ras_block.ras_cb = amdgpu_gfx_process_ras_data_cb;
 
        return 0;
 }
 
 int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
                                                struct amdgpu_iv_entry *entry)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 77050f9884f20..54c1eb9c499ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -608,21 +608,22 @@ bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device 
*adev, int xcc_id,
 bool amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev,
                                               struct amdgpu_ring *ring);
 bool amdgpu_gfx_is_high_priority_graphics_queue(struct amdgpu_device *adev,
                                                struct amdgpu_ring *ring);
 bool amdgpu_gfx_is_me_queue_enabled(struct amdgpu_device *adev, int me,
                                    int pipe, int queue);
 void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable);
 void amdgpu_gfx_off_ctrl_immediate(struct amdgpu_device *adev, bool enable);
 int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value);
 int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
-void amdgpu_gfx_ras_fini(struct amdgpu_device *adev);
+void amdgpu_gfx_ras_suspend(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
+void amdgpu_gfx_ras_fini(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
 int amdgpu_get_gfx_off_entrycount(struct amdgpu_device *adev, u64 *value);
 int amdgpu_get_gfx_off_residency(struct amdgpu_device *adev, u32 *residency);
 int amdgpu_set_gfx_off_residency(struct amdgpu_device *adev, bool value);
 int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
                void *err_data,
                struct amdgpu_iv_entry *entry);
 int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
                                  struct amdgpu_irq_src *source,
                                  struct amdgpu_iv_entry *entry);
 uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg, uint32_t 
xcc_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c38af6d3599ed..bc91a5d0b9075 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -88,20 +88,23 @@ const char *ras_mca_block_string[] = {
        "mca_mp1",
        "mca_mpio",
        "mca_iohc",
 };
 
 struct amdgpu_ras_block_list {
        /* ras block link */
        struct list_head node;
 
        struct amdgpu_ras_block_object *ras_obj;
+
+       /* set by ras_late_init, cleared by ras_suspend/ras_fini */
+       bool active;
 };
 
 const char *get_ras_block_str(struct ras_common_if *ras_block)
 {
        if (!ras_block)
                return "NULL";
 
        if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT ||
            ras_block->block >= ARRAY_SIZE(ras_block_string))
                return "OUT OF RANGE";
@@ -4607,24 +4610,37 @@ void amdgpu_ras_resume(struct amdgpu_device *adev)
                                /* there should be no any reference. */
                                WARN_ON(alive_obj(obj));
                        }
                }
        }
 }
 
 void amdgpu_ras_suspend(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct amdgpu_ras_block_list *node;
+       struct amdgpu_ras_block_object *obj;
 
        if (!adev->ras_enabled || !con)
                return;
 
+       /* run per-block ras_suspend before tearing down the RAS context */
+       list_for_each_entry(node, &adev->ras_list, node) {
+               if (!node->active)
+                       continue;
+
+               obj = node->ras_obj;
+               if (obj && obj->ras_suspend)
+                       obj->ras_suspend(adev, &obj->ras_comm);
+               node->active = false;
+       }
+
        amdgpu_ras_disable_all_features(adev, 0);
        /* Make sure all ras objects are disabled. */
        if (AMDGPU_RAS_GET_FEATURES(con->features))
                amdgpu_ras_disable_all_features(adev, 1);
 }
 
 int amdgpu_ras_late_init(struct amdgpu_device *adev)
 {
        struct amdgpu_ras_block_list *node, *tmp;
        struct amdgpu_ras_block_object *obj;
@@ -4664,22 +4680,29 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
                if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block))
                        continue;
 
                if (obj->ras_late_init) {
                        r = obj->ras_late_init(adev, &obj->ras_comm);
                        if (r) {
                                dev_err(adev->dev, "%s failed to execute 
ras_late_init! ret:%d\n",
                                        obj->ras_comm.name, r);
                                return r;
                        }
-               } else
-                       amdgpu_ras_block_late_init_default(adev, 
&obj->ras_comm);
+               } else {
+                       r = amdgpu_ras_block_late_init_default(adev, 
&obj->ras_comm);
+                       if (r) {
+                               dev_err(adev->dev, "%s failed to execute 
ras_block_late_init_default! ret:%d\n",
+                                       obj->ras_comm.name, r);
+                               return r;
+                       }
+               }
+               node->active = true;
        }
 
        amdgpu_ras_check_bad_page_status(adev);
 
        return 0;
 }
 
 /* do some fini work before IP fini as dependence */
 int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
 {
@@ -4704,25 +4727,26 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 
        if (!adev->ras_enabled || !con)
                return 0;
 
        amdgpu_ras_critical_region_fini(adev);
        mutex_destroy(&con->critical_region_lock);
 
        list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
                if (ras_node->ras_obj) {
                        obj = ras_node->ras_obj;
-                       if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) 
&&
-                           obj->ras_fini)
+                       /* fall back to default cleanup if ras_suspend already 
ran */
+                       if (ras_node->active && obj->ras_fini)
                                obj->ras_fini(adev, &obj->ras_comm);
                        else
                                amdgpu_ras_block_late_fini_default(adev, 
&obj->ras_comm);
+                       ras_node->active = false;
                }
 
                /* Clear ras blocks from ras_list and free ras block list node 
*/
                list_del(&ras_node->node);
                kfree(ras_node);
        }
 
        amdgpu_ras_fs_fini(adev);
        amdgpu_ras_interrupt_remove_all(adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index ff44190d7d98e..a86ab65aa2f07 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -755,20 +755,21 @@ struct ras_debug_if {
        };
        int op;
 };
 
 struct amdgpu_ras_block_object {
        struct ras_common_if  ras_comm;
 
        int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj,
                                enum amdgpu_ras_block block, uint32_t 
sub_block_index);
        int (*ras_late_init)(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
+       void (*ras_suspend)(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
        void (*ras_fini)(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
        ras_ih_cb ras_cb;
        const struct amdgpu_ras_block_hw_ops *hw_ops;
 };
 
 struct amdgpu_ras_block_hw_ops {
        int  (*ras_error_inject)(struct amdgpu_device *adev,
                        void *inject_if, uint32_t instance_mask);
        void (*query_ras_error_count)(struct amdgpu_device *adev, void 
*ras_error_status);
        void (*query_ras_error_status)(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index e5a3735d98342..bec0720f70552 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4050,22 +4050,20 @@ static int gfx_v9_0_hw_init(struct amdgpu_ip_block 
*ip_block)
            !amdgpu_sriov_vf(adev))
                gfx_v9_4_2_set_power_brake_sequence(adev);
 
        return r;
 }
 
 static int gfx_v9_0_hw_fini(struct amdgpu_ip_block *ip_block)
 {
        struct amdgpu_device *adev = ip_block->adev;
 
-       if (amdgpu_irq_enabled(adev, &adev->gfx.cp_ecc_error_irq, 0))
-               amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
        if (amdgpu_irq_enabled(adev, &adev->gfx.priv_reg_irq, 0))
                amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
        if (amdgpu_irq_enabled(adev, &adev->gfx.priv_inst_irq, 0))
                amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
        if (amdgpu_irq_enabled(adev, &adev->gfx.bad_op_irq, 0))
                amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0);
 
        /* DF freeze and kcq disable will fail */
        if (!amdgpu_ras_intr_triggered())
                /* disable KCQ to avoid CPC touch memory not valid anymore */

base-commit: e08f39913bfa3e8c6a61fc30ee3870d7d156c19f
prerequisite-patch-id: 61de2a725d655efda7db2b8f3e3c2f80f22c7e3e
-- 
2.51.2

Reply via email to