amdgpu: update athub interrupt harvesting handle

Zhang, Hawking Mon, 21 Sep 2020 07:40:47 -0700

[AMD Public Use]

Reviewed-by: Hawking Zhang <hawking.zh...@amd.com>


Regards,
Hawking
-----Original Message-----
From: Stanley.Yang <stanley.y...@amd.com> 
Sent: Monday, September 21, 2020 21:48
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <hawking.zh...@amd.com>; Chen, Guchun <guchun.c...@amd.com>; 
Clements, John <john.cleme...@amd.com>; Li, Dennis <dennis...@amd.com>; Zhou1, 
Tao <tao.zh...@amd.com>; Yang, Stanley <stanley.y...@amd.com>
Subject: [PATCH V4 1/1] drm/amdgpu: update athub interrupt harvesting handle

GCEA/MMHUB EA error should not result to DF freeze, this is fixed in next 
generation, but for some reasons the GCEA/MMHUB EA error will result to DF 
freeze in previous generation, diver should avoid to indicate GCEA/MMHUB EA 
error as hw fatal error in kernel message by read GCEA/MMHUB err status 
registers.

Changed from V1:
    make query_ras_error_status function more general
    make read mmhub er status register more friendly

Changed from V2:
    move ras error status query function into do_recovery workqueue

Changed from V3:
    remove useless code from V2, print GCEA error status
    instance number

Signed-off-by: Stanley.Yang <stanley.y...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h       |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h     |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 43 ++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c         | 29 +++++++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h         |  2 +
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c       | 29 +++++++++++++
 .../amd/include/asic_reg/gc/gc_9_4_1_offset.h |  4 +-
 8 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index a611e78dd4ba..258498cbf1eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -217,6 +217,7 @@ struct amdgpu_gfx_funcs {
        int (*query_ras_error_count) (struct amdgpu_device *adev, void 
*ras_error_status);
        void (*reset_ras_error_count) (struct amdgpu_device *adev);
        void (*init_spm_golden)(struct amdgpu_device *adev);
+       void (*query_ras_error_status) (struct amdgpu_device *adev);
 };
 
 struct sq_work {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 0c43d7fe893c..1ae9bdae7311 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -40,6 +40,7 @@ struct amdgpu_mmhub_funcs {
                                uint64_t page_table_base);
        void (*update_power_gating)(struct amdgpu_device *adev,
                                 bool enable);
+       void (*query_ras_error_status)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_mmhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e5ea14774c0c..40614ac9a111 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1498,6 +1498,45 @@ static void amdgpu_ras_log_on_err_counter(struct 
amdgpu_device *adev)
        }
 }
 
+/* Parse RdRspStatus and WrRspStatus */ void 
+amdgpu_ras_error_status_query(struct amdgpu_device *adev,
+               struct ras_query_if *info)
+{
+       /*
+        * Only two block need to query read/write
+        * RspStatus at current state
+        */
+       switch (info->head.block) {
+       case AMDGPU_RAS_BLOCK__GFX:
+               if (adev->gfx.funcs->query_ras_error_status)
+                       adev->gfx.funcs->query_ras_error_status(adev);
+               break;
+       case AMDGPU_RAS_BLOCK__MMHUB:
+               if (adev->mmhub.funcs->query_ras_error_status)
+                       adev->mmhub.funcs->query_ras_error_status(adev);
+               break;
+       default:
+               break;
+       }
+}
+
+static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) {
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct ras_manager *obj;
+
+       if (!con)
+               return;
+
+       list_for_each_entry(obj, &con->head, node) {
+               struct ras_query_if info = {
+                       .head = obj->head,
+               };
+
+               amdgpu_ras_error_status_query(adev, &info);
+       }
+}
+
 /* recovery begin */
 
 /* return 0 on success.
@@ -1568,8 +1607,10 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
                }
 
                list_for_each_entry(remote_adev,
-                               device_list_handle, gmc.xgmi.head)
+                               device_list_handle, gmc.xgmi.head) {
+                       amdgpu_ras_query_err_status(remote_adev);
                        amdgpu_ras_log_on_err_counter(remote_adev);
+               }
 
                amdgpu_put_xgmi_hive(hive);
        }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index d898c9ff3526..adee0177654e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2075,6 +2075,7 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = 
{
        .ras_error_inject = &gfx_v9_4_ras_error_inject,
        .query_ras_error_count = &gfx_v9_4_query_ras_error_count,
        .reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
+       .query_ras_error_status = &gfx_v9_4_query_ras_error_status,
 };
 
 static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev) diff --git 
a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
index bd85aed3523a..bc699d680ce8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
@@ -992,3 +992,32 @@ int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev, 
void *inject_if)
 
        return ret;
 }
+
+static const struct soc15_reg_entry gfx_v9_4_rdrsp_status_regs =
+       { SOC15_REG_ENTRY(GC, 0, mmGCEA_ERR_STATUS), 0, 1, 32 };
+
+void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev) {
+       uint32_t i, j;
+       uint32_t reg_value;
+
+       if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
+               return;
+
+       mutex_lock(&adev->grbm_idx_mutex);
+
+       for (i = 0; i < gfx_v9_4_rdrsp_status_regs.se_num; i++) {
+               for (j = 0; j < gfx_v9_4_rdrsp_status_regs.instance;
+                    j++) {
+                       gfx_v9_4_select_se_sh(adev, i, 0, j);
+                       reg_value = RREG32(SOC15_REG_ENTRY_OFFSET(
+                               gfx_v9_4_rdrsp_status_regs));
+                       if (reg_value)
+                               dev_warn(adev->dev, "GCEA err detected at 
instance: %d, status: 0x%x!\n",
+                                               j, reg_value);
+               }
+       }
+
+       gfx_v9_4_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+       mutex_unlock(&adev->grbm_idx_mutex);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
index 1ffecc5c0f0a..875f18473a98 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
@@ -34,4 +34,6 @@ int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
 
 void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev);
 
+void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev);
+
 #endif /* __GFX_V9_4_H__ */
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
index 6c6ad529c65c..c2ef8142136e 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
@@ -1624,6 +1624,34 @@ static void mmhub_v9_4_reset_ras_error_count(struct 
amdgpu_device *adev)
        }
 }
 
+static const struct soc15_reg_entry mmhub_v9_4_err_status_regs[] = {
+       { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_ERR_STATUS), 0, 0, 0 },
+       { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_ERR_STATUS), 0, 0, 0 },
+       { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_ERR_STATUS), 0, 0, 0 },
+       { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA3_ERR_STATUS), 0, 0, 0 },
+       { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA4_ERR_STATUS), 0, 0, 0 },
+       { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA5_ERR_STATUS), 0, 0, 0 },
+       { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA6_ERR_STATUS), 0, 0, 0 },
+       { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA7_ERR_STATUS), 0, 0, 0 }, };
+
+static void mmhub_v9_4_query_ras_error_status(struct amdgpu_device 
+*adev) {
+       int i;
+       uint32_t reg_value;
+
+       if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB))
+               return;
+
+       for (i = 0; i < ARRAY_SIZE(mmhub_v9_4_err_status_regs); i++) {
+               reg_value =
+                       
RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v9_4_err_status_regs[i]));
+               if (reg_value)
+                       dev_warn(adev->dev, "MMHUB EA err detected at instance: 
%d, status: 0x%x!\n",
+                                       i, reg_value);
+       }
+}
+
 const struct amdgpu_mmhub_funcs mmhub_v9_4_funcs = {
        .ras_late_init = amdgpu_mmhub_ras_late_init,
        .query_ras_error_count = mmhub_v9_4_query_ras_error_count, @@ -1636,4 
+1664,5 @@ const struct amdgpu_mmhub_funcs mmhub_v9_4_funcs = {
        .set_clockgating = mmhub_v9_4_set_clockgating,
        .get_clockgating = mmhub_v9_4_get_clockgating,
        .setup_vm_pt_regs = mmhub_v9_4_setup_vm_pt_regs,
+       .query_ras_error_status = mmhub_v9_4_query_ras_error_status,
 };
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h 
b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h
index f41556abfbbc..629a8a3b55e9 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h
@@ -205,6 +205,8 @@
 #define mmGCEA_EDC_CNT2_BASE_IDX                                               
                        0
 #define mmGCEA_EDC_CNT3                                                        
                        0x071b
 #define mmGCEA_EDC_CNT3_BASE_IDX                                               
                        0
+#define mmGCEA_ERR_STATUS                                                      
                        0x0712
+#define mmGCEA_ERR_STATUS_BASE_IDX                                             
                        0
 
 // addressBlock: gc_gfxudec
 // base address: 0x30000
@@ -261,4 +263,4 @@
 #define mmRLC_EDC_CNT2                                                         
                        0x4d41
 #define mmRLC_EDC_CNT2_BASE_IDX                                                
                        1
 
-#endif
\ No newline at end of file
+#endif
--
2.17.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH V4 1/1] drm/amdgpu: update athub interrupt harvesting handle

Reply via email to