GFX RAS error counters contain dirty (stale) values after a cold reboot.
A read operation is needed to reset them to 0.

Signed-off-by: Hawking Zhang <hawking.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 26 ++++++++++---------------
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h   |  2 ++
 4 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index ca17ffb01301..d3d970282df4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -204,6 +204,7 @@ struct amdgpu_gfx_funcs {
                                 u32 queue, u32 vmid);
        int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
        int (*query_ras_error_count) (struct amdgpu_device *adev, void 
*ras_error_status);
+       void (*reset_ras_error_count) (struct amdgpu_device *adev);
 };
 
 struct sq_work {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index a0f8cd9c0874..b746f26f933c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -738,9 +738,9 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring 
*ring);
 static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
 static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
                                          void *ras_error_status);
-static void gfx_v9_0_clear_ras_edc_counter(struct amdgpu_device *adev);
 static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
                                     void *inject_if);
+static void gfx_v9_0_reset_ras_error_count(struct amdgpu_device *adev);
 
 static void gfx_v9_0_kiq_set_resources(struct amdgpu_ring *kiq_ring,
                                uint64_t queue_mask)
@@ -1985,7 +1985,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = 
{
        .read_wave_vgprs = &gfx_v9_0_read_wave_vgprs,
        .select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,
        .ras_error_inject = &gfx_v9_0_ras_error_inject,
-       .query_ras_error_count = &gfx_v9_0_query_ras_error_count
+       .query_ras_error_count = &gfx_v9_0_query_ras_error_count,
+       .reset_ras_error_count = &gfx_v9_0_reset_ras_error_count,
 };
 
 static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = {
@@ -1996,7 +1997,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = 
{
        .read_wave_vgprs = &gfx_v9_0_read_wave_vgprs,
        .select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,
        .ras_error_inject = &gfx_v9_4_ras_error_inject,
-       .query_ras_error_count = &gfx_v9_4_query_ras_error_count
+       .query_ras_error_count = &gfx_v9_4_query_ras_error_count,
+       .reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
 };
 
 static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
@@ -4348,18 +4350,6 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct 
amdgpu_device *adev)
                goto fail;
        }
 
-       switch (adev->asic_type)
-       {
-       case CHIP_VEGA20:
-               gfx_v9_0_clear_ras_edc_counter(adev);
-               break;
-       case CHIP_ARCTURUS:
-               gfx_v9_4_clear_ras_edc_counter(adev);
-               break;
-       default:
-               break;
-       }
-
 fail:
        amdgpu_ib_free(adev, &ib, NULL);
        dma_fence_put(f);
@@ -4402,6 +4392,10 @@ static int gfx_v9_0_ecc_late_init(void *handle)
        if (r)
                return r;
 
+       if (adev->gfx.funcs &&
+           adev->gfx.funcs->reset_ras_error_count)
+               adev->gfx.funcs->reset_ras_error_count(adev);
+
        r = amdgpu_gfx_ras_late_init(adev);
        if (r)
                return r;
@@ -6331,7 +6325,7 @@ static int gfx_v9_0_ras_error_count(const struct 
soc15_reg_entry *reg,
        return 0;
 }
 
-static void gfx_v9_0_clear_ras_edc_counter(struct amdgpu_device *adev)
+static void gfx_v9_0_reset_ras_error_count(struct amdgpu_device *adev)
 {
        int i, j, k;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
index f099f13d7f1e..17f1e7b69a60 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
@@ -893,7 +893,7 @@ int gfx_v9_4_query_ras_error_count(struct amdgpu_device 
*adev,
        return 0;
 }
 
-void gfx_v9_4_clear_ras_edc_counter(struct amdgpu_device *adev)
+void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev)
 {
        int i, j, k;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
index 2e3f6f755ad4..1ffecc5c0f0a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
@@ -32,4 +32,6 @@ int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
 int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
                                     void *inject_if);
 
+void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev);
+
 #endif /* __GFX_V9_4_H__ */
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to