RE: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP

2020-03-02 Thread Chen, Guchun
[AMD Public Use]

Series is:
Reviewed-by: Guchun Chen 

Regards,
Guchun

-----Original Message-----
From: Hawking Zhang  
Sent: Monday, March 2, 2020 6:34 PM
To: amd-gfx@lists.freedesktop.org; Clements, John ; Li, 
Dennis ; Chen, Guchun ; Zhou1, Tao 
; Deucher, Alexander 
Cc: Zhang, Hawking 
Subject: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP

HDP ras error counters are dirty ones after cold reboot
Read operation is needed to reset them to 0

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  1 -
 drivers/gpu/drm/amd/amdgpu/soc15.c    | 14 ++++++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a58b0cf9da51..b735e20888a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -595,6 +595,7 @@ struct amdgpu_asic_funcs {
/* invalidate hdp read cache */
void (*invalidate_hdp)(struct amdgpu_device *adev,
   struct amdgpu_ring *ring);
+   void (*reset_hdp_ras_error_count)(struct amdgpu_device *adev);
/* check if the asic needs a full reset of if soft reset will work */
bool (*need_full_reset)(struct amdgpu_device *adev);
/* initialize doorbell layout for specific asic*/
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index b746f26f933c..efd52bcf8785 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4149,7 +4149,6 @@ static const struct soc15_reg_entry 
gfx_v9_0_edc_counter_regs[] = {
{ SOC15_REG_ENTRY(GC, 0, mmTCC_EDC_CNT2), 0, 1, 16},
{ SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2},
{ SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6},
-   { SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1},
 };
 
 static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 4aa5b9c8e43b..6b717691d554 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -827,6 +827,15 @@ static bool soc15_need_full_reset(struct amdgpu_device 
*adev)
/* change this when we implement soft reset */
return true;
 }
+
+static void vega20_reset_hdp_ras_error_count(struct amdgpu_device *adev)
+{
+   if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP))
+   return;
+   /*read back hdp ras counter to reset it to 0 */
+   RREG32_SOC15(HDP, 0, mmHDP_EDC_CNT);
+}
+
 static void soc15_get_pcie_usage(struct amdgpu_device *adev, uint64_t *count0,
 uint64_t *count1)
 {
@@ -994,6 +1003,7 @@ static const struct amdgpu_asic_funcs vega20_asic_funcs =
.get_config_memsize = _get_config_memsize,
.flush_hdp = _flush_hdp,
.invalidate_hdp = _invalidate_hdp,
+   .reset_hdp_ras_error_count = _reset_hdp_ras_error_count,
.need_full_reset = _need_full_reset,
.init_doorbell_index = _doorbell_index_init,
.get_pcie_usage = _get_pcie_usage,
@@ -1239,6 +1249,10 @@ static int soc15_common_late_init(void *handle)
if (amdgpu_sriov_vf(adev))
xgpu_ai_mailbox_get_irq(adev);
 
+   if (adev->asic_funcs &&
+   adev->asic_funcs->reset_hdp_ras_error_count)
+   adev->asic_funcs->reset_hdp_ras_error_count(adev);
+
if (adev->nbio.funcs->ras_late_init)
r = adev->nbio.funcs->ras_late_init(adev);
 
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP

2020-03-02 Thread Zhou1, Tao
[AMD Official Use Only - Internal Distribution Only]

The series is:

Reviewed-by: Tao Zhou 

> -----Original Message-----
> From: Hawking Zhang 
> Sent: 2020年3月2日 18:34
> To: amd-gfx@lists.freedesktop.org; Clements, John
> ; Li, Dennis ; Chen,
> Guchun ; Zhou1, Tao ;
> Deucher, Alexander 
> Cc: Zhang, Hawking 
> Subject: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for
> HDP
> 
> HDP ras error counters are dirty ones after cold reboot
> Read operation is needed to reset them to 0
> 
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  1 -
>  drivers/gpu/drm/amd/amdgpu/soc15.c    | 14 ++++++++++++++
>  3 files changed, 15 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index a58b0cf9da51..b735e20888a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -595,6 +595,7 @@ struct amdgpu_asic_funcs {
>   /* invalidate hdp read cache */
>   void (*invalidate_hdp)(struct amdgpu_device *adev,
>  struct amdgpu_ring *ring);
> + void (*reset_hdp_ras_error_count)(struct amdgpu_device *adev);
>   /* check if the asic needs a full reset of if soft reset will work */
>   bool (*need_full_reset)(struct amdgpu_device *adev);
>   /* initialize doorbell layout for specific asic*/
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index b746f26f933c..efd52bcf8785 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4149,7 +4149,6 @@ static const struct soc15_reg_entry
> gfx_v9_0_edc_counter_regs[] = {
> { SOC15_REG_ENTRY(GC, 0, mmTCC_EDC_CNT2), 0, 1, 16},
> { SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2},
> { SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6},
> -   { SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1},
>  };
> 
>  static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c
> b/drivers/gpu/drm/amd/amdgpu/soc15.c
> index 4aa5b9c8e43b..6b717691d554 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
> @@ -827,6 +827,15 @@ static bool soc15_need_full_reset(struct
> amdgpu_device *adev)
>   /* change this when we implement soft reset */
>   return true;
>  }
> +
> +static void vega20_reset_hdp_ras_error_count(struct amdgpu_device *adev)
> +{
> + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP))
> + return;
> + /*read back hdp ras counter to reset it to 0 */
> + RREG32_SOC15(HDP, 0, mmHDP_EDC_CNT);
> +}
> +
>  static void soc15_get_pcie_usage(struct amdgpu_device *adev, uint64_t
> *count0,
>uint64_t *count1)
>  {
> @@ -994,6 +1003,7 @@ static const struct amdgpu_asic_funcs
> vega20_asic_funcs =
>   .get_config_memsize = _get_config_memsize,
>   .flush_hdp = _flush_hdp,
>   .invalidate_hdp = _invalidate_hdp,
> + .reset_hdp_ras_error_count = _reset_hdp_ras_error_count,
>   .need_full_reset = _need_full_reset,
>   .init_doorbell_index = _doorbell_index_init,
>   .get_pcie_usage = _get_pcie_usage,
> @@ -1239,6 +1249,10 @@ static int soc15_common_late_init(void *handle)
>   if (amdgpu_sriov_vf(adev))
>   xgpu_ai_mailbox_get_irq(adev);
> 
> + if (adev->asic_funcs &&
> + adev->asic_funcs->reset_hdp_ras_error_count)
> + adev->asic_funcs->reset_hdp_ras_error_count(adev);
> +
>   if (adev->nbio.funcs->ras_late_init)
>   r = adev->nbio.funcs->ras_late_init(adev);
> 
> --
> 2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP

2020-03-02 Thread Deucher, Alexander
[AMD Public Use]

Series is:
Reviewed-by: Alex Deucher 

From: Hawking Zhang 
Sent: Monday, March 2, 2020 5:33 AM
To: amd-gfx@lists.freedesktop.org ; Clements, 
John ; Li, Dennis ; Chen, Guchun 
; Zhou1, Tao ; Deucher, Alexander 

Cc: Zhang, Hawking 
Subject: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP

HDP ras error counters are dirty ones after cold reboot
Read operation is needed to reset them to 0

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  1 -
 drivers/gpu/drm/amd/amdgpu/soc15.c    | 14 ++++++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a58b0cf9da51..b735e20888a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -595,6 +595,7 @@ struct amdgpu_asic_funcs {
 /* invalidate hdp read cache */
 void (*invalidate_hdp)(struct amdgpu_device *adev,
struct amdgpu_ring *ring);
+   void (*reset_hdp_ras_error_count)(struct amdgpu_device *adev);
 /* check if the asic needs a full reset of if soft reset will work */
 bool (*need_full_reset)(struct amdgpu_device *adev);
 /* initialize doorbell layout for specific asic*/
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index b746f26f933c..efd52bcf8785 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4149,7 +4149,6 @@ static const struct soc15_reg_entry 
gfx_v9_0_edc_counter_regs[] = {
{ SOC15_REG_ENTRY(GC, 0, mmTCC_EDC_CNT2), 0, 1, 16},
{ SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2},
{ SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6},
-   { SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1},
 };

 static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 4aa5b9c8e43b..6b717691d554 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -827,6 +827,15 @@ static bool soc15_need_full_reset(struct amdgpu_device 
*adev)
 /* change this when we implement soft reset */
 return true;
 }
+
+static void vega20_reset_hdp_ras_error_count(struct amdgpu_device *adev)
+{
+   if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP))
+   return;
+   /*read back hdp ras counter to reset it to 0 */
+   RREG32_SOC15(HDP, 0, mmHDP_EDC_CNT);
+}
+
 static void soc15_get_pcie_usage(struct amdgpu_device *adev, uint64_t *count0,
  uint64_t *count1)
 {
@@ -994,6 +1003,7 @@ static const struct amdgpu_asic_funcs vega20_asic_funcs =
 .get_config_memsize = _get_config_memsize,
 .flush_hdp = _flush_hdp,
 .invalidate_hdp = _invalidate_hdp,
+   .reset_hdp_ras_error_count = _reset_hdp_ras_error_count,
 .need_full_reset = _need_full_reset,
 .init_doorbell_index = _doorbell_index_init,
 .get_pcie_usage = _get_pcie_usage,
@@ -1239,6 +1249,10 @@ static int soc15_common_late_init(void *handle)
 if (amdgpu_sriov_vf(adev))
 xgpu_ai_mailbox_get_irq(adev);

+   if (adev->asic_funcs &&
+   adev->asic_funcs->reset_hdp_ras_error_count)
+   adev->asic_funcs->reset_hdp_ras_error_count(adev);
+
 if (adev->nbio.funcs->ras_late_init)
 r = adev->nbio.funcs->ras_late_init(adev);

--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP

2020-03-02 Thread Hawking Zhang
HDP ras error counters are dirty ones after cold reboot
Read operation is needed to reset them to 0

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  1 -
 drivers/gpu/drm/amd/amdgpu/soc15.c    | 14 ++++++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a58b0cf9da51..b735e20888a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -595,6 +595,7 @@ struct amdgpu_asic_funcs {
/* invalidate hdp read cache */
void (*invalidate_hdp)(struct amdgpu_device *adev,
   struct amdgpu_ring *ring);
+   void (*reset_hdp_ras_error_count)(struct amdgpu_device *adev);
/* check if the asic needs a full reset of if soft reset will work */
bool (*need_full_reset)(struct amdgpu_device *adev);
/* initialize doorbell layout for specific asic*/
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index b746f26f933c..efd52bcf8785 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4149,7 +4149,6 @@ static const struct soc15_reg_entry 
gfx_v9_0_edc_counter_regs[] = {
{ SOC15_REG_ENTRY(GC, 0, mmTCC_EDC_CNT2), 0, 1, 16},
{ SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2},
{ SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6},
-   { SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1},
 };
 
 static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 4aa5b9c8e43b..6b717691d554 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -827,6 +827,15 @@ static bool soc15_need_full_reset(struct amdgpu_device 
*adev)
/* change this when we implement soft reset */
return true;
 }
+
+static void vega20_reset_hdp_ras_error_count(struct amdgpu_device *adev)
+{
+   if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP))
+   return;
+   /*read back hdp ras counter to reset it to 0 */
+   RREG32_SOC15(HDP, 0, mmHDP_EDC_CNT);
+}
+
 static void soc15_get_pcie_usage(struct amdgpu_device *adev, uint64_t *count0,
 uint64_t *count1)
 {
@@ -994,6 +1003,7 @@ static const struct amdgpu_asic_funcs vega20_asic_funcs =
.get_config_memsize = _get_config_memsize,
.flush_hdp = _flush_hdp,
.invalidate_hdp = _invalidate_hdp,
+   .reset_hdp_ras_error_count = _reset_hdp_ras_error_count,
.need_full_reset = _need_full_reset,
.init_doorbell_index = _doorbell_index_init,
.get_pcie_usage = _get_pcie_usage,
@@ -1239,6 +1249,10 @@ static int soc15_common_late_init(void *handle)
if (amdgpu_sriov_vf(adev))
xgpu_ai_mailbox_get_irq(adev);
 
+   if (adev->asic_funcs &&
+   adev->asic_funcs->reset_hdp_ras_error_count)
+   adev->asic_funcs->reset_hdp_ras_error_count(adev);
+
if (adev->nbio.funcs->ras_late_init)
r = adev->nbio.funcs->ras_late_init(adev);
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx