RE: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP
[AMD Public Use] Series is: Reviewed-by: Guchun Chen Regards, Guchun -----Original Message----- From: Hawking Zhang Sent: Monday, March 2, 2020 6:34 PM To: amd-gfx@lists.freedesktop.org; Clements, John ; Li, Dennis ; Chen, Guchun ; Zhou1, Tao ; Deucher, Alexander Cc: Zhang, Hawking Subject: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP HDP ras error counters are dirty ones after cold reboot. Read operation is needed to reset them to 0. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 1 - drivers/gpu/drm/amd/amdgpu/soc15.c| 14 ++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a58b0cf9da51..b735e20888a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -595,6 +595,7 @@ struct amdgpu_asic_funcs { /* invalidate hdp read cache */ void (*invalidate_hdp)(struct amdgpu_device *adev, struct amdgpu_ring *ring); + void (*reset_hdp_ras_error_count)(struct amdgpu_device *adev); /* check if the asic needs a full reset of if soft reset will work */ bool (*need_full_reset)(struct amdgpu_device *adev); /* initialize doorbell layout for specific asic*/ diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index b746f26f933c..efd52bcf8785 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4149,7 +4149,6 @@ static const struct soc15_reg_entry gfx_v9_0_edc_counter_regs[] = { { SOC15_REG_ENTRY(GC, 0, mmTCC_EDC_CNT2), 0, 1, 16}, { SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2}, { SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6}, - { SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1}, }; static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 4aa5b9c8e43b..6b717691d554 100644 ---
a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -827,6 +827,15 @@ static bool soc15_need_full_reset(struct amdgpu_device *adev) /* change this when we implement soft reset */ return true; } + +static void vega20_reset_hdp_ras_error_count(struct amdgpu_device +*adev) { + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP)) + return; + /*read back hdp ras counter to reset it to 0 */ + RREG32_SOC15(HDP, 0, mmHDP_EDC_CNT); +} + static void soc15_get_pcie_usage(struct amdgpu_device *adev, uint64_t *count0, uint64_t *count1) { @@ -994,6 +1003,7 @@ static const struct amdgpu_asic_funcs vega20_asic_funcs = .get_config_memsize = &soc15_get_config_memsize, .flush_hdp = &soc15_flush_hdp, .invalidate_hdp = &soc15_invalidate_hdp, + .reset_hdp_ras_error_count = &vega20_reset_hdp_ras_error_count, .need_full_reset = &soc15_need_full_reset, .init_doorbell_index = &vega20_doorbell_index_init, .get_pcie_usage = &vega20_get_pcie_usage, @@ -1239,6 +1249,10 @@ static int soc15_common_late_init(void *handle) if (amdgpu_sriov_vf(adev)) xgpu_ai_mailbox_get_irq(adev); + if (adev->asic_funcs && + adev->asic_funcs->reset_hdp_ras_error_count) + adev->asic_funcs->reset_hdp_ras_error_count(adev); + if (adev->nbio.funcs->ras_late_init) r = adev->nbio.funcs->ras_late_init(adev); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP
[AMD Official Use Only - Internal Distribution Only] The series is: Reviewed-by: Tao Zhou > -----Original Message----- > From: Hawking Zhang > Sent: 2020年3月2日 18:34 > To: amd-gfx@lists.freedesktop.org; Clements, John > ; Li, Dennis ; Chen, > Guchun ; Zhou1, Tao ; > Deucher, Alexander > Cc: Zhang, Hawking > Subject: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for > HDP > > HDP ras error counters are dirty ones after cold reboot. Read operation is > needed to reset them to 0. > > Signed-off-by: Hawking Zhang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 1 - > drivers/gpu/drm/amd/amdgpu/soc15.c| 14 ++ > 3 files changed, 15 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index a58b0cf9da51..b735e20888a7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -595,6 +595,7 @@ struct amdgpu_asic_funcs { > /* invalidate hdp read cache */ > void (*invalidate_hdp)(struct amdgpu_device *adev, > struct amdgpu_ring *ring); > + void (*reset_hdp_ras_error_count)(struct amdgpu_device *adev); > /* check if the asic needs a full reset of if soft reset will work */ > bool (*need_full_reset)(struct amdgpu_device *adev); > /* initialize doorbell layout for specific asic*/ diff --git > a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > index b746f26f933c..efd52bcf8785 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > @@ -4149,7 +4149,6 @@ static const struct soc15_reg_entry > gfx_v9_0_edc_counter_regs[] = { > { SOC15_REG_ENTRY(GC, 0, mmTCC_EDC_CNT2), 0, 1, 16}, > { SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2}, > { SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6}, > - { SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1}, > }; > > static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev) > diff --git
a/drivers/gpu/drm/amd/amdgpu/soc15.c > b/drivers/gpu/drm/amd/amdgpu/soc15.c > index 4aa5b9c8e43b..6b717691d554 100644 > --- a/drivers/gpu/drm/amd/amdgpu/soc15.c > +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c > @@ -827,6 +827,15 @@ static bool soc15_need_full_reset(struct > amdgpu_device *adev) > /* change this when we implement soft reset */ > return true; > } > + > +static void vega20_reset_hdp_ras_error_count(struct amdgpu_device > +*adev) { > + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP)) > + return; > + /*read back hdp ras counter to reset it to 0 */ > + RREG32_SOC15(HDP, 0, mmHDP_EDC_CNT); > +} > + > static void soc15_get_pcie_usage(struct amdgpu_device *adev, uint64_t > *count0, >uint64_t *count1) > { > @@ -994,6 +1003,7 @@ static const struct amdgpu_asic_funcs > vega20_asic_funcs = > .get_config_memsize = &soc15_get_config_memsize, > .flush_hdp = &soc15_flush_hdp, > .invalidate_hdp = &soc15_invalidate_hdp, > + .reset_hdp_ras_error_count = &vega20_reset_hdp_ras_error_count, > .need_full_reset = &soc15_need_full_reset, > .init_doorbell_index = &vega20_doorbell_index_init, > .get_pcie_usage = &vega20_get_pcie_usage, @@ -1239,6 +1249,10 > @@ static int soc15_common_late_init(void *handle) > if (amdgpu_sriov_vf(adev)) > xgpu_ai_mailbox_get_irq(adev); > > + if (adev->asic_funcs && > + adev->asic_funcs->reset_hdp_ras_error_count) > + adev->asic_funcs->reset_hdp_ras_error_count(adev); > + > if (adev->nbio.funcs->ras_late_init) > r = adev->nbio.funcs->ras_late_init(adev); > > -- > 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP
[AMD Public Use] Series is: Reviewed-by: Alex Deucher From: Hawking Zhang Sent: Monday, March 2, 2020 5:33 AM To: amd-gfx@lists.freedesktop.org ; Clements, John ; Li, Dennis ; Chen, Guchun ; Zhou1, Tao ; Deucher, Alexander Cc: Zhang, Hawking Subject: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP HDP ras error counters are dirty ones after cold reboot. Read operation is needed to reset them to 0. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 1 - drivers/gpu/drm/amd/amdgpu/soc15.c| 14 ++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a58b0cf9da51..b735e20888a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -595,6 +595,7 @@ struct amdgpu_asic_funcs { /* invalidate hdp read cache */ void (*invalidate_hdp)(struct amdgpu_device *adev, struct amdgpu_ring *ring); + void (*reset_hdp_ras_error_count)(struct amdgpu_device *adev); /* check if the asic needs a full reset of if soft reset will work */ bool (*need_full_reset)(struct amdgpu_device *adev); /* initialize doorbell layout for specific asic*/ diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index b746f26f933c..efd52bcf8785 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4149,7 +4149,6 @@ static const struct soc15_reg_entry gfx_v9_0_edc_counter_regs[] = { { SOC15_REG_ENTRY(GC, 0, mmTCC_EDC_CNT2), 0, 1, 16}, { SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2}, { SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6}, - { SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1}, }; static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 4aa5b9c8e43b..6b717691d554 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++
b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -827,6 +827,15 @@ static bool soc15_need_full_reset(struct amdgpu_device *adev) /* change this when we implement soft reset */ return true; } + +static void vega20_reset_hdp_ras_error_count(struct amdgpu_device *adev) +{ + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP)) + return; + /*read back hdp ras counter to reset it to 0 */ + RREG32_SOC15(HDP, 0, mmHDP_EDC_CNT); +} + static void soc15_get_pcie_usage(struct amdgpu_device *adev, uint64_t *count0, uint64_t *count1) { @@ -994,6 +1003,7 @@ static const struct amdgpu_asic_funcs vega20_asic_funcs = .get_config_memsize = &soc15_get_config_memsize, .flush_hdp = &soc15_flush_hdp, .invalidate_hdp = &soc15_invalidate_hdp, + .reset_hdp_ras_error_count = &vega20_reset_hdp_ras_error_count, .need_full_reset = &soc15_need_full_reset, .init_doorbell_index = &vega20_doorbell_index_init, .get_pcie_usage = &vega20_get_pcie_usage, @@ -1239,6 +1249,10 @@ static int soc15_common_late_init(void *handle) if (amdgpu_sriov_vf(adev)) xgpu_ai_mailbox_get_irq(adev); + if (adev->asic_funcs && + adev->asic_funcs->reset_hdp_ras_error_count) + adev->asic_funcs->reset_hdp_ras_error_count(adev); + if (adev->nbio.funcs->ras_late_init) r = adev->nbio.funcs->ras_late_init(adev); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP
HDP ras error counters are dirty ones after cold reboot. Read operation is needed to reset them to 0. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 1 - drivers/gpu/drm/amd/amdgpu/soc15.c| 14 ++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a58b0cf9da51..b735e20888a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -595,6 +595,7 @@ struct amdgpu_asic_funcs { /* invalidate hdp read cache */ void (*invalidate_hdp)(struct amdgpu_device *adev, struct amdgpu_ring *ring); + void (*reset_hdp_ras_error_count)(struct amdgpu_device *adev); /* check if the asic needs a full reset of if soft reset will work */ bool (*need_full_reset)(struct amdgpu_device *adev); /* initialize doorbell layout for specific asic*/ diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index b746f26f933c..efd52bcf8785 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4149,7 +4149,6 @@ static const struct soc15_reg_entry gfx_v9_0_edc_counter_regs[] = { { SOC15_REG_ENTRY(GC, 0, mmTCC_EDC_CNT2), 0, 1, 16}, { SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2}, { SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6}, - { SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1}, }; static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 4aa5b9c8e43b..6b717691d554 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -827,6 +827,15 @@ static bool soc15_need_full_reset(struct amdgpu_device *adev) /* change this when we implement soft reset */ return true; } + +static void vega20_reset_hdp_ras_error_count(struct amdgpu_device *adev) +{ + if (!amdgpu_ras_is_supported(adev,
AMDGPU_RAS_BLOCK__HDP)) + return; + /*read back hdp ras counter to reset it to 0 */ + RREG32_SOC15(HDP, 0, mmHDP_EDC_CNT); +} + static void soc15_get_pcie_usage(struct amdgpu_device *adev, uint64_t *count0, uint64_t *count1) { @@ -994,6 +1003,7 @@ static const struct amdgpu_asic_funcs vega20_asic_funcs = .get_config_memsize = &soc15_get_config_memsize, .flush_hdp = &soc15_flush_hdp, .invalidate_hdp = &soc15_invalidate_hdp, + .reset_hdp_ras_error_count = &vega20_reset_hdp_ras_error_count, .need_full_reset = &soc15_need_full_reset, .init_doorbell_index = &vega20_doorbell_index_init, .get_pcie_usage = &vega20_get_pcie_usage, @@ -1239,6 +1249,10 @@ static int soc15_common_late_init(void *handle) if (amdgpu_sriov_vf(adev)) xgpu_ai_mailbox_get_irq(adev); + if (adev->asic_funcs && + adev->asic_funcs->reset_hdp_ras_error_count) + adev->asic_funcs->reset_hdp_ras_error_count(adev); + if (adev->nbio.funcs->ras_late_init) r = adev->nbio.funcs->ras_late_init(adev); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx