From: Fernando Pacheco <fernando.pach...@intel.com>

Correctable and uncorrectable Shared Local Memory (SLM)
ECC errors will be counted in two different Thread Dispatch
Logic (TDL) registers. GuC will receive a message
from TDL when the first correctable/uncorrectable error is
detected by SLM (first after a reset or register clear). This
message is then forwarded to the appropriate severity register.

Correctable errors will be routed to the kernel driver, and uncorrectable
errors are expected to be routed as PCIe errors, although the option
exists to route both as interrupts.

Service the interrupt and read the TDL registers to obtain the error count.

Cc: Paulo Zanoni <paulo.r.zan...@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospu...@intel.com>
Cc: Fernando Pacheco <fernando.pach...@intel.com>
Cc: Radhakrishna Sripada <radhakrishna.srip...@intel.com>
Signed-off-by: Fernando Pacheco <fernando.pach...@intel.com>
Signed-off-by: Lucas De Marchi <lucas.demar...@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c | 10 +++++++++-
 drivers/gpu/drm/i915/i915_reg.h |  7 +++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 17e679b910da..ca35edef492d 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2536,7 +2536,7 @@ gen12_gt_hw_error_handler(struct drm_i915_private * const 
i915,
 {
        void __iomem * const regs = i915->uncore.regs;
        const char *hw_err_str = hardware_error_type_to_str(hw_err);
-       u32 other_errors = ~(EU_GRF_ERROR | EU_IC_ERROR);
+       u32 other_errors = ~(EU_GRF_ERROR | EU_IC_ERROR | SLM_ERROR);
        u32 errstat;
 
        lockdep_assert_held(&i915->irq_lock);
@@ -2565,6 +2565,14 @@ gen12_gt_hw_error_handler(struct drm_i915_private * 
const i915,
        if (errstat & EU_IC_ERROR)
                DRM_ERROR("detected EU IC %s hardware error\n", hw_err_str);
 
+       if (errstat & SLM_ERROR) {
+               struct drm_i915_private *dev_priv = i915;
+
+               DRM_ERROR("detected %u SLM %s hardware error(s)\n",
+                         I915_READ(SLM_ECC_ERROR_CNTR(hw_err)),
+                         hw_err_str);
+       }
+
        /*
         * TODO: The remaining GT errors don't have a
         * need for targeted logging at the moment. We
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 40cb361b4254..b9c142f86611 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7765,6 +7765,13 @@ enum hardware_error {
                                                _ERR_STAT_GT_NONFATAL))
 #define  EU_GRF_ERROR                  (1 << 15)
 #define  EU_IC_ERROR                   (1 << 14)
+#define  SLM_ERROR                     (1 << 13)
+
+#define _SLM_ECC_ERROR_CNT             0xe7f4
+#define _SLM_UNCORR_ECC_ERROR_CNT      0xe7c0
+#define SLM_ECC_ERROR_CNTR(x)          _MMIO((x) == HARDWARE_ERROR_CORRECTABLE 
? \
+                                               _SLM_ECC_ERROR_CNT : \
+                                               _SLM_UNCORR_ECC_ERROR_CNT)
 
 #define GEN11_RENDER_COPY_INTR_ENABLE  _MMIO(0x190030)
 #define GEN11_VCS_VECS_INTR_ENABLE     _MMIO(0x190034)
-- 
2.26.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to