On Gen6+ there are other rings which may be in use. The GPU has not hung if
the blit or media ring is still making progress.

Before rebase:
Reviewed-by: Daniel Vetter <[email protected]>
Signed-off-by: Ben Widawsky <[email protected]>
---
 drivers/gpu/drm/i915/i915_drv.h |    5 +-
 drivers/gpu/drm/i915/i915_irq.c |  143 +++++++++++++++++++++++++++------------
 2 files changed, 102 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 567275c..edfa8be 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -323,9 +323,8 @@ typedef struct drm_i915_private {
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
        struct timer_list hangcheck_timer;
        int hangcheck_count;
-       uint32_t last_acthd;
-       uint32_t last_instdone;
-       uint32_t last_instdone1;
+       uint32_t last_acthd[I915_NUM_RINGS];
+       uint64_t last_instdone[I915_NUM_RINGS];
 
        unsigned long cfb_size;
        unsigned int cfb_fb;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 97e338b..6b6abe1 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -948,7 +948,7 @@ static void i915_capture_error_state(struct drm_device *dev)
                error->instdone[RCS] = I915_READ(INSTDONE_I965);
                error->instps[RCS] = I915_READ(INSTPS);
                error->instdone1 = I915_READ(INSTDONE1);
-               error->acthd = I915_READ(ACTHD_I965);
+               error->acthd[RCS] = I915_READ(ACTHD_I965);
                error->bbaddr = I915_READ64(BB_ADDR);
        } else {
                error->ipeir[RCS] = I915_READ(IPEIR);
@@ -1666,6 +1666,85 @@ static bool kick_ring(struct intel_ring_buffer *ring)
        return false;
 }
 
+/* Report whether every sampled INSTDONE value equals the previous call's. */
+static bool instdone_stuck(struct drm_device *dev)
+{
+       struct drm_i915_private *dev_priv = dev->dev_private;
+       uint64_t instdone = 0, instdone1 = 0;
+       uint64_t vcs_instdone = 0, bcs_instdone = 0;
+       bool stuck;
+
+       switch (INTEL_INFO(dev)->gen) {
+       case 7:
+       case 6:
+               bcs_instdone = I915_READ(BCS_INSTDONE); /* fall through */
+       case 5:
+               vcs_instdone = I915_READ(VCS_INSTDONE); /* fall through */
+       case 4:
+               instdone = I915_READ(INSTDONE_I965);
+               instdone1 = I915_READ(INSTDONE1);
+               break;
+       case 3:
+       case 2:
+               instdone = I915_READ(INSTDONE);
+               break;
+       }
+       /* RCS value: INSTDONE shifted into the upper 32 bits, OR-ed with INSTDONE1. */
+       stuck =
+           (dev_priv->last_instdone[RCS] == ((instdone << 32) | instdone1)) &&
+           (dev_priv->last_instdone[VCS] == vcs_instdone) &&
+           (dev_priv->last_instdone[BCS] == bcs_instdone);
+       /* Save this sample for the next call; rings not read above are stored as 0. */
+       dev_priv->last_instdone[RCS] = (instdone << 32) | instdone1;
+       dev_priv->last_instdone[VCS] = vcs_instdone;
+       dev_priv->last_instdone[BCS] = bcs_instdone;
+       return stuck;
+}
+
+static bool
+acthd_stuck(struct drm_device *dev)
+{
+       struct drm_i915_private *dev_priv = dev->dev_private;
+       uint32_t acthd = 0, vcs_acthd = 0, bcs_acthd = 0;
+       bool stuck = false;
+       /* Sample the active head of each ring read for this gen; others stay 0. */
+       switch (INTEL_INFO(dev)->gen) {
+       case 7:
+       case 6:
+               bcs_acthd = intel_ring_get_active_head(&dev_priv->ring[BCS]);
+       case 5: /* gen 6 and 7 fall through to here after sampling BCS */
+               vcs_acthd = intel_ring_get_active_head(&dev_priv->ring[VCS]);
+       case 4: /* gen 5 and up fall through to here after sampling VCS */
+       case 3:
+       case 2:
+               acthd = intel_ring_get_active_head(&dev_priv->ring[RCS]);
+               break;
+       }
+       /* Stuck only if all three heads equal the values stored by the last call. */
+       stuck =
+           (dev_priv->last_acthd[RCS] == acthd) &&
+           (dev_priv->last_acthd[VCS] == vcs_acthd) &&
+           (dev_priv->last_acthd[BCS] == bcs_acthd);
+       /* Store this sample for the next call to compare against. */
+       dev_priv->last_acthd[RCS] = acthd;
+       dev_priv->last_acthd[VCS] = vcs_acthd;
+       dev_priv->last_acthd[BCS] = bcs_acthd;
+
+       return stuck;
+}
+
+/* After the grace period: true if ACTHD and INSTDONE have both stalled. */
+static bool gpu_stuck(struct drm_device *dev)
+{
+       #define NUM_HANGCHECKS_TO_RESET 1 /* initial calls that only return false */
+       struct drm_i915_private *dev_priv = dev->dev_private;
+
+       if (dev_priv->hangcheck_count++ < NUM_HANGCHECKS_TO_RESET)
+               return false;
+       /* '&', not '&&': both helpers must run so each refreshes its snapshot. */
+       return acthd_stuck(dev) & instdone_stuck(dev);
+}
+
 /**
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. The first time this is called we simply record
@@ -1676,13 +1755,11 @@ void i915_hangcheck_elapsed(unsigned long data)
 {
        struct drm_device *dev = (struct drm_device *)data;
        drm_i915_private_t *dev_priv = dev->dev_private;
-       uint32_t acthd, instdone, instdone1;
        bool err = false;
 
        if (!i915_enable_hangcheck)
                return;
 
-       /* If all work is done then ACTHD clearly hasn't advanced. */
        if (i915_hangcheck_ring_idle(&dev_priv->ring[RCS], &err) &&
            i915_hangcheck_ring_idle(&dev_priv->ring[VCS], &err) &&
            i915_hangcheck_ring_idle(&dev_priv->ring[BCS], &err)) {
@@ -1692,50 +1769,30 @@ void i915_hangcheck_elapsed(unsigned long data)
                return;
        }
 
-       if (INTEL_INFO(dev)->gen < 4) {
-               acthd = I915_READ(ACTHD);
-               instdone = I915_READ(INSTDONE);
-               instdone1 = 0;
-       } else {
-               acthd = I915_READ(ACTHD_I965);
-               instdone = I915_READ(INSTDONE_I965);
-               instdone1 = I915_READ(INSTDONE1);
-       }
-
-       if (dev_priv->last_acthd == acthd &&
-           dev_priv->last_instdone == instdone &&
-           dev_priv->last_instdone1 == instdone1) {
-               if (dev_priv->hangcheck_count++ > 1) {
-                       DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
+       if (gpu_stuck(dev)) {
+               DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
 
-                       if (!IS_GEN2(dev)) {
-                               /* Is the chip hanging on a WAIT_FOR_EVENT?
-                                * If so we can simply poke the RB_WAIT bit
-                                * and break the hang. This should work on
-                                * all but the second generation chipsets.
-                                */
-
-                               if (kick_ring(&dev_priv->ring[RCS]))
-                                       goto repeat;
+               if (!IS_GEN2(dev)) {
+                       /* Is the chip hanging on a WAIT_FOR_EVENT?
+                        * If so we can simply poke the RB_WAIT bit
+                        * and break the hang. This should work on
+                        * all but the second generation chipsets.
+                        */
 
-                               if (HAS_BSD(dev) &&
-                                   kick_ring(&dev_priv->ring[VCS]))
-                                       goto repeat;
+                       if (kick_ring(&dev_priv->ring[RCS]))
+                               goto repeat;
 
-                               if (HAS_BLT(dev) &&
-                                   kick_ring(&dev_priv->ring[BCS]))
-                                       goto repeat;
-                       }
+                       if (HAS_BSD(dev) &&
+                           kick_ring(&dev_priv->ring[VCS]))
+                               goto repeat;
 
-                       i915_handle_error(dev, true);
-                       return;
+                       if (HAS_BLT(dev) &&
+                           kick_ring(&dev_priv->ring[BCS]))
+                               goto repeat;
                }
-       } else {
-               dev_priv->hangcheck_count = 0;
 
-               dev_priv->last_acthd = acthd;
-               dev_priv->last_instdone = instdone;
-               dev_priv->last_instdone1 = instdone1;
+               i915_handle_error(dev, true);
+               return;
        }
 
 repeat:
-- 
1.7.6.4

_______________________________________________
Intel-gfx mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to