Whilst investigating some mysterious failures with hangcheck not running
during gem_busy/basic-hang-default, the question arose: why did we decide
to cancel the retire_work (which queues the hangcheck)? That decision is
based on GT activity, so include that information in the debug report.

v2: Include the GT awake status in the error state

Signed-off-by: Chris Wilson <[email protected]>
Cc: Mika Kuoppala <[email protected]>
---
 drivers/gpu/drm/i915/i915_debugfs.c   | 17 +++++++++++++----
 drivers/gpu/drm/i915/i915_drv.h       |  1 +
 drivers/gpu/drm/i915/i915_gpu_error.c |  3 +++
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 9b697fd03721..ee4bf6f71cab 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1341,14 +1341,17 @@ static int i915_hangcheck_info(struct seq_file *m, void 
*unused)
        } else
                seq_printf(m, "Hangcheck inactive\n");
 
+       seq_printf(m, "GT active? %s\n", yesno(dev_priv->gt.awake));
+
        for_each_engine(engine, dev_priv, id) {
                struct intel_breadcrumbs *b = &engine->breadcrumbs;
                struct rb_node *rb;
 
                seq_printf(m, "%s:\n", engine->name);
-               seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
+               seq_printf(m, "\tseqno = %x [current %x, last %x], inflight 
%d\n",
                           engine->hangcheck.seqno, seqno[id],
-                          intel_engine_last_submit(engine));
+                          intel_engine_last_submit(engine),
+                          engine->timeline->inflight_seqnos);
                seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? 
%s\n",
                           yesno(intel_engine_has_waiter(engine)),
                           yesno(test_bit(engine->id,
@@ -3240,6 +3243,11 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
 
        intel_runtime_pm_get(dev_priv);
 
+       seq_printf(m, "GT awake? %s\n",
+                  yesno(dev_priv->gt.awake));
+       seq_printf(m, "Global active requests: %d\n",
+                  dev_priv->gt.active_requests);
+
        for_each_engine(engine, dev_priv, id) {
                struct intel_breadcrumbs *b = &engine->breadcrumbs;
                struct drm_i915_gem_request *rq;
@@ -3247,11 +3255,12 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
                u64 addr;
 
                seq_printf(m, "%s\n", engine->name);
-               seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d 
ms]\n",
+               seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d 
ms], inflight %d\n",
                           intel_engine_get_seqno(engine),
                           intel_engine_last_submit(engine),
                           engine->hangcheck.seqno,
-                          jiffies_to_msecs(jiffies - 
engine->hangcheck.action_timestamp));
+                          jiffies_to_msecs(jiffies - 
engine->hangcheck.action_timestamp),
+                          engine->timeline->inflight_seqnos);
 
                rcu_read_lock();
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7a2f2e4468d6..66f19924828a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -936,6 +936,7 @@ struct i915_gpu_state {
 
        char error_msg[128];
        bool simulated;
+       bool awake;
        int iommu;
        u32 reset_count;
        u32 suspend_count;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 76855e1d8795..b4ae1464e0ab 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -632,6 +632,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf 
*m,
                           CSR_VERSION_MINOR(csr->version));
        }
 
+       err_printf(m, "GT awake: %s\n", yesno(error->awake));
        err_printf(m, "EIR: 0x%08x\n", error->eir);
        err_printf(m, "IER: 0x%08x\n", error->ier);
        for (i = 0; i < error->ngtier; i++)
@@ -1653,6 +1654,8 @@ static void i915_error_capture_msg(struct 
drm_i915_private *dev_priv,
 static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
                                   struct i915_gpu_state *error)
 {
+       error->awake = dev_priv->gt.awake;
+
        error->iommu = -1;
 #ifdef CONFIG_INTEL_IOMMU
        error->iommu = intel_iommu_gfx_mapped;
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to