On 1/18/2023 10:49 PM, john.c.harri...@intel.com wrote:
From: John Harrison <john.c.harri...@intel.com>

The debugfs dump of requests was confused about what state requires
the execlist lock versus the GuC lock. There was also a bunch of
duplicated messy code between it and the error capture code.

So refactor the hung request search into a re-usable function. And
reduce the span of the execlist state lock to only the execlist
specific code paths.

Signed-off-by: John Harrison <john.c.harri...@intel.com>
---
  drivers/gpu/drm/i915/gt/intel_context.c   | 29 +++++++++++++
  drivers/gpu/drm/i915/gt/intel_context.h   |  3 ++
  drivers/gpu/drm/i915/gt/intel_engine_cs.c | 51 +++++++++++------------
  drivers/gpu/drm/i915/i915_gpu_error.c     | 27 ++----------
  4 files changed, 60 insertions(+), 50 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
b/drivers/gpu/drm/i915/gt/intel_context.c
index e7c5509c48ef1..a61f052092ed9 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -559,6 +559,35 @@ struct i915_request 
*intel_context_find_active_request(struct intel_context *ce)
        return active;
  }
+void intel_get_hung_entity(struct intel_engine_cs *engine,
+                          struct intel_context **ce, struct i915_request **rq)
IMO this belongs in the engine_cs.c file, given that the engine is the input. Might also be worth renaming to intel_engine_*

+{
+       unsigned long flags;
+
+       *ce = intel_engine_get_hung_context(engine);
+       if (*ce) {
+               intel_engine_clear_hung_context(engine);
+
+               /* This will reference count the request (if found) */
+               *rq = intel_context_find_active_request(*ce);
+
+               return;
+       }
+
+       /*
+        * Getting here with GuC enabled means it is a forced error capture
+        * with no actual hang. So, no need to attempt the execlist search.
+        */
+       if (intel_uc_uses_guc_submission(&engine->gt->uc))
+               return;
+
+       spin_lock_irqsave(&engine->sched_engine->lock, flags);
+       *rq = intel_engine_execlist_find_hung_request(engine);
+       if (*rq)
+               *rq = i915_request_get_rcu(*rq);
+       spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
+}
+
  void intel_context_bind_parent_child(struct intel_context *parent,
                                     struct intel_context *child)
  {
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
b/drivers/gpu/drm/i915/gt/intel_context.h
index fb62b7b8cbcda..ca50f3312a941 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -271,6 +271,9 @@ struct i915_request *intel_context_create_request(struct 
intel_context *ce);
  struct i915_request *
  intel_context_find_active_request(struct intel_context *ce);
+void intel_get_hung_entity(struct intel_engine_cs *engine,
+                          struct intel_context **ce, struct i915_request **rq);
+
  static inline bool intel_context_is_barrier(const struct intel_context *ce)
  {
        return test_bit(CONTEXT_BARRIER_BIT, &ce->flags);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 6a082658d0082..5e173dfc8849e 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -2216,11 +2216,27 @@ void intel_engine_dump_active_requests(struct list_head 
*requests,
        }
  }
-static void engine_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m)
+static void execlist_dump_active_requests(struct intel_engine_cs *engine,

Might be worth moving this to the execlists submission file, to be symmetrical with the GuC submission one. Other execlists functions are in this file though, so not a blocker.

+                                         struct i915_request *hung_rq,
+                                         struct drm_printer *m)
  {
+       unsigned long flags;
+
+       spin_lock_irqsave(&engine->sched_engine->lock, flags);
+
+       intel_engine_dump_active_requests(&engine->sched_engine->requests, 
hung_rq, m);
+
+       drm_printf(m, "\tOn hold?: %lu\n",
+                  list_count(&engine->sched_engine->hold));
+
+       spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
+}
+
+static void engine_dump_active_requests(struct intel_engine_cs *engine,
+                                       struct drm_printer *m)
+{
+       struct intel_context *hung_ce = NULL;
        struct i915_request *hung_rq = NULL;
-       struct intel_context *ce;
-       bool guc;
/*
         * No need for an engine->irq_seqno_barrier() before the seqno reads.
@@ -2229,31 +2245,20 @@ static void engine_dump_active_requests(struct 
intel_engine_cs *engine, struct d
         * But the intention here is just to report an instantaneous snapshot
         * so that's fine.
         */
-       lockdep_assert_held(&engine->sched_engine->lock);
+       intel_get_hung_entity(engine, &hung_ce, &hung_rq);
drm_printf(m, "\tRequests:\n"); - guc = intel_uc_uses_guc_submission(&engine->gt->uc);
-       if (guc) {
-               ce = intel_engine_get_hung_context(engine);
-               if (ce) {
-                       /* This will reference count the request (if found) */
-                       hung_rq = intel_context_find_active_request(ce);
-               }
-       } else {
-               hung_rq = intel_engine_execlist_find_hung_request(engine);
-               if (hung_rq)
-                       hung_rq = i915_request_get_rcu(hung_rq);
-       }
-
        if (hung_rq)
                engine_dump_request(hung_rq, m, "\t\thung");
+       else if (hung_ce)
+               drm_printf(m, "\t\tGot hung ce but no hung rq!\n");
- if (guc)
+       if (intel_uc_uses_guc_submission(&engine->gt->uc))
                intel_guc_dump_active_requests(engine, hung_rq, m);
        else
-               
intel_engine_dump_active_requests(&engine->sched_engine->requests,
-                                                 hung_rq, m);
+               execlist_dump_active_requests(engine, hung_rq, m);
+
        if (hung_rq)
                i915_request_put(hung_rq);
  }
@@ -2265,7 +2270,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
        struct i915_gpu_error * const error = &engine->i915->gpu_error;
        struct i915_request *rq;
        intel_wakeref_t wakeref;
-       unsigned long flags;
        ktime_t dummy;
if (header) {
@@ -2302,13 +2306,8 @@ void intel_engine_dump(struct intel_engine_cs *engine,
                   i915_reset_count(error));
        print_properties(engine, m);
- spin_lock_irqsave(&engine->sched_engine->lock, flags);
        engine_dump_active_requests(engine, m);
- drm_printf(m, "\tOn hold?: %lu\n",
-                  list_count(&engine->sched_engine->hold));

This print moves from the global function to the execlists submission one. AFAICS this is only used in execlists mode so not an issue, but a note in the commit message would've been nice.

-       spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
-
        drm_printf(m, "\tMMIO base:  0x%08x\n", engine->mmio_base);
        wakeref = intel_runtime_pm_get_if_in_use(engine->uncore->rpm);
        if (wakeref) {
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 7ea36478ee52d..78cf95e4dd230 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1596,36 +1596,15 @@ capture_engine(struct intel_engine_cs *engine,
  {
        struct intel_engine_capture_vma *capture = NULL;
        struct intel_engine_coredump *ee;
-       struct intel_context *ce;
+       struct intel_context *ce = NULL;
        struct i915_request *rq = NULL;
-       unsigned long flags;
ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL, dump_flags);
        if (!ee)
                return NULL;
- ce = intel_engine_get_hung_context(engine);
-       if (ce) {
-               intel_engine_clear_hung_context(engine);
-               /* This will reference count the request (if found) */
-               rq = intel_context_find_active_request(ce);
-               if (!rq || !i915_request_started(rq))
-                       goto no_request_capture;
-       } else {
-               /*
-                * Getting here with GuC enabled means it is a forced error 
capture
-                * with no actual hang. So, no need to attempt the execlist 
search.
-                */
-               if (!intel_uc_uses_guc_submission(&engine->gt->uc)) {
-                       spin_lock_irqsave(&engine->sched_engine->lock, flags);
-                       rq = intel_engine_execlist_find_hung_request(engine);
-                       if (rq)
-                               rq = i915_request_get_rcu(rq);
-                       spin_unlock_irqrestore(&engine->sched_engine->lock,
-                                              flags);
-               }
-       }
-       if (!rq)
+       intel_get_hung_entity(engine, &ce, &rq);
+       if (!rq || !i915_request_started(rq))
                goto no_request_capture;

This has been a pain to review, but AFAICS all the locking is correct, so with the get_hung function moved to the intel_engine file this is:

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospu...@intel.com>

capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);

Reply via email to