On 5/26/2023 16:55, john.c.harri...@intel.com wrote:
From: Michal Wajdeczko <michal.wajdec...@intel.com>

For easier debug of any unexpected error responses from GuC that
might be related to non-blocking fast requests, track action code (and
stack if under DEBUG_GUC config) for every H2G request.

Signed-off-by: Michal Wajdeczko <michal.wajdec...@intel.com>
Signed-off-by: John Harrison <john.c.harri...@intel.com>
Reviewed-by: John Harrison <john.c.harri...@intel.com>

---
  drivers/gpu/drm/i915/Kconfig.debug        |  1 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 68 ++++++++++++++++++++++-
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h | 11 ++++
  3 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig.debug 
b/drivers/gpu/drm/i915/Kconfig.debug
index 47e845353ffad..2d21930d55015 100644
--- a/drivers/gpu/drm/i915/Kconfig.debug
+++ b/drivers/gpu/drm/i915/Kconfig.debug
@@ -157,6 +157,7 @@ config DRM_I915_SW_FENCE_CHECK_DAG
  config DRM_I915_DEBUG_GUC
        bool "Enable additional driver debugging for GuC"
        depends on DRM_I915
+       select STACKDEPOT
        default n
        help
          Choose this option to turn on extra driver debugging that may affect
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index 3a71bb582089e..4aa903be1317b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -376,6 +376,24 @@ void intel_guc_ct_disable(struct intel_guc_ct *ct)
        }
  }
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
+static void ct_track_lost_and_found(struct intel_guc_ct *ct, u32 fence, u32 
action)
+{
+       unsigned int lost = fence % ARRAY_SIZE(ct->requests.lost_and_found);
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+       unsigned long entries[SZ_32];
+       unsigned int n;
+
+       n = stack_trace_save(entries, ARRAY_SIZE(entries), 1);
+
+       /* May be called under spinlock, so avoid sleeping */
+       ct->requests.lost_and_found[lost].stack = stack_depot_save(entries, n, 
GFP_NOWAIT);
+#endif
+       ct->requests.lost_and_found[lost].fence = fence;
+       ct->requests.lost_and_found[lost].action = action;
+}
+#endif
+
  static u32 ct_get_next_fence(struct intel_guc_ct *ct)
  {
        /* For now it's trivial */
@@ -447,6 +465,11 @@ static int ct_write(struct intel_guc_ct *ct,
        }
        GEM_BUG_ON(tail > size);
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
+       ct_track_lost_and_found(ct, fence,
+                               FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, 
action[0]));
+#endif
+
        /*
         * make sure H2G buffer update and LRC tail update (if this triggering a
         * submission) are visible before updating the descriptor tail
@@ -953,6 +976,43 @@ static int ct_read(struct intel_guc_ct *ct, struct 
ct_incoming_msg **msg)
        return -EPIPE;
  }
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
+static bool ct_check_lost_and_found(struct intel_guc_ct *ct, u32 fence)
+{
+       unsigned int n;
+       char *buf = NULL;
+       bool found = false;
+
+       lockdep_assert_held(&ct->requests.lock);
+
+       for (n = 0; n < ARRAY_SIZE(ct->requests.lost_and_found); n++) {
+               if (ct->requests.lost_and_found[n].fence != fence)
+                       continue;
+               found = true;
+
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+               buf = kmalloc(SZ_4K, GFP_NOWAIT);
+               if (buf && 
stack_depot_snprint(ct->requests.lost_and_found[n].stack,
+                                              buf, SZ_4K, 0)) {
+                       CT_ERROR(ct, "Fence %u was used by action %#04x sent 
at\n%s",
+                                fence, ct->requests.lost_and_found[n].action, 
buf);
+                       break;
+               }
+#endif
+               CT_ERROR(ct, "Fence %u was used by action %#04x\n",
+                        fence, ct->requests.lost_and_found[n].action);
+               break;
+       }
+       kfree(buf);
+       return found;
+}
+#else
+static bool ct_check_lost_and_found(struct intel_guc_ct *ct, u32 fence)
+{
+       return false;
+}
+#endif
+
  static int ct_handle_response(struct intel_guc_ct *ct, struct ct_incoming_msg 
*response)
  {
        u32 len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, response->msg[0]);
@@ -996,9 +1056,11 @@ static int ct_handle_response(struct intel_guc_ct *ct, 
struct ct_incoming_msg *r
        if (!found) {
                CT_ERROR(ct, "Unsolicited response message: len %u, data %#x (fence 
%u, last %u)\n",
                         len, hxg[0], fence, ct->requests.last_fence);
-               list_for_each_entry(req, &ct->requests.pending, link)
-                       CT_ERROR(ct, "request %u awaits response\n",
-                                req->fence);
+               if (!ct_check_lost_and_found(ct, fence)) {
+                       list_for_each_entry(req, &ct->requests.pending, link)
+                               CT_ERROR(ct, "request %u awaits response\n",
+                                        req->fence);
+               }
                err = -ENOKEY;
        }
        spin_unlock_irqrestore(&ct->requests.lock, flags);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
index 818415b64f4d1..58e42901ff498 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
@@ -8,6 +8,7 @@
#include <linux/interrupt.h>
  #include <linux/spinlock.h>
+#include <linux/stackdepot.h>
  #include <linux/workqueue.h>
  #include <linux/ktime.h>
  #include <linux/wait.h>
@@ -81,6 +82,16 @@ struct intel_guc_ct {
struct list_head incoming; /* incoming requests */
                struct work_struct worker; /* handler for incoming requests */
+
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
+               struct {
+                       u16 fence;
+                       u16 action;
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+                       depot_stack_handle_t stack;
+#endif
+               } lost_and_found[SZ_16];
+#endif
        } requests;
/** @stall_time: time of first time a CTB submission is stalled */

Reply via email to