Re: [PATCH 3/3] drm/i915/guc: Track all sent actions to GuC

2023-05-30 Thread John Harrison

On 5/26/2023 16:55, john.c.harri...@intel.com wrote:

From: Michal Wajdeczko 

For easier debug of any unexpected error responses from GuC that
might be related to non-blocking fast requests, track action code (and
stack if under DEBUG_GUC config) for every H2G request.

Signed-off-by: Michal Wajdeczko 
Signed-off-by: John Harrison 

Reviewed-by: John Harrison 


---
  drivers/gpu/drm/i915/Kconfig.debug|  1 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 68 ++-
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h | 11 
  3 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig.debug 
b/drivers/gpu/drm/i915/Kconfig.debug
index 47e845353ffad..2d21930d55015 100644
--- a/drivers/gpu/drm/i915/Kconfig.debug
+++ b/drivers/gpu/drm/i915/Kconfig.debug
@@ -157,6 +157,7 @@ config DRM_I915_SW_FENCE_CHECK_DAG
  config DRM_I915_DEBUG_GUC
bool "Enable additional driver debugging for GuC"
depends on DRM_I915
+   select STACKDEPOT
default n
help
  Choose this option to turn on extra driver debugging that may affect
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index 3a71bb582089e..4aa903be1317b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -376,6 +376,24 @@ void intel_guc_ct_disable(struct intel_guc_ct *ct)
}
  }
  
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)

+static void ct_track_lost_and_found(struct intel_guc_ct *ct, u32 fence, u32 
action)
+{
+   unsigned int lost = fence % ARRAY_SIZE(ct->requests.lost_and_found);
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+   unsigned long entries[SZ_32];
+   unsigned int n;
+
+   n = stack_trace_save(entries, ARRAY_SIZE(entries), 1);
+
+   /* May be called under spinlock, so avoid sleeping */
+   ct->requests.lost_and_found[lost].stack = stack_depot_save(entries, n, 
GFP_NOWAIT);
+#endif
+   ct->requests.lost_and_found[lost].fence = fence;
+   ct->requests.lost_and_found[lost].action = action;
+}
+#endif
+
  static u32 ct_get_next_fence(struct intel_guc_ct *ct)
  {
/* For now it's trivial */
@@ -447,6 +465,11 @@ static int ct_write(struct intel_guc_ct *ct,
}
GEM_BUG_ON(tail > size);
  
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)

+   ct_track_lost_and_found(ct, fence,
+   FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, 
action[0]));
+#endif
+
/*
 * make sure H2G buffer update and LRC tail update (if this triggering a
 * submission) are visible before updating the descriptor tail
@@ -953,6 +976,43 @@ static int ct_read(struct intel_guc_ct *ct, struct 
ct_incoming_msg **msg)
return -EPIPE;
  }
  
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)

+static bool ct_check_lost_and_found(struct intel_guc_ct *ct, u32 fence)
+{
+   unsigned int n;
+   char *buf = NULL;
+   bool found = false;
+
+   lockdep_assert_held(>requests.lock);
+
+   for (n = 0; n < ARRAY_SIZE(ct->requests.lost_and_found); n++) {
+   if (ct->requests.lost_and_found[n].fence != fence)
+   continue;
+   found = true;
+
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+   buf = kmalloc(SZ_4K, GFP_NOWAIT);
+   if (buf && 
stack_depot_snprint(ct->requests.lost_and_found[n].stack,
+  buf, SZ_4K, 0)) {
+   CT_ERROR(ct, "Fence %u was used by action %#04x sent 
at\n%s",
+fence, ct->requests.lost_and_found[n].action, 
buf);
+   break;
+   }
+#endif
+   CT_ERROR(ct, "Fence %u was used by action %#04x\n",
+fence, ct->requests.lost_and_found[n].action);
+   break;
+   }
+   kfree(buf);
+   return found;
+}
+#else
+static bool ct_check_lost_and_found(struct intel_guc_ct *ct, u32 fence)
+{
+   return false;
+}
+#endif
+
  static int ct_handle_response(struct intel_guc_ct *ct, struct ct_incoming_msg 
*response)
  {
u32 len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, response->msg[0]);
@@ -996,9 +1056,11 @@ static int ct_handle_response(struct intel_guc_ct *ct, 
struct ct_incoming_msg *r
if (!found) {
CT_ERROR(ct, "Unsolicited response message: len %u, data %#x (fence 
%u, last %u)\n",
 len, hxg[0], fence, ct->requests.last_fence);
-   list_for_each_entry(req, >requests.pending, link)
-   CT_ERROR(ct, "request %u awaits response\n",
-req->fence);
+   if (!ct_check_lost_and_found(ct, fence)) {
+   list_for_each_entry(req, >requests.pending, link)
+   CT_ERROR(ct, "request %u awaits response\n",
+req->fence);
+   }
   

[PATCH 3/3] drm/i915/guc: Track all sent actions to GuC

2023-05-26 Thread John . C . Harrison
From: Michal Wajdeczko 

For easier debug of any unexpected error responses from GuC that
might be related to non-blocking fast requests, track action code (and
stack if under DEBUG_GUC config) for every H2G request.

Signed-off-by: Michal Wajdeczko 
Signed-off-by: John Harrison 
---
 drivers/gpu/drm/i915/Kconfig.debug|  1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 68 ++-
 drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h | 11 
 3 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig.debug 
b/drivers/gpu/drm/i915/Kconfig.debug
index 47e845353ffad..2d21930d55015 100644
--- a/drivers/gpu/drm/i915/Kconfig.debug
+++ b/drivers/gpu/drm/i915/Kconfig.debug
@@ -157,6 +157,7 @@ config DRM_I915_SW_FENCE_CHECK_DAG
 config DRM_I915_DEBUG_GUC
bool "Enable additional driver debugging for GuC"
depends on DRM_I915
+   select STACKDEPOT
default n
help
  Choose this option to turn on extra driver debugging that may affect
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index 3a71bb582089e..4aa903be1317b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -376,6 +376,24 @@ void intel_guc_ct_disable(struct intel_guc_ct *ct)
}
 }
 
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
+static void ct_track_lost_and_found(struct intel_guc_ct *ct, u32 fence, u32 
action)
+{
+   unsigned int lost = fence % ARRAY_SIZE(ct->requests.lost_and_found);
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+   unsigned long entries[SZ_32];
+   unsigned int n;
+
+   n = stack_trace_save(entries, ARRAY_SIZE(entries), 1);
+
+   /* May be called under spinlock, so avoid sleeping */
+   ct->requests.lost_and_found[lost].stack = stack_depot_save(entries, n, 
GFP_NOWAIT);
+#endif
+   ct->requests.lost_and_found[lost].fence = fence;
+   ct->requests.lost_and_found[lost].action = action;
+}
+#endif
+
 static u32 ct_get_next_fence(struct intel_guc_ct *ct)
 {
/* For now it's trivial */
@@ -447,6 +465,11 @@ static int ct_write(struct intel_guc_ct *ct,
}
GEM_BUG_ON(tail > size);
 
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
+   ct_track_lost_and_found(ct, fence,
+   FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, 
action[0]));
+#endif
+
/*
 * make sure H2G buffer update and LRC tail update (if this triggering a
 * submission) are visible before updating the descriptor tail
@@ -953,6 +976,43 @@ static int ct_read(struct intel_guc_ct *ct, struct 
ct_incoming_msg **msg)
return -EPIPE;
 }
 
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
+static bool ct_check_lost_and_found(struct intel_guc_ct *ct, u32 fence)
+{
+   unsigned int n;
+   char *buf = NULL;
+   bool found = false;
+
+   lockdep_assert_held(>requests.lock);
+
+   for (n = 0; n < ARRAY_SIZE(ct->requests.lost_and_found); n++) {
+   if (ct->requests.lost_and_found[n].fence != fence)
+   continue;
+   found = true;
+
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
+   buf = kmalloc(SZ_4K, GFP_NOWAIT);
+   if (buf && 
stack_depot_snprint(ct->requests.lost_and_found[n].stack,
+  buf, SZ_4K, 0)) {
+   CT_ERROR(ct, "Fence %u was used by action %#04x sent 
at\n%s",
+fence, ct->requests.lost_and_found[n].action, 
buf);
+   break;
+   }
+#endif
+   CT_ERROR(ct, "Fence %u was used by action %#04x\n",
+fence, ct->requests.lost_and_found[n].action);
+   break;
+   }
+   kfree(buf);
+   return found;
+}
+#else
+static bool ct_check_lost_and_found(struct intel_guc_ct *ct, u32 fence)
+{
+   return false;
+}
+#endif
+
 static int ct_handle_response(struct intel_guc_ct *ct, struct ct_incoming_msg 
*response)
 {
u32 len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, response->msg[0]);
@@ -996,9 +1056,11 @@ static int ct_handle_response(struct intel_guc_ct *ct, 
struct ct_incoming_msg *r
if (!found) {
CT_ERROR(ct, "Unsolicited response message: len %u, data %#x 
(fence %u, last %u)\n",
 len, hxg[0], fence, ct->requests.last_fence);
-   list_for_each_entry(req, >requests.pending, link)
-   CT_ERROR(ct, "request %u awaits response\n",
-req->fence);
+   if (!ct_check_lost_and_found(ct, fence)) {
+   list_for_each_entry(req, >requests.pending, link)
+   CT_ERROR(ct, "request %u awaits response\n",
+req->fence);
+   }
err = -ENOKEY;
}
spin_unlock_irqrestore(>requests.lock, flags);
diff --git