Re: [Intel-gfx] [RFC] drm/i915/guc: capture GuC logs if FW fails to load

2017-05-05 Thread Chris Wilson
On Fri, May 05, 2017 at 08:43:36AM -0700, Daniele Ceraolo Spurio wrote:
> 
> 
> On 04/05/17 14:31, Chris Wilson wrote:
> >On Thu, May 04, 2017 at 09:26:35PM +, Srivatsa, Anusha wrote:
> >>>+void i915_guc_load_error_log_capture(struct drm_i915_private *i915) {
> >>>+  void *log, *buf;
> >>>+  struct i915_vma *vma = i915->guc.log.vma;
> >>>+
> >>>+  if (i915->gpu_error.guc_load_fail_log || !vma)
> >>>+  return;
> >>>+
> >>>+  /*
> >>>+   * the vma should be already pinned and mapped for log runtime
> >>>+   * management but let's play safe
> >>>+   */
> >>>+  log = i915_gem_object_pin_map(vma->obj, I915_MAP_WC);
> >>>+  if (IS_ERR(log)) {
> >>>+  DRM_ERROR("Failed to pin guc_log vma\n");
> >>>+  return;
> >>>+  }
> >>>+
> >>>+  buf = kzalloc(GUC_LOG_SIZE, GFP_KERNEL);
> >>>+  if (buf) {
> >>>+  memcpy(buf, log, GUC_LOG_SIZE);
> >>>+  i915->gpu_error.guc_load_fail_log = buf;
> >>>+  } else {
> >>>+  DRM_ERROR("Failed to copy guc log\n");
> >>>+  }
> >>>+
> >>>+  i915_gem_object_unpin_map(vma->obj);
> >
> >You are trading a swappable object for unswappable kernel memory. If you
> >want to have the guc log after guc is disabled, just keep the log object
> >around.
> >-Chris
> >
> 
> I had considered that, but in the end I wasn't sure it would be
> acceptable if we later modify the code to recycle the object for
> future load attempts, although that is very unlikely. I wasn't
> convinced about using kzalloc myself, which is mainly why this was
> an RFC :)
> I'll flip it to take an extra reference on the object.

Just make sure you unpin it. I would suggest killing the guc->log.vma
and taking the object and placing it somewhere else as you have for the
buf, but not i915->gpu_error as that doesn't fit (this isn't a runtime
error that we are tracking). i915->guc.last_load_log?
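
A minimal sketch of that direction (not the posted patch): take an extra
reference on the existing log object and stash it on the GuC struct, using
the last_load_log name above as a hypothetical field on struct intel_guc.

static void guc_capture_load_err_log(struct intel_guc *guc)
{
	/*
	 * Sketch only: keep the existing (swappable) log object alive
	 * across the failed load instead of copying GUC_LOG_SIZE bytes
	 * into unswappable kernel memory. guc->last_load_log is a
	 * hypothetical field added to struct intel_guc for this purpose.
	 */
	if (!guc->log.vma || guc->last_load_log)
		return;

	/* Extra reference so the object outlives guc_log teardown. */
	guc->last_load_log = i915_gem_object_get(guc->log.vma->obj);
}

static void guc_free_load_err_log(struct intel_guc *guc)
{
	/* Drop the snapshot reference on driver unload, if one was taken. */
	if (guc->last_load_log)
		i915_gem_object_put(guc->last_load_log);
}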
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


Re: [Intel-gfx] [RFC] drm/i915/guc: capture GuC logs if FW fails to load

2017-05-05 Thread Daniele Ceraolo Spurio



On 04/05/17 14:31, Chris Wilson wrote:

On Thu, May 04, 2017 at 09:26:35PM +, Srivatsa, Anusha wrote:

+void i915_guc_load_error_log_capture(struct drm_i915_private *i915) {
+   void *log, *buf;
+   struct i915_vma *vma = i915->guc.log.vma;
+
+   if (i915->gpu_error.guc_load_fail_log || !vma)
+   return;
+
+   /*
+* the vma should be already pinned and mapped for log runtime
+* management but let's play safe
+*/
+   log = i915_gem_object_pin_map(vma->obj, I915_MAP_WC);
+   if (IS_ERR(log)) {
+   DRM_ERROR("Failed to pin guc_log vma\n");
+   return;
+   }
+
+   buf = kzalloc(GUC_LOG_SIZE, GFP_KERNEL);
+   if (buf) {
+   memcpy(buf, log, GUC_LOG_SIZE);
+   i915->gpu_error.guc_load_fail_log = buf;
+   } else {
+   DRM_ERROR("Failed to copy guc log\n");
+   }
+
+   i915_gem_object_unpin_map(vma->obj);


You are trading a swappable object for unswappable kernel memory. If you
want to have the guc log after guc is disabled, just keep the log object
around.
-Chris



I had considered that, but in the end I wasn't sure it would be 
acceptable if we later modify the code to recycle the object for future 
load attempts, although that is very unlikely. I wasn't convinced about 
using kzalloc myself, which is mainly why this was an RFC :)

I'll flip it to take an extra reference on the object.

Thanks,
Daniele


Re: [Intel-gfx] [RFC] drm/i915/guc: capture GuC logs if FW fails to load

2017-05-05 Thread Daniele Ceraolo Spurio



On 04/05/17 14:26, Srivatsa, Anusha wrote:




-Original Message-
From: Intel-gfx [mailto:intel-gfx-boun...@lists.freedesktop.org] On Behalf Of
Daniele Ceraolo Spurio
Sent: Thursday, May 4, 2017 11:52 AM
To: intel-gfx@lists.freedesktop.org
Subject: [Intel-gfx] [RFC] drm/i915/guc: capture GuC logs if FW fails to load

We're currently deleting the GuC logs if the FW fails to load, but those are
still useful to understand why the loading failed. Instead of deleting them,
taking a snapshot allows us to access them after driver load is completed.


Hi Daniele, I like the idea. But just to confirm: we will still get the 
fetch and load status, like PENDING or FAIL, and the reason for the 
failure will be in debugfs. Correct?

Anusha



Yes, no changes to what comes out in dmesg. What goes into debugfs is not 
the failure reason itself but the GuC logs generated during the attempted 
load, which, once decoded, should show the reason for the failure.
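
For completeness, a tiny userspace reader to save that dump for offline
decoding (illustrative only; the debugfs path assumes DRM minor 0 and the
i915_guc_log_dump node):

/* Illustration only: copy the debugfs GuC log dump to stdout so it can be
 * redirected to a file and decoded offline. Adjust the path for your DRM
 * minor number. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/dri/0/i915_guc_log_dump";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}

	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);

	fclose(f);
	return 0;
}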


Daniele


Cc: Oscar Mateo <oscar.ma...@intel.com>
Cc: Michal Wajdeczko <michal.wajdec...@intel.com>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospu...@intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c   | 36 ---
drivers/gpu/drm/i915/i915_drv.c   |  3 +++
drivers/gpu/drm/i915/i915_drv.h   |  6 ++
drivers/gpu/drm/i915/i915_gpu_error.c | 36 +++
drivers/gpu/drm/i915/intel_guc_fwif.h | 14 +++---
drivers/gpu/drm/i915/intel_guc_log.c  | 10 ++
drivers/gpu/drm/i915/intel_uc.c   |  7 +--
7 files changed, 84 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 870c470..4ff20fc 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2543,26 +2543,32 @@ static int i915_guc_info(struct seq_file *m, void *data)
 static int i915_guc_log_dump(struct seq_file *m, void *data)
 {
struct drm_i915_private *dev_priv = node_to_i915(m->private);
-   struct drm_i915_gem_object *obj;
-   int i = 0, pg;
-
-   if (!dev_priv->guc.log.vma)
+   u32 *log;
+   int i = 0;
+
+   if (dev_priv->guc.log.vma) {
+   log = i915_gem_object_pin_map(dev_priv->guc.log.vma->obj,
+ I915_MAP_WC);
+   if (IS_ERR(log)) {
+   DRM_ERROR("Failed to pin guc_log vma\n");
+   return -ENOMEM;
+   }
+   } else if (dev_priv->gpu_error.guc_load_fail_log) {
+   log = dev_priv->gpu_error.guc_load_fail_log;
+   } else {
return 0;
-
-   obj = dev_priv->guc.log.vma->obj;
-   for (pg = 0; pg < obj->base.size / PAGE_SIZE; pg++) {
-   u32 *log = kmap_atomic(i915_gem_object_get_page(obj, pg));
-
-   for (i = 0; i < PAGE_SIZE / sizeof(u32); i += 4)
-   seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n",
-  *(log + i), *(log + i + 1),
-  *(log + i + 2), *(log + i + 3));
-
-   kunmap_atomic(log);
}

+   for (i = 0; i < GUC_LOG_SIZE / sizeof(u32); i += 4)
+   seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n",
+  *(log + i), *(log + i + 1),
+  *(log + i + 2), *(log + i + 3));
+
seq_putc(m, '\n');

+   if (dev_priv->guc.log.vma)
+   i915_gem_object_unpin_map(dev_priv->guc.log.vma->obj);
+
return 0;
}

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 452c265..c7cb36c 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1354,6 +1354,9 @@ void i915_driver_unload(struct drm_device *dev)
cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
i915_reset_error_state(dev_priv);

+   /* release GuC error log (if any) */
+   i915_guc_load_error_log_free(dev_priv);
+
/* Flush any outstanding unpin_work. */
drain_workqueue(dev_priv->wq);

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4588b3e..761c663 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1555,6 +1555,9 @@ struct i915_gpu_error {
/* Protected by the above dev->gpu_error.lock. */
struct i915_gpu_state *first_error;

+   /* Log snapshot if GuC errors during load */
+   void *guc_load_fail_log;
+
unsigned long missed_irq_rings;

/**
@@ -3687,6 +3690,9 @@ static inline void i915_reset_error_state(struct drm_i915_private *i915)

#endif

+void i915_guc_load_error_log_capture(struct drm_i915_private *i915);
+void i915_guc_load_error_log_free(struct drm_i915_private *i915);
+
const char *i915_cache_level

Re: [Intel-gfx] [RFC] drm/i915/guc: capture GuC logs if FW fails to load

2017-05-04 Thread Chris Wilson
On Thu, May 04, 2017 at 09:26:35PM +, Srivatsa, Anusha wrote:
> >+void i915_guc_load_error_log_capture(struct drm_i915_private *i915) {
> >+void *log, *buf;
> >+struct i915_vma *vma = i915->guc.log.vma;
> >+
> >+if (i915->gpu_error.guc_load_fail_log || !vma)
> >+return;
> >+
> >+/*
> >+ * the vma should be already pinned and mapped for log runtime
> >+ * management but let's play safe
> >+ */
> >+log = i915_gem_object_pin_map(vma->obj, I915_MAP_WC);
> >+if (IS_ERR(log)) {
> >+DRM_ERROR("Failed to pin guc_log vma\n");
> >+return;
> >+}
> >+
> >+buf = kzalloc(GUC_LOG_SIZE, GFP_KERNEL);
> >+if (buf) {
> >+memcpy(buf, log, GUC_LOG_SIZE);
> >+i915->gpu_error.guc_load_fail_log = buf;
> >+} else {
> >+DRM_ERROR("Failed to copy guc log\n");
> >+}
> >+
> >+i915_gem_object_unpin_map(vma->obj);

You are trading a swappable object for unswappable kernel memory. If you
want to have the guc log after guc is disabled, just keep the log object
around.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


Re: [Intel-gfx] [RFC] drm/i915/guc: capture GuC logs if FW fails to load

2017-05-04 Thread Srivatsa, Anusha


>-Original Message-
>From: Intel-gfx [mailto:intel-gfx-boun...@lists.freedesktop.org] On Behalf Of
>Daniele Ceraolo Spurio
>Sent: Thursday, May 4, 2017 11:52 AM
>To: intel-gfx@lists.freedesktop.org
>Subject: [Intel-gfx] [RFC] drm/i915/guc: capture GuC logs if FW fails to load
>
>We're currently deleting the GuC logs if the FW fails to load, but those are
>still useful to understand why the loading failed. Instead of deleting them,
>taking a snapshot allows us to access them after driver load is completed.

Hi Daniele, I like the idea. But just to confirm: we will still get the 
fetch and load status, like PENDING or FAIL, and the reason for the 
failure will be in debugfs. Correct?

Anusha

>Cc: Oscar Mateo <oscar.ma...@intel.com>
>Cc: Michal Wajdeczko <michal.wajdec...@intel.com>
>Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospu...@intel.com>
>---
> drivers/gpu/drm/i915/i915_debugfs.c   | 36 ---
> drivers/gpu/drm/i915/i915_drv.c   |  3 +++
> drivers/gpu/drm/i915/i915_drv.h   |  6 ++
> drivers/gpu/drm/i915/i915_gpu_error.c | 36 +++
> drivers/gpu/drm/i915/intel_guc_fwif.h | 14 +++---
>drivers/gpu/drm/i915/intel_guc_log.c  | 10 ++
> drivers/gpu/drm/i915/intel_uc.c   |  7 +--
> 7 files changed, 84 insertions(+), 28 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
>index 870c470..4ff20fc 100644
>--- a/drivers/gpu/drm/i915/i915_debugfs.c
>+++ b/drivers/gpu/drm/i915/i915_debugfs.c
>@@ -2543,26 +2543,32 @@ static int i915_guc_info(struct seq_file *m, void *data)
> static int i915_guc_log_dump(struct seq_file *m, void *data)
> {
>   struct drm_i915_private *dev_priv = node_to_i915(m->private);
>-  struct drm_i915_gem_object *obj;
>-  int i = 0, pg;
>-
>-  if (!dev_priv->guc.log.vma)
>+  u32 *log;
>+  int i = 0;
>+
>+  if (dev_priv->guc.log.vma) {
>+  log = i915_gem_object_pin_map(dev_priv->guc.log.vma->obj,
>+I915_MAP_WC);
>+  if (IS_ERR(log)) {
>+  DRM_ERROR("Failed to pin guc_log vma\n");
>+  return -ENOMEM;
>+  }
>+  } else if (dev_priv->gpu_error.guc_load_fail_log) {
>+  log = dev_priv->gpu_error.guc_load_fail_log;
>+  } else {
>   return 0;
>-
>-  obj = dev_priv->guc.log.vma->obj;
>-  for (pg = 0; pg < obj->base.size / PAGE_SIZE; pg++) {
>-  u32 *log = kmap_atomic(i915_gem_object_get_page(obj, pg));
>-
>-  for (i = 0; i < PAGE_SIZE / sizeof(u32); i += 4)
>-  seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n",
>- *(log + i), *(log + i + 1),
>- *(log + i + 2), *(log + i + 3));
>-
>-  kunmap_atomic(log);
>   }
>
>+  for (i = 0; i < GUC_LOG_SIZE / sizeof(u32); i += 4)
>+  seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n",
>+ *(log + i), *(log + i + 1),
>+ *(log + i + 2), *(log + i + 3));
>+
>   seq_putc(m, '\n');
>
>+  if (dev_priv->guc.log.vma)
>+  i915_gem_object_unpin_map(dev_priv->guc.log.vma->obj);
>+
>   return 0;
> }
>
>diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
>index 452c265..c7cb36c 100644
>--- a/drivers/gpu/drm/i915/i915_drv.c
>+++ b/drivers/gpu/drm/i915/i915_drv.c
>@@ -1354,6 +1354,9 @@ void i915_driver_unload(struct drm_device *dev)
>   cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
>   i915_reset_error_state(dev_priv);
>
>+  /* release GuC error log (if any) */
>+  i915_guc_load_error_log_free(dev_priv);
>+
>   /* Flush any outstanding unpin_work. */
>   drain_workqueue(dev_priv->wq);
>
>diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>index 4588b3e..761c663 100644
>--- a/drivers/gpu/drm/i915/i915_drv.h
>+++ b/drivers/gpu/drm/i915/i915_drv.h
>@@ -1555,6 +1555,9 @@ struct i915_gpu_error {
>   /* Protected by the above dev->gpu_error.lock. */
>   struct i915_gpu_state *first_error;
>
>+  /* Log snapshot if GuC errors during load */
>+  void *guc_load_fail_log;
>+
>   unsigned long missed_irq_rings;
>
>   /**
>@@ -3687,6 +3690,9 @@ static inline void i915_reset_error_state(struct drm_i915_private *i915)
>
> #endif
>
>+void i915_guc_load_error_log_capture(struct drm_i

[Intel-gfx] [RFC] drm/i915/guc: capture GuC logs if FW fails to load

2017-05-04 Thread Daniele Ceraolo Spurio
We're currently deleting the GuC logs if the FW fails to load, but those
are still useful to understand why the loading failed. Instead of
deleting them, taking a snapshot allows us to access them after driver
load is completed.

Cc: Oscar Mateo 
Cc: Michal Wajdeczko 
Signed-off-by: Daniele Ceraolo Spurio 
---
 drivers/gpu/drm/i915/i915_debugfs.c   | 36 ---
 drivers/gpu/drm/i915/i915_drv.c   |  3 +++
 drivers/gpu/drm/i915/i915_drv.h   |  6 ++
 drivers/gpu/drm/i915/i915_gpu_error.c | 36 +++
 drivers/gpu/drm/i915/intel_guc_fwif.h | 14 +++---
 drivers/gpu/drm/i915/intel_guc_log.c  | 10 ++
 drivers/gpu/drm/i915/intel_uc.c   |  7 +--
 7 files changed, 84 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 870c470..4ff20fc 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2543,26 +2543,32 @@ static int i915_guc_info(struct seq_file *m, void *data)
 static int i915_guc_log_dump(struct seq_file *m, void *data)
 {
struct drm_i915_private *dev_priv = node_to_i915(m->private);
-   struct drm_i915_gem_object *obj;
-   int i = 0, pg;
-
-   if (!dev_priv->guc.log.vma)
+   u32 *log;
+   int i = 0;
+
+   if (dev_priv->guc.log.vma) {
+   log = i915_gem_object_pin_map(dev_priv->guc.log.vma->obj,
+ I915_MAP_WC);
+   if (IS_ERR(log)) {
+   DRM_ERROR("Failed to pin guc_log vma\n");
+   return -ENOMEM;
+   }
+   } else if (dev_priv->gpu_error.guc_load_fail_log) {
+   log = dev_priv->gpu_error.guc_load_fail_log;
+   } else {
return 0;
-
-   obj = dev_priv->guc.log.vma->obj;
-   for (pg = 0; pg < obj->base.size / PAGE_SIZE; pg++) {
-   u32 *log = kmap_atomic(i915_gem_object_get_page(obj, pg));
-
-   for (i = 0; i < PAGE_SIZE / sizeof(u32); i += 4)
-   seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n",
-  *(log + i), *(log + i + 1),
-  *(log + i + 2), *(log + i + 3));
-
-   kunmap_atomic(log);
}
 
+   for (i = 0; i < GUC_LOG_SIZE / sizeof(u32); i += 4)
+   seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n",
+  *(log + i), *(log + i + 1),
+  *(log + i + 2), *(log + i + 3));
+
seq_putc(m, '\n');
 
+   if (dev_priv->guc.log.vma)
+   i915_gem_object_unpin_map(dev_priv->guc.log.vma->obj);
+
return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 452c265..c7cb36c 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1354,6 +1354,9 @@ void i915_driver_unload(struct drm_device *dev)
cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
i915_reset_error_state(dev_priv);
 
+   /* release GuC error log (if any) */
+   i915_guc_load_error_log_free(dev_priv);
+
/* Flush any outstanding unpin_work. */
drain_workqueue(dev_priv->wq);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4588b3e..761c663 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1555,6 +1555,9 @@ struct i915_gpu_error {
/* Protected by the above dev->gpu_error.lock. */
struct i915_gpu_state *first_error;
 
+   /* Log snapshot if GuC errors during load */
+   void *guc_load_fail_log;
+
unsigned long missed_irq_rings;
 
/**
@@ -3687,6 +3690,9 @@ static inline void i915_reset_error_state(struct drm_i915_private *i915)
 
 #endif
 
+void i915_guc_load_error_log_capture(struct drm_i915_private *i915);
+void i915_guc_load_error_log_free(struct drm_i915_private *i915);
+
 const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
 
 /* i915_cmd_parser.c */
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index ec526d9..44a873b 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1809,3 +1809,39 @@ void i915_reset_error_state(struct drm_i915_private *i915)
 
i915_gpu_state_put(error);
 }
+
+void i915_guc_load_error_log_capture(struct drm_i915_private *i915)
+{
+   void *log, *buf;
+   struct i915_vma *vma = i915->guc.log.vma;
+
+   if (i915->gpu_error.guc_load_fail_log || !vma)
+   return;
+
+   /*
+* the vma should be already pinned and mapped for log runtime
+* management but let's play safe
+*/
+   log = i915_gem_object_pin_map(vma->obj, I915_MAP_WC);
+   if (IS_ERR(log)) {
+