[Intel-gfx] [PATCH 15/43] drm/i915: add interface to simulate gpu hangs

2011-12-14 Thread Daniel Vetter
gpu reset is a very important piece of our infrastructure.
Unfortunately we only really test it by actually hanging the gpu,
which often has bad side-effects for the entire system. And the gpu
hang handling code is one of the rather complicated pieces of code we
have, consisting of
- hang detection
- error capture
- actual gpu reset
- reset of all the gem bookkeeping
- reinitialization of the entire gpu

This patch adds a debugfs interface to selectively stop rings by ceasing
to update the hw tail pointer, which will result in the gpu no longer
updating its head pointer and eventually in the hangcheck firing.
This way we can exercise the gpu hang code under controlled conditions
without a dying gpu taking down the entire system.

Patch motivated by me forgetting to properly reinitialize ppgtt after
a gpu reset.

Usage:

echo $((1 << $ringnum)) > i915_ring_stop # stops one ring

echo 0xffffffff > i915_ring_stop # stops all, future-proof version

then run whatever testload is desired. i915_ring_stop automatically
resets after a gpu hang is detected to avoid hanging the gpu too fast
and declaring it wedged.

v2: Incorporate feedback from Chris Wilson.

v3: Add the missing cleanup.

Signed-Off-by: Daniel Vetter daniel.vet...@ffwll.ch
Reviewed-by: Chris Wilson ch...@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_debugfs.c |   65 +++
 drivers/gpu/drm/i915/i915_drv.c |2 +
 drivers/gpu/drm/i915/i915_drv.h |2 +
 drivers/gpu/drm/i915/intel_ringbuffer.c |4 ++
 4 files changed, 73 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index db83552..67d7567 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1397,6 +1397,64 @@ static const struct file_operations i915_wedged_fops = {
 };
 
 static ssize_t
+i915_ring_stop_read(struct file *filp,
+   char __user *ubuf,
+   size_t max,
+   loff_t *ppos)
+{
+   struct drm_device *dev = filp-private_data;
+   drm_i915_private_t *dev_priv = dev-dev_private;
+   char buf[80];
+   int len;
+
+   len = snprintf(buf, sizeof(buf),
+  0x%08x\n, dev_priv-stop_rings);
+
+   if (len  sizeof(buf))
+   len = sizeof(buf);
+
+   return simple_read_from_buffer(ubuf, max, ppos, buf, len);
+}
+
+static ssize_t
+i915_ring_stop_write(struct file *filp,
+const char __user *ubuf,
+size_t cnt,
+loff_t *ppos)
+{
+   struct drm_device *dev = filp-private_data;
+   struct drm_i915_private *dev_priv = dev-dev_private;
+   char buf[20];
+   int val = 0;
+
+   if (cnt  0) {
+   if (cnt  sizeof(buf) - 1)
+   return -EINVAL;
+
+   if (copy_from_user(buf, ubuf, cnt))
+   return -EFAULT;
+   buf[cnt] = 0;
+
+   val = simple_strtoul(buf, NULL, 0);
+   }
+
+   DRM_DEBUG_DRIVER(Stopping rings 0x%08x\n, val);
+
+   mutex_lock(dev-struct_mutex);
+   dev_priv-stop_rings = val;
+   mutex_unlock(dev-struct_mutex);
+
+   return cnt;
+}
+
+static const struct file_operations i915_ring_stop_fops = {
+   .owner = THIS_MODULE,
+   .open = i915_debugfs_common_open,
+   .read = i915_ring_stop_read,
+   .write = i915_ring_stop_write,
+   .llseek = default_llseek,
+};
+static ssize_t
 i915_max_freq_read(struct file *filp,
   char __user *ubuf,
   size_t max,
@@ -1701,6 +1759,11 @@ int i915_debugfs_init(struct drm_minor *minor)
  i915_cache_sharing_fops);
if (ret)
return ret;
+   ret = i915_debugfs_create(minor-debugfs_root, minor,
+ i915_ring_stop,
+ i915_ring_stop_fops);
+   if (ret)
+   return ret;
 
return drm_debugfs_create_files(i915_debugfs_list,
I915_DEBUGFS_ENTRIES,
@@ -1719,6 +1782,8 @@ void i915_debugfs_cleanup(struct drm_minor *minor)
 1, minor);
drm_debugfs_remove_files((struct drm_info_list *) 
i915_cache_sharing_fops,
 1, minor);
+   drm_debugfs_remove_files((struct drm_info_list *) i915_ring_stop_fops,
+1, minor);
 }
 
 #endif /* CONFIG_DEBUG_FS */
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 4a2eb68..6dd219b 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -638,6 +638,8 @@ int i915_reset(struct drm_device *dev, u8 flags)
if (!mutex_trylock(dev-struct_mutex))
return -EBUSY;
 
+   dev_priv-stop_rings = 0;
+
i915_gem_reset(dev);
 
ret = -ENODEV;
diff --git a/drivers/gpu/drm/i915/i915_drv.h 

Re: [Intel-gfx] [PATCH 15/43] drm/i915: add interface to simulate gpu hangs

2011-12-14 Thread Eugeni Dodonov
On Wed, Dec 14, 2011 at 10:57, Daniel Vetter daniel.vet...@ffwll.ch wrote:

 gpu reset is a very important piece of our infrastructure.
 Unfortunately we only really it test by actually hanging the gpu,
 which often has bad side-effects for the entire system. And the gpu
 hang handling code is one of the rather complicated pieces of code we
 have, consisting of
 - hang detection
 - error capture
 - actual gpu reset
 - reset of all the gem bookkeeping
 - reinitialition of the entire gpu

 This patch adds a debugfs to selectively stopping rings by ceasing to
 update the hw tail pointer, which will result in the gpu no longer
 updating it's head pointer and eventually to the hangcheck firing.
 This way we can exercise the gpu hang code under controlled conditions
 without a dying gpu taking down the entire systems.

 Patch motivated by me forgetting to properly reinitialize ppgtt after
 a gpu reset.

 Usage:

 echo $((1 << $ringnum)) > i915_ring_stop # stops one ring

 echo 0xffffffff > i915_ring_stop # stops all, future-proof version

 then run whatever testload is desired. i915_ring_stop automatically
 resets after a gpu hang is detected to avoid hanging the gpu to fast
 and declaring it wedged.

 v2: Incorporate feedback from Chris Wilson.

 v3: Add the missing cleanup.

 Signed-Off-by: Daniel Vetter daniel.vet...@ffwll.ch
 Reviewed-by: Chris Wilson ch...@chris-wilson.co.uk



Reviewed-by: Eugeni Dodonov eugeni.dodo...@intel.com


-- 
Eugeni Dodonov
http://eugeni.dodonov.net/
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx