On Fri, Oct 09, 2015 at 01:21:45PM +0100, Chris Wilson wrote:
> The error state is purposefully racy as we expect it to be called at any
> time and so have avoided any locking whilst capturing the crash dump.
> However, with multi-engine GPUs and multiple CPUs, those races can
> manifest into OOPSes as we attempt to chase dangling pointers freed on
> other CPUs. Under discussion are lots of ways to slow down normal
> operation in order to protect the post-mortem error capture, but what if
> we take the opposite approach and freeze the machine whilst the error
> capture runs (note the GPU may still be running, but as long as we don't
> process any of the results the driver's bookkeeping will be static).
> 
> Signed-off-by: Chris Wilson <[email protected]>

One risk I see is that the list walking might still go off the rails when
we stop the machine right in the middle of a list_move. In that case we might
start scanning the active list (of objects) on one engine, midway through
end up on another engine's list, and so never see the original list_head
again, looping forever. No idea yet what to do about that.

Intriguing approach nevertheless.
-Daniel

> ---
>  drivers/gpu/drm/i915/Kconfig          |  1 +
>  drivers/gpu/drm/i915/i915_drv.h       |  2 ++
>  drivers/gpu/drm/i915/i915_gpu_error.c | 31 +++++++++++++++++++++----------
>  3 files changed, 24 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
> index 99e819767204..63df28910c8f 100644
> --- a/drivers/gpu/drm/i915/Kconfig
> +++ b/drivers/gpu/drm/i915/Kconfig
> @@ -7,6 +7,7 @@ config DRM_I915
>       select AGP_INTEL if AGP
>       select INTERVAL_TREE
>       select ZLIB_DEFLATE
> +     select STOP_MACHINE
>       # we need shmfs for the swappable backing store, and in particular
>       # the shmem_readpage() which depends upon tmpfs
>       select SHMEM
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 9d16fc1189d6..14a882fe486c 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -488,6 +488,8 @@ struct drm_i915_error_state {
>       struct kref ref;
>       struct timeval time;
>  
> +     struct drm_i915_private *i915;
> +
>       char error_msg[128];
>       int iommu;
>       u32 reset_count;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
> b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 64bdffcffb50..29cbec67bcfc 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -29,6 +29,7 @@
>  
>  #include <generated/utsrelease.h>
>  #include <linux/zlib.h>
> +#include <linux/stop_machine.h>
>  #include "i915_drv.h"
>  
>  static const char *yesno(int v)
> @@ -1352,6 +1353,24 @@ static void i915_capture_gen_state(struct 
> drm_i915_private *dev_priv,
>       error->suspend_count = dev_priv->suspend_count;
>  }
>  
> +static int capture(void *data)
> +{
> +     struct drm_i915_error_state *error = data;
> +
> +     i915_capture_gen_state(error->i915, error);
> +     i915_capture_reg_state(error->i915, error);
> +     i915_gem_capture_buffers(error->i915, error);
> +     i915_gem_record_fences(error->i915->dev, error);
> +     i915_gem_record_rings(error->i915->dev, error);
> +
> +     do_gettimeofday(&error->time);
> +
> +     error->overlay = intel_overlay_capture_error_state(error->i915->dev);
> +     error->display = intel_display_capture_error_state(error->i915->dev);
> +
> +     return 0;
> +}
> +
>  /**
>   * i915_capture_error_state - capture an error record for later analysis
>   * @dev: drm device
> @@ -1377,17 +1396,9 @@ void i915_capture_error_state(struct drm_device *dev, 
> bool wedged,
>       }
>  
>       kref_init(&error->ref);
> +     error->i915 = dev_priv;
>  
> -     i915_capture_gen_state(dev_priv, error);
> -     i915_capture_reg_state(dev_priv, error);
> -     i915_gem_capture_buffers(dev_priv, error);
> -     i915_gem_record_fences(dev, error);
> -     i915_gem_record_rings(dev, error);
> -
> -     do_gettimeofday(&error->time);
> -
> -     error->overlay = intel_overlay_capture_error_state(dev);
> -     error->display = intel_display_capture_error_state(dev);
> +     stop_machine(capture, error, NULL);
>  
>       i915_error_capture_msg(dev, error, wedged, error_msg);
>       DRM_INFO("%s\n", error->error_msg);
> -- 
> 2.6.1
> 
> _______________________________________________
> Intel-gfx mailing list
> [email protected]
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to