On Thu, Nov 26, 2015 at 11:51:09AM +0100, Daniel Vetter wrote:
> My apologies to Chris Wilson for being dense on this topic for
> essentially for years.
> 
> This patch doesn't do any big redesign of the overall reset flow, but
> instead just applies changes where it's needed to obey gem_eio. We can
> redesign later on, but for now I prefer to make the testcase happy
> with some minimally invasive changes:
> 
> - Ensure that set_domain_ioctl never returns -EIO. Tricky part there
>   is that we might race with the reset handler when doing the lockless
>   wait.
> 
> - Make sure debugfs/i915_wedged is actually useful to reset a wedged
>   gpu - atm it bails out with -EAGAIN for a terminally wedged gpu.
>   That's because reset_in_progress actually includes that terminal
>   state, which is super-confusing (and most callers got this wrong).
>   Fix this by changing the semantics of reset_in_progress to do what
>   it says on the tin (and fixup all other callers).
> 
>   Also make sure that reset_in_progress is cleared when we reach the
>   terminal "wedged" state. Without this we kept returning -EAGAIN in
>   some places forever.
> 
> - Second problem with debugfs/i915_wedge was that we never got out of
>   the terminal wedged state. Hence manually set the reset counter out
>   of that state before starting the hung gpu recovery.
> 
>   For safety also make sure that we are consintent with resetting the
>   gpu between i915_reset_and_wakeup and i915_handler_error by just
>   passing the boolean "wedged" around instead of trying to recompute
>   it wrongly. This isn't an issue for gem_eio with the debugfs fix,
>   but just general robustness of the code.
> 
> - Finally make sure that when gpu reset is disabled through the module
>   paramter the kernel doesn't generate dmesg noise that would upset
>   our CI/piglit. Simplest solution for that was to lift the i915.reset
>   check up into i915_reset().
> 
> With all these changes, plus the igt fixes I've posted, gem_eio is now
> happy on many snb.
> 
> v2: Don't upset lockdep with my set_domain_ioctl changes.

Blergh, forgotten to update the commit message:

v3: Don't special case set_domain_ioctl, instead curb all -EIO at the
source in wait_request, like in Chris' patch. That means anyone waiting
for a request will not notice the -EIO and fall over due to that. We
definitely want that for modeset code.

> Cc: Chris Wilson <[email protected]>
> Testcase: igt/gem_eio/*
> Signed-off-by: Daniel Vetter <[email protected]>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c |  4 ++++
>  drivers/gpu/drm/i915/i915_drv.c     |  6 ++++++
>  drivers/gpu/drm/i915/i915_drv.h     |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c     | 23 +++++++++++------------
>  drivers/gpu/drm/i915/i915_irq.c     | 11 ++++++-----
>  drivers/gpu/drm/i915/intel_uncore.c |  3 ---
>  6 files changed, 28 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
> b/drivers/gpu/drm/i915/i915_debugfs.c
> index 411a9c68b4ee..c4006c09ef1b 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4744,6 +4744,10 @@ i915_wedged_set(void *data, u64 val)
>       if (i915_reset_in_progress(&dev_priv->gpu_error))
>               return -EAGAIN;
>  
> +     /* Already wedged, force us out of this terminal state. */
> +     if (i915_terminally_wedged(&dev_priv->gpu_error))
> +             atomic_set(&dev_priv->gpu_error.reset_counter, 0);
> +
>       intel_runtime_pm_get(dev_priv);
>  
>       i915_handle_error(dev, val,
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index ec1e877c4a36..1f5386bb78f4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -909,6 +909,12 @@ int i915_reset(struct drm_device *dev)
>  
>       simulated = dev_priv->gpu_error.stop_rings != 0;
>  
> +     if (!i915.reset) {
> +             DRM_INFO("GPU reset disabled in module option, not 
> resetting\n");
> +             mutex_unlock(&dev->struct_mutex);
> +             return -ENODEV;
> +     }
> +
>       ret = intel_gpu_reset(dev);
>  
>       /* Also reset the gpu hangman. */
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index a47e0f4fab56..8876b4891d56 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2939,7 +2939,7 @@ int __must_check i915_gem_check_wedge(struct 
> i915_gpu_error *error,
>  static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
>  {
>       return unlikely(atomic_read(&error->reset_counter)
> -                     & (I915_RESET_IN_PROGRESS_FLAG | I915_WEDGED));
> +                     & I915_RESET_IN_PROGRESS_FLAG);
>  }
>  
>  static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index f10a5d57377e..64c63409b9d0 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -85,8 +85,7 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
>  {
>       int ret;
>  
> -#define EXIT_COND (!i915_reset_in_progress(error) || \
> -                i915_terminally_wedged(error))
> +#define EXIT_COND (!i915_reset_in_progress(error))
>       if (EXIT_COND)
>               return 0;
>  
> @@ -1113,16 +1112,16 @@ int
>  i915_gem_check_wedge(struct i915_gpu_error *error,
>                    bool interruptible)
>  {
> +     /* Recovery complete, but the reset failed ... */
> +     if (i915_terminally_wedged(error))
> +             return -EIO;
> +
>       if (i915_reset_in_progress(error)) {
>               /* Non-interruptible callers can't handle -EAGAIN, hence return
>                * -EIO unconditionally for these. */
>               if (!interruptible)
>                       return -EIO;
>  
> -             /* Recovery complete, but the reset failed ... */
> -             if (i915_terminally_wedged(error))
> -                     return -EIO;
> -
>               /*
>                * Check if GPU Reset is in progress - we need intel_ring_begin
>                * to work properly to reinit the hw state while the gpu is
> @@ -1239,11 +1238,7 @@ int __i915_wait_request(struct drm_i915_gem_request 
> *req,
>               /* We need to check whether any gpu reset happened in between
>                * the caller grabbing the seqno and now ... */
>               if (reset_counter != 
> atomic_read(&dev_priv->gpu_error.reset_counter)) {
> -                     /* ... but upgrade the -EAGAIN to an -EIO if the gpu
> -                      * is truely gone. */
> -                     ret = i915_gem_check_wedge(&dev_priv->gpu_error, 
> interruptible);
> -                     if (ret == 0)
> -                             ret = -EAGAIN;
> +                     ret = -EAGAIN;
>                       break;
>               }
>  
> @@ -1577,7 +1572,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void 
> *data,
>  
>       ret = i915_mutex_lock_interruptible(dev);
>       if (ret)
> -             return ret;
> +             goto out;
>  
>       obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
>       if (&obj->base == NULL) {
> @@ -1609,6 +1604,10 @@ unref:
>       drm_gem_object_unreference(&obj->base);
>  unlock:
>       mutex_unlock(&dev->struct_mutex);
> +out:
> +     /* ABI: set_domain_ioctl must not fail even when the gpu is wedged. */
> +     WARN_ON(ret == -EIO);
> +
>       return ret;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index c8ba94968aaf..dbbc6159584b 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2401,7 +2401,8 @@ static void i915_error_wake_up(struct drm_i915_private 
> *dev_priv,
>   * Fire an error uevent so userspace can see that a hang or error
>   * was detected.
>   */
> -static void i915_reset_and_wakeup(struct drm_device *dev)
> +static void i915_reset_and_wakeup(struct drm_device *dev,
> +                               bool wedged)
>  {
>       struct drm_i915_private *dev_priv = to_i915(dev);
>       struct i915_gpu_error *error = &dev_priv->gpu_error;
> @@ -2422,7 +2423,7 @@ static void i915_reset_and_wakeup(struct drm_device 
> *dev)
>        * the reset in-progress bit is only ever set by code outside of this
>        * work we don't need to worry about any other races.
>        */
> -     if (i915_reset_in_progress(error) && !i915_terminally_wedged(error)) {
> +     if (wedged) {
>               DRM_DEBUG_DRIVER("resetting chip\n");
>               kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE,
>                                  reset_event);
> @@ -2432,7 +2433,7 @@ static void i915_reset_and_wakeup(struct drm_device 
> *dev)
>                * reference held, for example because there is a pending GPU
>                * request that won't finish until the reset is done. This
>                * isn't the case at least when we get here by doing a
> -              * simulated reset via debugs, so get an RPM reference.
> +              * simulated reset via debugfs, so get an RPM reference.
>                */
>               intel_runtime_pm_get(dev_priv);
>  
> @@ -2467,7 +2468,7 @@ static void i915_reset_and_wakeup(struct drm_device 
> *dev)
>                       kobject_uevent_env(&dev->primary->kdev->kobj,
>                                          KOBJ_CHANGE, reset_done_event);
>               } else {
> -                     atomic_or(I915_WEDGED, &error->reset_counter);
> +                     atomic_set(&error->reset_counter, I915_WEDGED);
>               }
>  
>               /*
> @@ -2614,7 +2615,7 @@ void i915_handle_error(struct drm_device *dev, bool 
> wedged,
>               i915_error_wake_up(dev_priv, false);
>       }
>  
> -     i915_reset_and_wakeup(dev);
> +     i915_reset_and_wakeup(dev, wedged);
>  }
>  
>  /* Called from drm generic code, passed 'crtc' which
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c 
> b/drivers/gpu/drm/i915/intel_uncore.c
> index c2358ba78b30..4c5ae1154dd1 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1543,9 +1543,6 @@ not_ready:
>  
>  static int (*intel_get_gpu_reset(struct drm_device *dev))(struct drm_device 
> *)
>  {
> -     if (!i915.reset)
> -             return NULL;
> -
>       if (INTEL_INFO(dev)->gen >= 8)
>               return gen8_do_reset;
>       else if (INTEL_INFO(dev)->gen >= 6)
> -- 
> 2.1.0
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to