Chris Wilson <[email protected]> writes:

> I915_RESET_IN_PROGRESS is being used for both signaling the requirement
> to i915_mutex_lock_interruptible() to avoid taking the struct_mutex and
> to instruct a waiter (already holding the struct_mutex) to perform the
> reset. To allow for a little more coordination, split these two meaning
> into a couple of distinct flags. I915_RESET_BACKOFF tells
> i915_mutex_lock_interruptible() not to acquire the mutex and
> I915_RESET_HANDOFF tells the waiter to call i915_reset().
>
> Signed-off-by: Chris Wilson <[email protected]>

+ Acked-by: Michel Thierry <[email protected]>

from earlier version.
-Mika


> ---
>  drivers/gpu/drm/i915/i915_debugfs.c              | 16 +++++-----
>  drivers/gpu/drm/i915/i915_drv.c                  |  7 +++--
>  drivers/gpu/drm/i915/i915_drv.h                  | 40 
> +++++++++++++++++++++---
>  drivers/gpu/drm/i915/i915_gem.c                  |  5 +--
>  drivers/gpu/drm/i915/i915_gem_request.c          |  2 +-
>  drivers/gpu/drm/i915/i915_irq.c                  | 38 ++++------------------
>  drivers/gpu/drm/i915/intel_display.c             |  4 +--
>  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 18 +++++++----
>  8 files changed, 71 insertions(+), 59 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
> b/drivers/gpu/drm/i915/i915_debugfs.c
> index 9db6b041a799..5fdf8c137235 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1305,16 +1305,18 @@ static int i915_hangcheck_info(struct seq_file *m, 
> void *unused)
>       enum intel_engine_id id;
>  
>       if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
> -             seq_printf(m, "Wedged\n");
> -     if (test_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags))
> -             seq_printf(m, "Reset in progress\n");
> +             seq_puts(m, "Wedged\n");
> +     if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
> +             seq_puts(m, "Reset in progress: struct_mutex backoff\n");
> +     if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
> +             seq_puts(m, "Reset in progress: reset handoff to waiter\n");
>       if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
> -             seq_printf(m, "Waiter holding struct mutex\n");
> +             seq_puts(m, "Waiter holding struct mutex\n");
>       if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
> -             seq_printf(m, "struct_mutex blocked for reset\n");
> +             seq_puts(m, "struct_mutex blocked for reset\n");
>  
>       if (!i915.enable_hangcheck) {
> -             seq_printf(m, "Hangcheck disabled\n");
> +             seq_puts(m, "Hangcheck disabled\n");
>               return 0;
>       }
>  
> @@ -4121,7 +4123,7 @@ i915_wedged_set(void *data, u64 val)
>        * while it is writing to 'i915_wedged'
>        */
>  
> -     if (i915_reset_in_progress(&dev_priv->gpu_error))
> +     if (i915_reset_backoff(&dev_priv->gpu_error))
>               return -EAGAIN;
>  
>       i915_handle_error(dev_priv, val,
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 9164167cd147..be3c81221d11 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1815,8 +1815,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
>       int ret;
>  
>       lockdep_assert_held(&dev_priv->drm.struct_mutex);
> +     GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
>  
> -     if (!test_and_clear_bit(I915_RESET_IN_PROGRESS, &error->flags))
> +     if (!test_bit(I915_RESET_HANDOFF, &error->flags))
>               return;
>  
>       /* Clear any previous failed attempts at recovery. Time to try again. */
> @@ -1869,7 +1870,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
>  wakeup:
>       i915_gem_reset_finish(dev_priv);
>       enable_irq(dev_priv->drm.irq);
> -     wake_up_bit(&error->flags, I915_RESET_IN_PROGRESS);
> +
> +     clear_bit(I915_RESET_HANDOFF, &error->flags);
> +     wake_up_bit(&error->flags, I915_RESET_HANDOFF);
>       return;
>  
>  error:
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5156bcc59dea..0b02a6d710e1 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1595,8 +1595,33 @@ struct i915_gpu_error {
>        */
>       unsigned long reset_count;
>  
> +     /**
> +      * flags: Control various stages of the GPU reset
> +      *
> +      * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
> +      * other users acquiring the struct_mutex. To do this we set the
> +      * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
> +      * and then check for that bit before acquiring the struct_mutex (in
> +      * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
> +      * secondary role in preventing two concurrent global reset attempts.
> +      *
> +      * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
> +      * struct_mutex. We try to acquire the struct_mutex in the reset worker,
> +      * but it may be held by some long running waiter (that we cannot
> +      * interrupt without causing trouble). Once we are ready to do the GPU
> +      * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
> +      * they already hold the struct_mutex and want to participate they can
> +      * inspect the bit and do the reset directly, otherwise the worker
> +      * waits for the struct_mutex.
> +      *
> +      * #I915_WEDGED - If reset fails and we can no longer use the GPU,
> +      * we set the #I915_WEDGED bit. Prior to command submission, e.g.
> +      * i915_gem_request_alloc(), this bit is checked and the sequence
> +      * aborted (with -EIO reported to userspace) if set.
> +      */
>       unsigned long flags;
> -#define I915_RESET_IN_PROGRESS       0
> +#define I915_RESET_BACKOFF   0
> +#define I915_RESET_HANDOFF   1
>  #define I915_WEDGED          (BITS_PER_LONG - 1)
>  
>       /**
> @@ -3387,9 +3412,14 @@ i915_gem_find_active_request(struct intel_engine_cs 
> *engine);
>  
>  void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
>  
> -static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
> +static inline bool i915_reset_backoff(struct i915_gpu_error *error)
> +{
> +     return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
> +}
> +
> +static inline bool i915_reset_handoff(struct i915_gpu_error *error)
>  {
> -     return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags));
> +     return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
>  }
>  
>  static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
> @@ -3397,9 +3427,9 @@ static inline bool i915_terminally_wedged(struct 
> i915_gpu_error *error)
>       return unlikely(test_bit(I915_WEDGED, &error->flags));
>  }
>  
> -static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error 
> *error)
> +static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
>  {
> -     return i915_reset_in_progress(error) | i915_terminally_wedged(error);
> +     return i915_reset_backoff(error) | i915_terminally_wedged(error);
>  }
>  
>  static inline u32 i915_reset_count(struct i915_gpu_error *error)
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d87983ba536f..ecd1b038318a 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -103,16 +103,13 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
>  
>       might_sleep();
>  
> -     if (!i915_reset_in_progress(error))
> -             return 0;
> -
>       /*
>        * Only wait 10 seconds for the gpu reset to complete to avoid hanging
>        * userspace. If it takes that long something really bad is going on and
>        * we should simply try to bail out and fail as gracefully as possible.
>        */
>       ret = wait_event_interruptible_timeout(error->reset_queue,
> -                                            !i915_reset_in_progress(error),
> +                                            !i915_reset_backoff(error),
>                                              I915_RESET_TIMEOUT);
>       if (ret == 0) {
>               DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c 
> b/drivers/gpu/drm/i915/i915_gem_request.c
> index 1e1d9f2072cd..0e8d1010cecb 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -1012,7 +1012,7 @@ bool __i915_spin_request(const struct 
> drm_i915_gem_request *req,
>  
>  static bool __i915_wait_request_check_and_reset(struct drm_i915_gem_request 
> *request)
>  {
> -     if (likely(!i915_reset_in_progress(&request->i915->gpu_error)))
> +     if (likely(!i915_reset_handoff(&request->i915->gpu_error)))
>               return false;
>  
>       __set_current_state(TASK_RUNNING);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index e646c4eba65d..495e6a79cacf 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2631,22 +2631,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
>       return ret;
>  }
>  
> -static void i915_error_wake_up(struct drm_i915_private *dev_priv)
> -{
> -     /*
> -      * Notify all waiters for GPU completion events that reset state has
> -      * been changed, and that they need to restart their wait after
> -      * checking for potential errors (and bail out to drop locks if there is
> -      * a gpu reset pending so that i915_error_work_func can acquire them).
> -      */
> -
> -     /* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
> -     wake_up_all(&dev_priv->gpu_error.wait_queue);
> -
> -     /* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
> -     wake_up_all(&dev_priv->pending_flip_queue);
> -}
> -
>  /**
>   * i915_reset_and_wakeup - do process context error handling work
>   * @dev_priv: i915 device private
> @@ -2676,6 +2660,9 @@ static void i915_reset_and_wakeup(struct 
> drm_i915_private *dev_priv)
>       intel_runtime_pm_get(dev_priv);
>       intel_prepare_reset(dev_priv);
>  
> +     set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
> +     wake_up_all(&dev_priv->gpu_error.wait_queue);
> +
>       do {
>               /*
>                * All state reset _must_ be completed before we update the
> @@ -2690,7 +2677,7 @@ static void i915_reset_and_wakeup(struct 
> drm_i915_private *dev_priv)
>  
>               /* We need to wait for anyone holding the lock to wakeup */
>       } while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
> -                                  I915_RESET_IN_PROGRESS,
> +                                  I915_RESET_HANDOFF,
>                                    TASK_UNINTERRUPTIBLE,
>                                    HZ));
>  
> @@ -2705,6 +2692,7 @@ static void i915_reset_and_wakeup(struct 
> drm_i915_private *dev_priv)
>        * Note: The wake_up also serves as a memory barrier so that
>        * waiters see the updated value of the dev_priv->gpu_error.
>        */
> +     clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
>       wake_up_all(&dev_priv->gpu_error.reset_queue);
>  }
>  
> @@ -2788,24 +2776,10 @@ void i915_handle_error(struct drm_i915_private 
> *dev_priv,
>       if (!engine_mask)
>               return;
>  
> -     if (test_and_set_bit(I915_RESET_IN_PROGRESS,
> +     if (test_and_set_bit(I915_RESET_BACKOFF,
>                            &dev_priv->gpu_error.flags))
>               return;
>  
> -     /*
> -      * Wakeup waiting processes so that the reset function
> -      * i915_reset_and_wakeup doesn't deadlock trying to grab
> -      * various locks. By bumping the reset counter first, the woken
> -      * processes will see a reset in progress and back off,
> -      * releasing their locks and then wait for the reset completion.
> -      * We must do this for _all_ gpu waiters that might hold locks
> -      * that the reset work needs to acquire.
> -      *
> -      * Note: The wake_up also provides a memory barrier to ensure that the
> -      * waiters see the updated value of the reset flags.
> -      */
> -     i915_error_wake_up(dev_priv);
> -
>       i915_reset_and_wakeup(dev_priv);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/intel_display.c 
> b/drivers/gpu/drm/i915/intel_display.c
> index 4b73513b46c1..5959c9b6dc97 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -3639,7 +3639,7 @@ static bool abort_flip_on_reset(struct intel_crtc *crtc)
>  {
>       struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error;
>  
> -     if (i915_reset_in_progress(error))
> +     if (i915_reset_backoff(error))
>               return true;
>  
>       if (crtc->reset_count != i915_reset_count(error))
> @@ -10595,7 +10595,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
>               goto cleanup;
>  
>       intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error);
> -     if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) {
> +     if (i915_reset_backoff_or_wedged(&dev_priv->gpu_error)) {
>               ret = -EIO;
>               goto unlock;
>       }
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c 
> b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index d4acee6730e9..6ec7c731a267 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -301,7 +301,8 @@ static int igt_global_reset(void *arg)
>  
>       /* Check that we can issue a global GPU reset */
>  
> -     set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
> +     set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> +     set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
>  
>       mutex_lock(&i915->drm.struct_mutex);
>       reset_count = i915_reset_count(&i915->gpu_error);
> @@ -314,7 +315,8 @@ static int igt_global_reset(void *arg)
>       }
>       mutex_unlock(&i915->drm.struct_mutex);
>  
> -     GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
> +     GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
> +     clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>       if (i915_terminally_wedged(&i915->gpu_error))
>               err = -EIO;
>  
> @@ -330,7 +332,7 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
>  
>       reset_count = i915_reset_count(&rq->i915->gpu_error);
>  
> -     set_bit(I915_RESET_IN_PROGRESS, &rq->i915->gpu_error.flags);
> +     set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
>       wake_up_all(&rq->i915->gpu_error.wait_queue);
>  
>       return reset_count;
> @@ -357,7 +359,7 @@ static int igt_wait_reset(void *arg)
>  
>       /* Check that we detect a stuck waiter and issue a reset */
>  
> -     set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
> +     set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>  
>       mutex_lock(&i915->drm.struct_mutex);
>       err = hang_init(&h, i915);
> @@ -388,8 +390,8 @@ static int igt_wait_reset(void *arg)
>               err = timeout;
>               goto out_rq;
>       }
> -     GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
>  
> +     GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
>       if (i915_reset_count(&i915->gpu_error) == reset_count) {
>               pr_err("No GPU reset recorded!\n");
>               err = -EINVAL;
> @@ -402,6 +404,7 @@ static int igt_wait_reset(void *arg)
>       hang_fini(&h);
>  unlock:
>       mutex_unlock(&i915->drm.struct_mutex);
> +     clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>  
>       if (i915_terminally_wedged(&i915->gpu_error))
>               return -EIO;
> @@ -422,6 +425,7 @@ static int igt_reset_queue(void *arg)
>       if (!igt_can_mi_store_dword_imm(i915))
>               return 0;
>  
> +     set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>       mutex_lock(&i915->drm.struct_mutex);
>       err = hang_init(&h, i915);
>       if (err)
> @@ -470,8 +474,9 @@ static int igt_reset_queue(void *arg)
>  
>                       i915_reset(i915);
>  
> -                     GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS,
> +                     GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
>                                           &i915->gpu_error.flags));
> +
>                       if (prev->fence.error != -EIO) {
>                               pr_err("GPU reset not recorded on hanging 
> request [fence.error=%d]!\n",
>                                      prev->fence.error);
> @@ -514,6 +519,7 @@ static int igt_reset_queue(void *arg)
>       hang_fini(&h);
>  unlock:
>       mutex_unlock(&i915->drm.struct_mutex);
> +     clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>  
>       if (i915_terminally_wedged(&i915->gpu_error))
>               return -EIO;
> -- 
> 2.11.0
>
> _______________________________________________
> Intel-gfx mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to