Chris Wilson <[email protected]> writes:

> When injecting rapid resets, we must be careful to at least wait for the
> previous reset to have taken effect and the engine restarted. If we
> perform a second reset before that has happened, we will notice that the
> engine hasn't recovered and declare it lost, wedging the device and
> failing. In practice, since we wait for each hanging batch to start
> before injecting the reset, this too-fast-reset condition can only be
> triggered when moving onto the next engine in the test, so we need only
> wait for the existing reset to complete before switching engines.
>
> v2: Wrap up the wait inside a safety net to bail out in case of angry hw.
>
> Signed-off-by: Chris Wilson <[email protected]>
> Cc: Mika Kuoppala <[email protected]>
> Cc: Michel Thierry <[email protected]>

Reviewed-by: Mika Kuoppala <[email protected]>

> ---
>  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 65 ++++++++++++++++++++++--
>  1 file changed, 62 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index d1f91a533afa..a4f4ff22389b 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -244,6 +244,57 @@ static u32 hws_seqno(const struct hang *h,
>       return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
>  }
>  
> +struct wedge_me {
> +     struct delayed_work work;
> +     struct drm_i915_private *i915;
> +     const void *symbol;
> +};
> +
> +static void wedge_me(struct work_struct *work)
> +{
> +     struct wedge_me *w = container_of(work, typeof(*w), work.work);
> +
> +     pr_err("%pS timed out, cancelling all further testing.\n",
> +            w->symbol);
> +     i915_gem_set_wedged(w->i915);
> +}
> +
> +static void __init_wedge(struct wedge_me *w,
> +                      struct drm_i915_private *i915,
> +                      long timeout,
> +                      const void *symbol)
> +{
> +     w->i915 = i915;
> +     w->symbol = symbol;
> +
> +     INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
> +     schedule_delayed_work(&w->work, timeout);
> +}
> +
> +static void __fini_wedge(struct wedge_me *w)
> +{
> +     cancel_delayed_work_sync(&w->work);
> +     destroy_delayed_work_on_stack(&w->work);
> +     w->i915 = NULL;
> +}
> +
> +#define wedge_on_timeout(W, DEV, TIMEOUT)                            \
> +     for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
> +          (W)->i915;                                                 \
> +          __fini_wedge((W)))
> +
> +static int flush_test(struct drm_i915_private *i915, unsigned int flags)
> +{
> +     struct wedge_me w;
> +
> +     cond_resched();
> +
> +     wedge_on_timeout(&w, i915, HZ)
> +             i915_gem_wait_for_idle(i915, flags);
> +
> +     return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
> +}
> +
>  static void hang_fini(struct hang *h)
>  {
>       *h->batch = MI_BATCH_BUFFER_END;
> @@ -255,7 +306,7 @@ static void hang_fini(struct hang *h)
>       i915_gem_object_unpin_map(h->hws);
>       i915_gem_object_put(h->hws);
>  
> -     i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
> +     flush_test(h->i915, I915_WAIT_LOCKED);
>  }
>  
>  static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
> @@ -487,7 +538,9 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
>               if (err)
>                       break;
>  
> -             cond_resched();
> +             err = flush_test(i915, 0);
> +             if (err)
> +                     break;
>       }
>  
>       if (i915_terminally_wedged(&i915->gpu_error))
> @@ -726,7 +779,9 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
>               if (err)
>                       break;
>  
> -             cond_resched();
> +             err = flush_test(i915, 0);
> +             if (err)
> +                     break;
>       }
>  
>       if (i915_terminally_wedged(&i915->gpu_error))
> @@ -952,6 +1007,10 @@ static int igt_reset_queue(void *arg)
>               i915_gem_chipset_flush(i915);
>  
>               i915_gem_request_put(prev);
> +
> +             err = flush_test(i915, I915_WAIT_LOCKED);
> +             if (err)
> +                     break;
>       }
>  
>  fini:
> -- 
> 2.15.1
_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to