Chris Wilson <ch...@chris-wilson.co.uk> writes:

> If the HW fails to ack a change in forcewake status, the machine is as
> good as dead -- it may recover, but in reality it missed the mmio
> updates and is now in a very inconsistent state. If it happens, we can't
> trust the CI results (or at least the fails may be genuine but due to
> the HW being dead and not the actual test!) so reboot the machine (CI
> checks for a kernel taint in between each test and reboots if the
> machine is tainted).
>
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuopp...@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursu...@linux.intel.com>

Sounds and looks reasonable. Should we also taint if we have
unclaimed mmio after the init sequence?

Reviewed-by: Mika Kuoppala <mika.kuopp...@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/gt/intel_reset.c |  2 +-
>  drivers/gpu/drm/i915/i915_drv.h       | 11 +++++++++++
>  drivers/gpu/drm/i915/intel_uncore.c   |  8 ++++++--
>  3 files changed, 18 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 419b3415370b..464369bc55ad 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -1042,7 +1042,7 @@ void i915_reset(struct drm_i915_private *i915,
>        * rather than continue on into oblivion. For everyone else,
>        * the system should still plod along, but they have been warned!
>        */
> -     add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
> +     add_taint_for_CI(TAINT_WARN);
>  error:
>       __i915_gem_set_wedged(i915);
>       goto finish;
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 0a6ec61496f1..d0257808734c 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3375,4 +3375,15 @@ static inline u32 i915_scratch_offset(const struct drm_i915_private *i915)
>       return i915_ggtt_offset(i915->gt.scratch);
>  }
>  
> +static inline void add_taint_for_CI(unsigned int taint)
> +{
> +     /*
> +      * The system is "ok", just about surviving for the user, but
> +      * CI results are now unreliable as the HW is very suspect.
> +      * CI checks the taint state after every test and will reboot
> +      * the machine if the kernel is tainted.
> +      */
> +     add_taint(taint, LOCKDEP_STILL_OK);
> +}
> +
>  #endif
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index d1d51e1121e2..6ec1bc97b665 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -111,9 +111,11 @@ wait_ack_set(const struct intel_uncore_forcewake_domain *d,
>  static inline void
>  fw_domain_wait_ack_clear(const struct intel_uncore_forcewake_domain *d)
>  {
> -     if (wait_ack_clear(d, FORCEWAKE_KERNEL))
> +     if (wait_ack_clear(d, FORCEWAKE_KERNEL)) {
>               DRM_ERROR("%s: timed out waiting for forcewake ack to clear.\n",
>                         intel_uncore_forcewake_domain_to_str(d->id));
> +             add_taint_for_CI(TAINT_WARN); /* CI unreliable */
> +     }
>  }
>  
>  enum ack_type {
> @@ -186,9 +188,11 @@ fw_domain_get(const struct intel_uncore_forcewake_domain *d)
>  static inline void
>  fw_domain_wait_ack_set(const struct intel_uncore_forcewake_domain *d)
>  {
> -     if (wait_ack_set(d, FORCEWAKE_KERNEL))
> +     if (wait_ack_set(d, FORCEWAKE_KERNEL)) {
>               DRM_ERROR("%s: timed out waiting for forcewake ack request.\n",
>                         intel_uncore_forcewake_domain_to_str(d->id));
> +             add_taint_for_CI(TAINT_WARN); /* CI unreliable */
> +     }
>  }
>  
>  static inline void
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to