Chris Wilson <[email protected]> writes:

> Following a reset, the context and page directory registers are lost.
> However, the queue of requests that we resubmit after the reset may
> depend upon them - the registers are restored from a context image, but
> that restore may be inhibited and may simply be absent from the request
> if it was in the middle of a sequence using the same context. If we
> prime the CCID/PD registers with the first request in the queue (even
> for the hung request), we prevent invalid memory access for the
> following requests (and continually hung engines).
>
> Fixes: 821ed7df6e2a ("drm/i915: Update reset path to fix incomplete requests")
> Signed-off-by: Chris Wilson <[email protected]>
> Cc: Tvrtko Ursulin <[email protected]>
> Cc: Mika Kuoppala <[email protected]>
> ---
>
> This could do with going to stable but requires a few odds and ends, such
> as dma_fence_set_error(). Oh well, fortunately it is not as bad it might
> seem since these registers are restored from the context - but that then
> requires a mesa context to reset the GPU state (as fortunately we called
> MI_SET_CONTEXT at the start of every batch!), but any other request in
> the meantime will likely hang again.
>
> (Also I left gen8/ringbuffer reset_hw as an exercise for the reader)
> -Chris
>
> ---
>  drivers/gpu/drm/i915/i915_gem.c         | 18 +++++++-----------
>  drivers/gpu/drm/i915/intel_lrc.c        |  6 +++++-
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 30 +++++++++++++++++++++++++++---
>  3 files changed, 39 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index fe32b1edbda2..7a16c859c875 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2735,21 +2735,17 @@ static void i915_gem_reset_engine(struct 
> intel_engine_cs *engine)
>               engine->irq_seqno_barrier(engine);
>  
>       request = i915_gem_find_active_request(engine);
> -     if (!request)
> -             return;
> -
> -     if (!i915_gem_reset_request(request))
> -             return;
> +     if (request && i915_gem_reset_request(request)) {
> +             DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 
> 0x%x\n",
> +                              engine->name, request->global_seqno);
>  
> -     DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
> -                      engine->name, request->global_seqno);
> +             /* If this context is now banned, skip all pending requests. */
> +             if (i915_gem_context_is_banned(request->ctx))
> +                     engine_skip_context(request);
> +     }
>  
>       /* Setup the CS to resume from the breadcrumb of the hung request */
>       engine->reset_hw(engine, request);
> -
> -     /* If this context is now banned, skip all of its pending requests. */
> -     if (i915_gem_context_is_banned(request->ctx))
> -             engine_skip_context(request);
>  }
>  
>  void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
> b/drivers/gpu/drm/i915/intel_lrc.c
> index 44a92ea464ba..f44dd42e46b5 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1324,7 +1324,10 @@ static void reset_common_ring(struct intel_engine_cs 
> *engine,
>                             struct drm_i915_gem_request *request)
>  {
>       struct execlist_port *port = engine->execlist_port;
> -     struct intel_context *ce = &request->ctx->engine[engine->id];
> +     struct intel_context *ce;
> +
> +     if (!request || request->fence.error != -EIO)
> +             return;
>  
>       /* We want a simple context + ring to execute the breadcrumb update.
>        * We cannot rely on the context being intact across the GPU hang,
> @@ -1333,6 +1336,7 @@ static void reset_common_ring(struct intel_engine_cs 
> *engine,
>        * future request will be after userspace has had the opportunity
>        * to recreate its own state.
>        */
> +     ce = &request->ctx->engine[engine->id];
>       execlists_init_reg_state(ce->lrc_reg_state,
>                                request->ctx, engine, ce->ring);
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
> b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index d32cbba25d98..b30c1dc3061c 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -599,10 +599,34 @@ static int init_ring_common(struct intel_engine_cs 
> *engine)
>  static void reset_ring_common(struct intel_engine_cs *engine,
>                             struct drm_i915_gem_request *request)
>  {
> -     struct intel_ring *ring = request->ring;
> +     if (request) {
> +             struct drm_i915_private *dev_priv = request->i915;
> +             struct intel_context *ce = &request->ctx->engine[engine->id];
> +             struct i915_hw_ppgtt *ppgtt;
> +
> +             if (ce->state) {
> +                     I915_WRITE(CCID,
> +                                i915_ggtt_offset(ce->state) |
> +                                BIT(3) | BIT(2) | CCID_EN);

Can we assume here that the hw reset have made the ringbuffer
empty (as hw head == tail == 0)?

BIT8 should be set.
Bits 3|2 shuold be EXTENDED_STATE_SAVE|RESTORE_ENABLE.

Does the BIT2 affect the way the context will be loaded?

-Mika

> +             }
>  
> -     ring->head = request->postfix;
> -     ring->last_retired_head = -1;
> +             ppgtt = request->ctx->ppgtt ?: engine->i915->mm.aliasing_ppgtt;
> +             if (ppgtt) {
> +                     u32 pd_offset = ppgtt->pd.base.ggtt_offset << 10;
> +
> +                     I915_WRITE(RING_PP_DIR_DCLV(engine), PP_DIR_DCLV_2G);
> +                     I915_WRITE(RING_PP_DIR_BASE(engine), pd_offset);
> +             }
> +
> +             if (request->fence.error == -EIO) {
> +                     struct intel_ring *ring = request->ring;
> +
> +                     ring->head = request->postfix;
> +                     ring->last_retired_head = -1;
> +             }
> +     } else {
> +             engine->legacy_active_context = NULL;
> +     }
>  }
>  
>  static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
> -- 
> 2.11.0
_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to