Re: [Intel-gfx] [PATCH 15/69] drm/i915/gt: Track all timelines created using the HWSP

2020-12-15 Thread Chris Wilson
Quoting Mika Kuoppala (2020-12-15 17:09:39)
> Chris Wilson  writes:
> 
> > We assume that the contents of the HWSP are lost across suspend, and so
> > upon resume we must restore critical values such as the timeline seqno.
> > Keep track of every timeline allocated that uses the HWSP as its storage
> > and so we can then reset all seqno values by walking that list.
> >
> > Signed-off-by: Chris Wilson 
> > ---
> >  drivers/gpu/drm/i915/gt/intel_engine_cs.c |  9 -
> >  drivers/gpu/drm/i915/gt/intel_engine_pm.c |  6 
> >  drivers/gpu/drm/i915/gt/intel_engine_types.h  |  1 +
> >  .../drm/i915/gt/intel_execlists_submission.c  | 11 --
> >  .../gpu/drm/i915/gt/intel_ring_submission.c   | 35 +++
> >  drivers/gpu/drm/i915/gt/intel_timeline.h  | 13 +--
> >  .../gpu/drm/i915/gt/intel_timeline_types.h|  2 ++
> >  7 files changed, 71 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
> > b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > index 71bd052628f4..6c08e74edcae 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> > @@ -648,6 +648,8 @@ static int init_status_page(struct intel_engine_cs 
> > *engine)
> >   void *vaddr;
> >   int ret;
> >  
> > + INIT_LIST_HEAD(>status_page.timelines);
> > +
> >   /*
> >* Though the HWS register does support 36bit addresses, historically
> >* we have had hangs and corruption reported due to wild writes if
> > @@ -936,6 +938,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs 
> > *engine)
> >   fput(engine->default_state);
> >  
> >   if (engine->kernel_context) {
> > + list_del(>kernel_context->timeline->engine_link);
> >   intel_context_unpin(engine->kernel_context);
> >   intel_context_put(engine->kernel_context);
> >   }
> > @@ -1281,8 +1284,12 @@ void intel_engines_reset_default_submission(struct 
> > intel_gt *gt)
> >   struct intel_engine_cs *engine;
> >   enum intel_engine_id id;
> >  
> > - for_each_engine(engine, gt, id)
> > + for_each_engine(engine, gt, id) {
> > + if (engine->sanitize)
> > + engine->sanitize(engine);
> > +
> >   engine->set_default_submission(engine);
> > + }
> >  }
> >  
> >  bool intel_engine_can_store_dword(struct intel_engine_cs *engine)
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c 
> > b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > index 99574378047f..1e5bad0b9a82 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > @@ -60,6 +60,12 @@ static int __engine_unpark(struct intel_wakeref *wf)
> >  
> >   /* Scrub the context image after our loss of control */
> >   ce->ops->reset(ce);
> > +
> > + CE_TRACE(ce, "reset { seqno:%x, *hwsp:%x, ring:%x }\n",
> > +  ce->timeline->seqno,
> > +  READ_ONCE(*ce->timeline->hwsp_seqno),
> > +  ce->ring->emit);
> > + GEM_BUG_ON(ce->timeline->seqno != *ce->timeline->hwsp_seqno);
> 
> Compiler should be satified but could still have been READ_ONCE,
> for the reader and for the fine bug on which might get delivered to console. 

Yup, I hesitated as it meant a newline :)

> But main thing is that now coherency is enforced from the get go.

> > +static void xcs_sanitize(struct intel_engine_cs *engine)
> > +{
> > + /*
> > +  * Poison residual state on resume, in case the suspend didn't!
> > +  *
> > +  * We have to assume that across suspend/resume (or other loss
> > +  * of control) that the contents of our pinned buffers has been
> > +  * lost, replaced by garbage. Since this doesn't always happen,
> > +  * let's poison such state so that we more quickly spot when
> > +  * we falsely assume it has been preserved.
> > +  */
> > + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> > + memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
> > +
> > + /*
> > +  * The kernel_context HWSP is stored in the status_page. As above,
> > +  * that may be lost on resume/initialisation, and so we need to
> > +  * reset the value in the HWSP.
> > +  */
> > + sanitize_hwsp(engine);
> > +
> > + /* And scrub the dirty cachelines for the HWSP */
> > + clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
> 
> The flush could be part of the actual writing of the seqno with
> that range. But then you would need to track the debug so. Better
> to make sure to transfer everything to be visible.

Yes. It's also just part of the general 'sanitize' paranoia; we scrub
everything until we are convinced that we only see our own bugs
reflected back at us.
-Chris
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org

Re: [Intel-gfx] [PATCH 15/69] drm/i915/gt: Track all timelines created using the HWSP

2020-12-15 Thread Mika Kuoppala
Chris Wilson  writes:

> We assume that the contents of the HWSP are lost across suspend, and so
> upon resume we must restore critical values such as the timeline seqno.
> Keep track of every timeline allocated that uses the HWSP as its storage
> and so we can then reset all seqno values by walking that list.
>
> Signed-off-by: Chris Wilson 
> ---
>  drivers/gpu/drm/i915/gt/intel_engine_cs.c |  9 -
>  drivers/gpu/drm/i915/gt/intel_engine_pm.c |  6 
>  drivers/gpu/drm/i915/gt/intel_engine_types.h  |  1 +
>  .../drm/i915/gt/intel_execlists_submission.c  | 11 --
>  .../gpu/drm/i915/gt/intel_ring_submission.c   | 35 +++
>  drivers/gpu/drm/i915/gt/intel_timeline.h  | 13 +--
>  .../gpu/drm/i915/gt/intel_timeline_types.h|  2 ++
>  7 files changed, 71 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
> b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 71bd052628f4..6c08e74edcae 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -648,6 +648,8 @@ static int init_status_page(struct intel_engine_cs 
> *engine)
>   void *vaddr;
>   int ret;
>  
> + INIT_LIST_HEAD(>status_page.timelines);
> +
>   /*
>* Though the HWS register does support 36bit addresses, historically
>* we have had hangs and corruption reported due to wild writes if
> @@ -936,6 +938,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs 
> *engine)
>   fput(engine->default_state);
>  
>   if (engine->kernel_context) {
> + list_del(>kernel_context->timeline->engine_link);
>   intel_context_unpin(engine->kernel_context);
>   intel_context_put(engine->kernel_context);
>   }
> @@ -1281,8 +1284,12 @@ void intel_engines_reset_default_submission(struct 
> intel_gt *gt)
>   struct intel_engine_cs *engine;
>   enum intel_engine_id id;
>  
> - for_each_engine(engine, gt, id)
> + for_each_engine(engine, gt, id) {
> + if (engine->sanitize)
> + engine->sanitize(engine);
> +
>   engine->set_default_submission(engine);
> + }
>  }
>  
>  bool intel_engine_can_store_dword(struct intel_engine_cs *engine)
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c 
> b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> index 99574378047f..1e5bad0b9a82 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> @@ -60,6 +60,12 @@ static int __engine_unpark(struct intel_wakeref *wf)
>  
>   /* Scrub the context image after our loss of control */
>   ce->ops->reset(ce);
> +
> + CE_TRACE(ce, "reset { seqno:%x, *hwsp:%x, ring:%x }\n",
> +  ce->timeline->seqno,
> +  READ_ONCE(*ce->timeline->hwsp_seqno),
> +  ce->ring->emit);
> + GEM_BUG_ON(ce->timeline->seqno != *ce->timeline->hwsp_seqno);

Compiler should be satified but could still have been READ_ONCE,
for the reader and for the fine bug on which might get delivered to console. 

But main thing is that now coherency is enforced from the get go.

>   }
>  
>   if (engine->unpark)
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
> b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index e71eef157231..c28f4e190fe6 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -68,6 +68,7 @@ typedef u8 intel_engine_mask_t;
>  #define ALL_ENGINES ((intel_engine_mask_t)~0ul)
>  
>  struct intel_hw_status_page {
> + struct list_head timelines;
>   struct i915_vma *vma;
>   u32 *addr;
>  };
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
> b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 9f5efff08785..c5b013cc10b3 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -3508,7 +3508,6 @@ static int execlists_context_alloc(struct intel_context 
> *ce)
>  
>  static void execlists_context_reset(struct intel_context *ce)
>  {
> - CE_TRACE(ce, "reset\n");
>   GEM_BUG_ON(!intel_context_is_pinned(ce));
>  
>   intel_ring_reset(ce->ring, ce->ring->emit);
> @@ -3985,6 +3984,14 @@ static void reset_csb_pointers(struct intel_engine_cs 
> *engine)
>   GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
>  }
>  
> +static void sanitize_hwsp(struct intel_engine_cs *engine)
> +{
> + struct intel_timeline *tl;
> +
> + list_for_each_entry(tl, >status_page.timelines, engine_link)
> + intel_timeline_reset_seqno(tl);
> +}
> +
>  static void execlists_sanitize(struct intel_engine_cs *engine)
>  {
>   GEM_BUG_ON(execlists_active(>execlists));
> @@ -4008,7 +4015,7 @@ static void execlists_sanitize(struct intel_engine_cs 
> *engine)
>* that may be lost on 

[Intel-gfx] [PATCH 15/69] drm/i915/gt: Track all timelines created using the HWSP

2020-12-14 Thread Chris Wilson
We assume that the contents of the HWSP are lost across suspend, and so
upon resume we must restore critical values such as the timeline seqno.
Keep track of every timeline allocated that uses the HWSP as its storage
and so we can then reset all seqno values by walking that list.

Signed-off-by: Chris Wilson 
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |  9 -
 drivers/gpu/drm/i915/gt/intel_engine_pm.c |  6 
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  1 +
 .../drm/i915/gt/intel_execlists_submission.c  | 11 --
 .../gpu/drm/i915/gt/intel_ring_submission.c   | 35 +++
 drivers/gpu/drm/i915/gt/intel_timeline.h  | 13 +--
 .../gpu/drm/i915/gt/intel_timeline_types.h|  2 ++
 7 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 71bd052628f4..6c08e74edcae 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -648,6 +648,8 @@ static int init_status_page(struct intel_engine_cs *engine)
void *vaddr;
int ret;
 
+   INIT_LIST_HEAD(>status_page.timelines);
+
/*
 * Though the HWS register does support 36bit addresses, historically
 * we have had hangs and corruption reported due to wild writes if
@@ -936,6 +938,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs 
*engine)
fput(engine->default_state);
 
if (engine->kernel_context) {
+   list_del(>kernel_context->timeline->engine_link);
intel_context_unpin(engine->kernel_context);
intel_context_put(engine->kernel_context);
}
@@ -1281,8 +1284,12 @@ void intel_engines_reset_default_submission(struct 
intel_gt *gt)
struct intel_engine_cs *engine;
enum intel_engine_id id;
 
-   for_each_engine(engine, gt, id)
+   for_each_engine(engine, gt, id) {
+   if (engine->sanitize)
+   engine->sanitize(engine);
+
engine->set_default_submission(engine);
+   }
 }
 
 bool intel_engine_can_store_dword(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c 
b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
index 99574378047f..1e5bad0b9a82 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
@@ -60,6 +60,12 @@ static int __engine_unpark(struct intel_wakeref *wf)
 
/* Scrub the context image after our loss of control */
ce->ops->reset(ce);
+
+   CE_TRACE(ce, "reset { seqno:%x, *hwsp:%x, ring:%x }\n",
+ce->timeline->seqno,
+READ_ONCE(*ce->timeline->hwsp_seqno),
+ce->ring->emit);
+   GEM_BUG_ON(ce->timeline->seqno != *ce->timeline->hwsp_seqno);
}
 
if (engine->unpark)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index e71eef157231..c28f4e190fe6 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -68,6 +68,7 @@ typedef u8 intel_engine_mask_t;
 #define ALL_ENGINES ((intel_engine_mask_t)~0ul)
 
 struct intel_hw_status_page {
+   struct list_head timelines;
struct i915_vma *vma;
u32 *addr;
 };
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 9f5efff08785..c5b013cc10b3 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -3508,7 +3508,6 @@ static int execlists_context_alloc(struct intel_context 
*ce)
 
 static void execlists_context_reset(struct intel_context *ce)
 {
-   CE_TRACE(ce, "reset\n");
GEM_BUG_ON(!intel_context_is_pinned(ce));
 
intel_ring_reset(ce->ring, ce->ring->emit);
@@ -3985,6 +3984,14 @@ static void reset_csb_pointers(struct intel_engine_cs 
*engine)
GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
 }
 
+static void sanitize_hwsp(struct intel_engine_cs *engine)
+{
+   struct intel_timeline *tl;
+
+   list_for_each_entry(tl, >status_page.timelines, engine_link)
+   intel_timeline_reset_seqno(tl);
+}
+
 static void execlists_sanitize(struct intel_engine_cs *engine)
 {
GEM_BUG_ON(execlists_active(>execlists));
@@ -4008,7 +4015,7 @@ static void execlists_sanitize(struct intel_engine_cs 
*engine)
 * that may be lost on resume/initialisation, and so we need to
 * reset the value in the HWSP.
 */
-   intel_timeline_reset_seqno(engine->kernel_context->timeline);
+   sanitize_hwsp(engine);
 
/* And scrub the dirty cachelines for the HWSP */
clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c