Chris Wilson <[email protected]> writes:

> Not all mediated environments yet support HW semaphores, so we should
> avoid using one for our preempt-to-busy barrier and instead request that
> the CS be paused and not advance on to the next execlist.
>
> There's a natural advantage with the register writes being serialised
> with the writes to the ELSP register that should allow the barrier to be
> much more constrained. On the other hand, we can no longer track the
> extents of the barrier and so must be a lot more lenient in accepting
> early CS events.
>
> Signed-off-by: Chris Wilson <[email protected]>
> Cc: Zhenyu Wang <[email protected]>
> ---
>  drivers/gpu/drm/i915/gt/intel_engine_types.h |  8 +++
>  drivers/gpu/drm/i915/gt/intel_lrc.c          | 69 ++++++++------------
>  2 files changed, 35 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
> b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 7e056114344e..1353df264236 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -166,6 +166,8 @@ struct intel_engine_execlists {
>        */
>       bool no_priolist;
>  
> +     u16 ring_mode;
> +
>       /**
>        * @submit_reg: gen-specific execlist submission register
>        * set to the ExecList Submission Port (elsp) register pre-Gen11 and to
> @@ -179,6 +181,12 @@ struct intel_engine_execlists {
>        */
>       u32 __iomem *ctrl_reg;
>  
> +     /**
> +      * @ring_mode: the engine control register, used to freeze the command
> +      * streamer around preempt-to-busy
> +      */
> +     u32 __iomem *ring_reg;
> +
>  #define EXECLIST_MAX_PORTS 2
>       /**
>        * @active: the currently known context executing on HW
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c 
> b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 28685ba91a2c..c2aaab4b743e 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -224,24 +224,18 @@ static void execlists_init_reg_state(u32 *reg_state,
>                                    struct intel_engine_cs *engine,
>                                    struct intel_ring *ring);
>  
> -static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
> +static inline void ring_set_paused(struct intel_engine_cs *engine, u32 state)
>  {
> -     return (i915_ggtt_offset(engine->status_page.vma) +
> -             I915_GEM_HWS_PREEMPT_ADDR);
> -}
> +     u16 x;
>  
> -static inline void
> -ring_set_paused(const struct intel_engine_cs *engine, int state)
> -{
> -     /*
> -      * We inspect HWS_PREEMPT with a semaphore inside
> -      * engine->emit_fini_breadcrumb. If the dword is true,
> -      * the ring is paused as the semaphore will busywait
> -      * until the dword is false.
> -      */
> -     engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
> -     if (state)
> -             wmb();
> +     x = engine->execlists.ring_mode;
> +     x &= ~(state >> 16);
> +     x |= state;
> +     if (x == engine->execlists.ring_mode)
> +             return;

You want to keep the state to avoid redundant (no-op) register writes?

I would prefer explicit ring_freeze() and ring_thaw() helpers, with the
state parameter removed — I don't see the value in having the callsites
construct the masked bits via macro.

> +
> +     engine->execlists.ring_mode = x;
> +     writel(state, engine->execlists.ring_reg);
>  }
>  
>  static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> @@ -981,7 +975,7 @@ static void execlists_dequeue(struct intel_engine_cs 
> *engine)
>                        * as we unwind (and until we resubmit) so that we do
>                        * not accidentally tell it to go backwards.
>                        */
> -                     ring_set_paused(engine, 1);
> +                     ring_set_paused(engine, _MASKED_BIT_ENABLE(STOP_RING));
>  
>                       /*
>                        * Note that we have not stopped the GPU at this point,
> @@ -1010,7 +1004,7 @@ static void execlists_dequeue(struct intel_engine_cs 
> *engine)
>                                 last->sched.attr.priority,
>                                 execlists->queue_priority_hint);
>  
> -                     ring_set_paused(engine, 1);
> +                     ring_set_paused(engine, _MASKED_BIT_ENABLE(STOP_RING));
>                       defer_active(engine);
>  
>                       /*
> @@ -1244,9 +1238,10 @@ static void execlists_dequeue(struct intel_engine_cs 
> *engine)
>               *port = execlists_schedule_in(last, port - execlists->pending);
>               memset(port + 1, 0, (last_port - port) * sizeof(*port));
>               execlists_submit_ports(engine);
> -     } else {
> -             ring_set_paused(engine, 0);
>       }
> +
> +     if (!inject_preempt_hang(execlists))
> +             ring_set_paused(engine, _MASKED_BIT_DISABLE(STOP_RING));

This is just a superset of the previous unpausing now?

>  }
>  
>  void
> @@ -1355,9 +1350,6 @@ static void process_csb(struct intel_engine_cs *engine)
>  
>                       if (enable_timeslice(engine))
>                               mod_timer(&execlists->timer, jiffies + 1);
> -
> -                     if (!inject_preempt_hang(execlists))
> -                             ring_set_paused(engine, 0);
>               } else if (status & GEN8_CTX_STATUS_PREEMPTED) {
>                       struct i915_request * const *port = execlists->active;
>  
> @@ -1378,8 +1370,15 @@ static void process_csb(struct intel_engine_cs *engine)
>                        * ordered, that the breadcrumb write is
>                        * coherent (visible from the CPU) before the
>                        * user interrupt and CSB is processed.
> +                      *
> +                      * The caveat here applies when we are injecting
> +                      * a completion event via manipulation of the
> +                      * RING_MI_MODE; this may occur before the
> +                      * request is completed and appears as a
> +                      * normal context-switch (0x14).

As in we unpause and get a completion event?

-Mika


>                        */
> -                     GEM_BUG_ON(!i915_request_completed(rq));
> +                     GEM_BUG_ON(!i915_request_completed(rq) &&
> +                                !execlists->pending[0]);
>                       execlists_schedule_out(rq);
>  
>                       GEM_BUG_ON(execlists->active - execlists->inflight >
> @@ -2133,7 +2132,7 @@ static void reset_csb_pointers(struct intel_engine_cs 
> *engine)
>       struct intel_engine_execlists * const execlists = &engine->execlists;
>       const unsigned int reset_value = execlists->csb_size - 1;
>  
> -     ring_set_paused(engine, 0);
> +     ring_set_paused(engine, _MASKED_BIT_DISABLE(STOP_RING));
>  
>       /*
>        * After a reset, the HW starts writing into CSB entry [0]. We
> @@ -2581,19 +2580,6 @@ static u32 *gen8_emit_wa_tail(struct i915_request 
> *request, u32 *cs)
>       return cs;
>  }
>  
> -static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
> -{
> -     *cs++ = MI_SEMAPHORE_WAIT |
> -             MI_SEMAPHORE_GLOBAL_GTT |
> -             MI_SEMAPHORE_POLL |
> -             MI_SEMAPHORE_SAD_EQ_SDD;
> -     *cs++ = 0;
> -     *cs++ = intel_hws_preempt_address(request->engine);
> -     *cs++ = 0;
> -
> -     return cs;
> -}
> -
>  static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
>  {
>       cs = gen8_emit_ggtt_write(cs,
> @@ -2601,9 +2587,7 @@ static u32 *gen8_emit_fini_breadcrumb(struct 
> i915_request *request, u32 *cs)
>                                 request->timeline->hwsp_offset,
>                                 0);
>       *cs++ = MI_USER_INTERRUPT;
> -
>       *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> -     cs = emit_preempt_busywait(request, cs);
>  
>       request->tail = intel_ring_offset(request, cs);
>       assert_ring_tail_valid(request->ring, request->tail);
> @@ -2625,9 +2609,7 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct 
> i915_request *request, u32 *cs)
>                                   PIPE_CONTROL_CS_STALL,
>                                   0);
>       *cs++ = MI_USER_INTERRUPT;
> -
>       *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> -     cs = emit_preempt_busywait(request, cs);
>  
>       request->tail = intel_ring_offset(request, cs);
>       assert_ring_tail_valid(request->ring, request->tail);
> @@ -2807,6 +2789,9 @@ int intel_execlists_submission_init(struct 
> intel_engine_cs *engine)
>       execlists->csb_write =
>               &engine->status_page.addr[intel_hws_csb_write_index(i915)];
>  
> +     execlists->ring_reg =
> +             uncore->regs + i915_mmio_reg_offset(RING_MI_MODE(base));
> +
>       if (INTEL_GEN(i915) < 11)
>               execlists->csb_size = GEN8_CSB_ENTRIES;
>       else
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to