i915: reprogram NOA muxes on context switch when using perf

Chris Wilson Wed, 09 May 2018 02:06:13 -0700

Quoting Chris Wilson (2018-05-09 09:59:09)
> Quoting Lionel Landwerlin (2018-05-08 19:03:45)
> > If some of the contexts submitting workloads to the GPU have been
> > configured to shutdown slices/subslices, we might loose the NOA
> > configurations written in the NOA muxes. We need to reprogram them
> > when we detect a powergating configuration change.
> > 
> > In this change i915/perf is responsible for setting up a reprogramming
> > batchbuffer which we execute just before the userspace submitted
> > batchbuffer. We do this while preemption is still disable, only if
> > needed. The decision to execute this reprogramming batchbuffer is made
> > when we assign a request to an execlist port.
> > 
> > v2: Only reprogram when detecting configuration changes (Chris/Lionel)
> > 
> > Signed-off-by: Lionel Landwerlin <[email protected]>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h           |   6 ++
> >  drivers/gpu/drm/i915/i915_perf.c          | 108 ++++++++++++++++++++++
> >  drivers/gpu/drm/i915/i915_request.h       |   6 ++
> >  drivers/gpu/drm/i915/intel_gpu_commands.h |   1 +
> >  drivers/gpu/drm/i915/intel_lrc.c          |  59 +++++++++++-
> >  drivers/gpu/drm/i915/intel_ringbuffer.c   |   2 +
> >  drivers/gpu/drm/i915/intel_ringbuffer.h   |   2 +
> >  7 files changed, 183 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h 
> > b/drivers/gpu/drm/i915/i915_drv.h
> > index 04e27806e581..07cdbfb4ad7a 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1461,6 +1461,12 @@ struct i915_perf_stream {
> >          * @oa_config: The OA configuration used by the stream.
> >          */
> >         struct i915_oa_config *oa_config;
> > +
> > +       /**
> > +        * @noa_reprogram_vma: A batchbuffer reprogramming the NOA muxes, 
> > used
> > +        * after switching powergating configurations.
> > +        */
> > +       struct i915_vma *noa_reprogram_vma;
> >  };
> >  
> >  /**
> > diff --git a/drivers/gpu/drm/i915/i915_perf.c 
> > b/drivers/gpu/drm/i915/i915_perf.c
> > index 5b279a82445a..1b9e3d6a53a2 100644
> > --- a/drivers/gpu/drm/i915/i915_perf.c
> > +++ b/drivers/gpu/drm/i915/i915_perf.c
> > @@ -1691,6 +1691,96 @@ static int gen8_emit_oa_config(struct i915_request 
> > *rq,
> >         return 0;
> >  }
> >  
> > +#define MAX_LRI_SIZE (125U)
> > +
> > +static u32 noa_reprogram_bb_size(struct i915_oa_config *oa_config)
> > +{
> > +       u32 n_lri;
> > +
> > +       /* Very unlikely but possible that we have no muxes to configure. */
> > +       if (!oa_config->mux_regs_len)
> > +               return 0;
> > +
> > +       n_lri = (oa_config->mux_regs_len / MAX_LRI_SIZE) +
> > +               (oa_config->mux_regs_len % MAX_LRI_SIZE) != 0;
> > +
> > +       return n_lri * 4 + oa_config->mux_regs_len * 8 + /* 
> > MI_LOAD_REGISTER_IMMs */
> > +               6 * 4 + /* PIPE_CONTROL */
> > +               4; /* MI_BATCH_BUFFER_END */
> > +}
> > +
> > +static int
> > +alloc_noa_reprogram_bo(struct i915_perf_stream *stream)
> > +{
> > +       struct drm_i915_private *dev_priv = stream->dev_priv;
> > +       struct i915_oa_config *oa_config = stream->oa_config;
> > +       struct drm_i915_gem_object *bo;
> > +       struct i915_vma *vma;
> > +       u32 buffer_size;
> > +       u32 *cs;
> > +       int i, ret, n_loaded_regs;
> > +
> > +       buffer_size = ALIGN(noa_reprogram_bb_size(oa_config), PAGE_SIZE);
> > +       if (buffer_size == 0)
> > +               return 0;
> > +
> > +       bo = i915_gem_object_create(dev_priv, buffer_size);
> > +       if (IS_ERR(bo)) {
> > +               DRM_ERROR("Failed to allocate NOA reprogramming buffer\n");
> > +               return PTR_ERR(bo);
> > +       }
> > +
> > +       cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
> > +       if (IS_ERR(cs)) {
> > +               ret = PTR_ERR(cs);
> > +               goto err_unref_bo;
> > +       }
> > +
> > +       n_loaded_regs = 0;
> > +       for (i = 0; i < oa_config->mux_regs_len; i++) {
> > +               if ((n_loaded_regs % MAX_LRI_SIZE) == 0) {
> > +                       u32 n_lri = min(oa_config->mux_regs_len - 
> > n_loaded_regs,
> > +                                       MAX_LRI_SIZE);
> > +                       *cs++ = MI_LOAD_REGISTER_IMM(n_lri);
> > +               }
> > +
> > +               *cs++ = i915_mmio_reg_offset(oa_config->mux_regs[i].addr);
> > +               *cs++ = oa_config->mux_regs[i].value;
> > +               n_loaded_regs++;
> > +       }
> > +
> > +       cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_MMIO_WRITE, 0);
> 
> What's the pc for? Isn't it a bit dangerous to request a mmio write to
> to reg 0?
> 
> > +
> > +       *cs++ = MI_BATCH_BUFFER_END;
> > +
> > +       i915_gem_object_unpin_map(bo);
> > +
> > +       ret = i915_gem_object_set_to_gtt_domain(bo, false);
> > +       if (ret)
> > +               goto err_unref_bo;
> > +
> > +       vma = i915_vma_instance(bo, &dev_priv->ggtt.base, NULL);
> > +       if (IS_ERR(vma)) {
> > +               ret = PTR_ERR(vma);
> > +               goto err_unref_bo;
> > +       }
> > +
> > +       ret = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
> > +       if (ret)
> > +               goto err_unref_vma;
> 
> No GGTT access, so just PIN_USER for GPU access.
> 
> > +       stream->noa_reprogram_vma = vma;
> > +
> > +       return 0;
> > +
> > +err_unref_vma:
> > +       i915_vma_put(vma);
> > +err_unref_bo:
> > +       i915_gem_object_put(bo);
> > +
> > +       return ret;
> > +}
> > +
> >  static int gen8_switch_to_updated_kernel_context(struct drm_i915_private 
> > *dev_priv,
> >                                                  const struct 
> > i915_oa_config *oa_config)
> >  {
> > @@ -2102,6 +2192,12 @@ static int i915_oa_stream_init(struct 
> > i915_perf_stream *stream,
> >                 goto err_config;
> >         }
> >  
> > +       ret = alloc_noa_reprogram_bo(stream);
> > +       if (ret) {
> > +               DRM_DEBUG("Enable to alloc NOA reprogramming BO\n");
> > +               goto err_config;
> > +       }
> > +
> >         /* PRM - observability performance counters:
> >          *
> >          *   OACONTROL, performance counter enable, note:
> > @@ -2136,6 +2232,13 @@ static int i915_oa_stream_init(struct 
> > i915_perf_stream *stream,
> >  
> >         dev_priv->perf.oa.exclusive_stream = stream;
> >  
> > +       ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
> > +                                                     stream->oa_config);
> > +       if (ret)
> > +               goto err_enable;
> > +
> > +       stream->ops = &i915_oa_stream_ops;
> > +
> >         mutex_unlock(&dev_priv->drm.struct_mutex);
> >  
> >         return 0;
> > @@ -2497,6 +2600,11 @@ static void i915_perf_destroy_locked(struct 
> > i915_perf_stream *stream)
> >         if (stream->ctx)
> >                 i915_gem_context_put(stream->ctx);
> >  
> > +       if (stream->noa_reprogram_vma) {
> > +               i915_vma_unpin(stream->noa_reprogram_vma);
> > +               i915_gem_object_put(stream->noa_reprogram_vma->obj);
> 
> This will be unhappy due to the lack of active tracking.
> i915_vma_unpin_and_release(&stream->noa_reprogram_vma).
> 
> Hmm, worse than that. It's not being tracked at all, so expect it to be
> freed while still in use.
> 
> > +       }
> > +
> >         kfree(stream);
> >  }
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_request.h 
> > b/drivers/gpu/drm/i915/i915_request.h
> > index eddbd4245cb3..3357ceb4bdb1 100644
> > --- a/drivers/gpu/drm/i915/i915_request.h
> > +++ b/drivers/gpu/drm/i915/i915_request.h
> > @@ -149,6 +149,12 @@ struct i915_request {
> >         /** Preallocate space in the ring for the emitting the request */
> >         u32 reserved_space;
> >  
> > +       /*
> > +        * Pointer in the batchbuffer to where the i915/perf NOA 
> > reprogramming
> > +        * can be inserted just before HW submission.
> > +        */
> > +       u32 *perf_prog_start;
> 
> Just u32 offset, no need for the extra space of a pointer here.
> (Probably should just limit rings to 64k and pack the offsets into u16.)
> Or have them as offset from head, and u8 dwords? Hmm.
> 
> I have cold shivers from the thought of more special locations inside
> the batch. Not the first, and not the last, so I feel like there should
> be a better way (or at least more consistent).
> 
> > +
> >         /** Batch buffer related to this request if any (used for
> >          * error state dump only).
> >          */
> > diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h 
> > b/drivers/gpu/drm/i915/intel_gpu_commands.h
> > index 105e2a9e874a..09b0fded8b17 100644
> > --- a/drivers/gpu/drm/i915/intel_gpu_commands.h
> > +++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
> > @@ -146,6 +146,7 @@
> >  #define   MI_BATCH_GTT             (2<<6) /* aliased with (1<<7) on gen4 */
> >  #define MI_BATCH_BUFFER_START_GEN8     MI_INSTR(0x31, 1)
> >  #define   MI_BATCH_RESOURCE_STREAMER (1<<10)
> > +#define   MI_BATCH_SECOND_LEVEL      (1<<22)
> >  
> >  /*
> >   * 3D instructions used by the kernel
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
> > b/drivers/gpu/drm/i915/intel_lrc.c
> > index 1c08bd2be6c3..df4a7bb07eae 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -510,6 +510,38 @@ static bool can_merge_ctx(const struct 
> > i915_gem_context *prev,
> >         return true;
> >  }
> >  
> > +static void maybe_enable_noa_repgoram(struct i915_request *rq)
> > +{
> > +       struct intel_engine_cs *engine = rq->engine;
> > +       struct drm_i915_private *dev_priv = engine->i915;
> > +       struct i915_perf_stream *stream = 
> > dev_priv->perf.oa.exclusive_stream;
> 
> Could there be any more pointer chasing? </chandler>


Thinking about more about how to make this part cleaner, could we not
store the engine->noa_batch and then this all becomes

vma = engine->noa_batch;
if (vma)
        return;

Locking! Missed it in the first pass, but we are unlocked here... That
also ties into lifetimes.

I'm thinking more like ctx->noa_batch with that being pinned along with
the context. We need something along those lines to track the
locking/lifetime.
-Chris
_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Re: [Intel-gfx] [PATCH v3 4/6] drm/i915: reprogram NOA muxes on context switch when using perf

Reply via email to