Quoting Chris Wilson (2018-05-09 09:59:09)
> Quoting Lionel Landwerlin (2018-05-08 19:03:45)
> > If some of the contexts submitting workloads to the GPU have been
> > configured to shutdown slices/subslices, we might loose the NOA
> > configurations written in the NOA muxes. We need to reprogram them
> > when we detect a powergating configuration change.
> >
> > In this change i915/perf is responsible for setting up a reprogramming
> > batchbuffer which we execute just before the userspace submitted
> > batchbuffer. We do this while preemption is still disable, only if
> > needed. The decision to execute this reprogramming batchbuffer is made
> > when we assign a request to an execlist port.
> >
> > v2: Only reprogram when detecting configuration changes (Chris/Lionel)
> >
> > Signed-off-by: Lionel Landwerlin <[email protected]>
> > ---
> > drivers/gpu/drm/i915/i915_drv.h | 6 ++
> > drivers/gpu/drm/i915/i915_perf.c | 108 ++++++++++++++++++++++
> > drivers/gpu/drm/i915/i915_request.h | 6 ++
> > drivers/gpu/drm/i915/intel_gpu_commands.h | 1 +
> > drivers/gpu/drm/i915/intel_lrc.c | 59 +++++++++++-
> > drivers/gpu/drm/i915/intel_ringbuffer.c | 2 +
> > drivers/gpu/drm/i915/intel_ringbuffer.h | 2 +
> > 7 files changed, 183 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h
> > b/drivers/gpu/drm/i915/i915_drv.h
> > index 04e27806e581..07cdbfb4ad7a 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -1461,6 +1461,12 @@ struct i915_perf_stream {
> > * @oa_config: The OA configuration used by the stream.
> > */
> > struct i915_oa_config *oa_config;
> > +
> > + /**
> > + * @noa_reprogram_vma: A batchbuffer reprogramming the NOA muxes,
> > used
> > + * after switching powergating configurations.
> > + */
> > + struct i915_vma *noa_reprogram_vma;
> > };
> >
> > /**
> > diff --git a/drivers/gpu/drm/i915/i915_perf.c
> > b/drivers/gpu/drm/i915/i915_perf.c
> > index 5b279a82445a..1b9e3d6a53a2 100644
> > --- a/drivers/gpu/drm/i915/i915_perf.c
> > +++ b/drivers/gpu/drm/i915/i915_perf.c
> > @@ -1691,6 +1691,96 @@ static int gen8_emit_oa_config(struct i915_request
> > *rq,
> > return 0;
> > }
> >
> > +#define MAX_LRI_SIZE (125U)
> > +
> > +static u32 noa_reprogram_bb_size(struct i915_oa_config *oa_config)
> > +{
> > + u32 n_lri;
> > +
> > + /* Very unlikely but possible that we have no muxes to configure. */
> > + if (!oa_config->mux_regs_len)
> > + return 0;
> > +
> > + n_lri = (oa_config->mux_regs_len / MAX_LRI_SIZE) +
> > + (oa_config->mux_regs_len % MAX_LRI_SIZE) != 0;
> > +
> > + return n_lri * 4 + oa_config->mux_regs_len * 8 + /*
> > MI_LOAD_REGISTER_IMMs */
> > + 6 * 4 + /* PIPE_CONTROL */
> > + 4; /* MI_BATCH_BUFFER_END */
> > +}
> > +
> > +static int
> > +alloc_noa_reprogram_bo(struct i915_perf_stream *stream)
> > +{
> > + struct drm_i915_private *dev_priv = stream->dev_priv;
> > + struct i915_oa_config *oa_config = stream->oa_config;
> > + struct drm_i915_gem_object *bo;
> > + struct i915_vma *vma;
> > + u32 buffer_size;
> > + u32 *cs;
> > + int i, ret, n_loaded_regs;
> > +
> > + buffer_size = ALIGN(noa_reprogram_bb_size(oa_config), PAGE_SIZE);
> > + if (buffer_size == 0)
> > + return 0;
> > +
> > + bo = i915_gem_object_create(dev_priv, buffer_size);
> > + if (IS_ERR(bo)) {
> > + DRM_ERROR("Failed to allocate NOA reprogramming buffer\n");
> > + return PTR_ERR(bo);
> > + }
> > +
> > + cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
> > + if (IS_ERR(cs)) {
> > + ret = PTR_ERR(cs);
> > + goto err_unref_bo;
> > + }
> > +
> > + n_loaded_regs = 0;
> > + for (i = 0; i < oa_config->mux_regs_len; i++) {
> > + if ((n_loaded_regs % MAX_LRI_SIZE) == 0) {
> > + u32 n_lri = min(oa_config->mux_regs_len -
> > n_loaded_regs,
> > + MAX_LRI_SIZE);
> > + *cs++ = MI_LOAD_REGISTER_IMM(n_lri);
> > + }
> > +
> > + *cs++ = i915_mmio_reg_offset(oa_config->mux_regs[i].addr);
> > + *cs++ = oa_config->mux_regs[i].value;
> > + n_loaded_regs++;
> > + }
> > +
> > + cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_MMIO_WRITE, 0);
>
> What's the pc for? Isn't it a bit dangerous to request a mmio write to
> to reg 0?
>
> > +
> > + *cs++ = MI_BATCH_BUFFER_END;
> > +
> > + i915_gem_object_unpin_map(bo);
> > +
> > + ret = i915_gem_object_set_to_gtt_domain(bo, false);
> > + if (ret)
> > + goto err_unref_bo;
> > +
> > + vma = i915_vma_instance(bo, &dev_priv->ggtt.base, NULL);
> > + if (IS_ERR(vma)) {
> > + ret = PTR_ERR(vma);
> > + goto err_unref_bo;
> > + }
> > +
> > + ret = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
> > + if (ret)
> > + goto err_unref_vma;
>
> No GGTT access, so just PIN_USER for GPU access.
>
> > + stream->noa_reprogram_vma = vma;
> > +
> > + return 0;
> > +
> > +err_unref_vma:
> > + i915_vma_put(vma);
> > +err_unref_bo:
> > + i915_gem_object_put(bo);
> > +
> > + return ret;
> > +}
> > +
> > static int gen8_switch_to_updated_kernel_context(struct drm_i915_private
> > *dev_priv,
> > const struct
> > i915_oa_config *oa_config)
> > {
> > @@ -2102,6 +2192,12 @@ static int i915_oa_stream_init(struct
> > i915_perf_stream *stream,
> > goto err_config;
> > }
> >
> > + ret = alloc_noa_reprogram_bo(stream);
> > + if (ret) {
> > + DRM_DEBUG("Enable to alloc NOA reprogramming BO\n");
> > + goto err_config;
> > + }
> > +
> > /* PRM - observability performance counters:
> > *
> > * OACONTROL, performance counter enable, note:
> > @@ -2136,6 +2232,13 @@ static int i915_oa_stream_init(struct
> > i915_perf_stream *stream,
> >
> > dev_priv->perf.oa.exclusive_stream = stream;
> >
> > + ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
> > + stream->oa_config);
> > + if (ret)
> > + goto err_enable;
> > +
> > + stream->ops = &i915_oa_stream_ops;
> > +
> > mutex_unlock(&dev_priv->drm.struct_mutex);
> >
> > return 0;
> > @@ -2497,6 +2600,11 @@ static void i915_perf_destroy_locked(struct
> > i915_perf_stream *stream)
> > if (stream->ctx)
> > i915_gem_context_put(stream->ctx);
> >
> > + if (stream->noa_reprogram_vma) {
> > + i915_vma_unpin(stream->noa_reprogram_vma);
> > + i915_gem_object_put(stream->noa_reprogram_vma->obj);
>
> This will be unhappy due to the lack of active tracking.
> i915_vma_unpin_and_release(&stream->noa_reprogram_vma).
>
> Hmm, worse than that. It's not being tracked at all, so expect it to be
> freed while still in use.
>
> > + }
> > +
> > kfree(stream);
> > }
> >
> > diff --git a/drivers/gpu/drm/i915/i915_request.h
> > b/drivers/gpu/drm/i915/i915_request.h
> > index eddbd4245cb3..3357ceb4bdb1 100644
> > --- a/drivers/gpu/drm/i915/i915_request.h
> > +++ b/drivers/gpu/drm/i915/i915_request.h
> > @@ -149,6 +149,12 @@ struct i915_request {
> > /** Preallocate space in the ring for the emitting the request */
> > u32 reserved_space;
> >
> > + /*
> > + * Pointer in the batchbuffer to where the i915/perf NOA
> > reprogramming
> > + * can be inserted just before HW submission.
> > + */
> > + u32 *perf_prog_start;
>
> Just u32 offset, no need for the extra space of a pointer here.
> (Probably should just limit rings to 64k and pack the offsets into u16.)
> Or have them as offset from head, and u8 dwords? Hmm.
>
> I have cold shivers from the thought of more special locations inside
> the batch. Not the first, and not the last, so I feel like there should
> be a better way (or at least more consistent).
>
> > +
> > /** Batch buffer related to this request if any (used for
> > * error state dump only).
> > */
> > diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h
> > b/drivers/gpu/drm/i915/intel_gpu_commands.h
> > index 105e2a9e874a..09b0fded8b17 100644
> > --- a/drivers/gpu/drm/i915/intel_gpu_commands.h
> > +++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
> > @@ -146,6 +146,7 @@
> > #define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */
> > #define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1)
> > #define MI_BATCH_RESOURCE_STREAMER (1<<10)
> > +#define MI_BATCH_SECOND_LEVEL (1<<22)
> >
> > /*
> > * 3D instructions used by the kernel
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c
> > b/drivers/gpu/drm/i915/intel_lrc.c
> > index 1c08bd2be6c3..df4a7bb07eae 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -510,6 +510,38 @@ static bool can_merge_ctx(const struct
> > i915_gem_context *prev,
> > return true;
> > }
> >
> > +static void maybe_enable_noa_repgoram(struct i915_request *rq)
> > +{
> > + struct intel_engine_cs *engine = rq->engine;
> > + struct drm_i915_private *dev_priv = engine->i915;
> > + struct i915_perf_stream *stream =
> > dev_priv->perf.oa.exclusive_stream;
>
> Could there be any more pointer chasing? </chandler>
Thinking about more about how to make this part cleaner, could we not
store the engine->noa_batch and then this all becomes
vma = engine->noa_batch;
if (vma)
return;
Locking! Missed it in the first pass, but we are unlocked here... That
also ties into lifetimes.
I'm thinking more like ctx->noa_batch with that being pinned along with
the context. We need something along those lines to track the
locking/lifetime.
-Chris
_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx