If some of the contexts submitting workloads to the GPU have been
configured to shut down slices/subslices, we might lose the NOA
configurations written in the NOA muxes.

One possible solution to this problem is to reprogram the NOA muxes
when we switch to a new context. We initially tried this in the
workaround batchbuffer but some concerns were raised about the cost
of reprogramming at every context switch. This solution is also not
without consequences from the userspace point of view. Reprogramming
of the muxes can only happen once the powergating configuration has
changed (which happens after context switch). This means for a window
of time during the recording, counters recorded by the OA unit might
be invalid. This requires userspace dealing with OA reports to discard
the invalid values.

Minimizing the reprogramming could be implemented by tracking the
last programmed configuration somewhere in GGTT and using MI_PREDICATE
to discard some of the programming commands, but the command streamer
would still have to parse all the MI_LRI instructions in the
workaround batchbuffer.

Another solution, which this change implements, is to simply disregard
the user requested configuration for the period of time when i915/perf
is active. There is no known issue with this apart from a performance
penalty for some media workloads that benefit from running on a
partially powergated GPU. We already prevent RC6 from affecting the
programming so it doesn't sound completely unreasonable to hold off on
powergating for the same reason.

Signed-off-by: Lionel Landwerlin <lionel.g.landwer...@intel.com>
---
 drivers/gpu/drm/i915/i915_perf.c       | 24 +++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_request.c    |  2 ++
 drivers/gpu/drm/i915/i915_request.h    | 11 +++++++++++
 drivers/gpu/drm/i915/intel_engine_cs.c |  2 ++
 drivers/gpu/drm/i915/intel_lrc.c       | 26 +++++++++++++++++++++-----
 drivers/gpu/drm/i915/intel_lrc.h       |  3 +++
 6 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index fc5b5d66abcd..9cfbd5075097 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1577,7 +1577,8 @@ static void hsw_disable_metric_set(struct 
drm_i915_private *dev_priv)
  */
 static void gen8_update_reg_state_unlocked(struct i915_gem_context *ctx,
                                           u32 *reg_state,
-                                          const struct i915_oa_config 
*oa_config)
+                                          const struct i915_oa_config 
*oa_config,
+                                          union intel_sseu sseu)
 {
        struct drm_i915_private *dev_priv = ctx->i915;
        u32 ctx_oactxctrl = dev_priv->perf.oa.ctx_oactxctrl_offset;
@@ -1623,6 +1624,9 @@ static void gen8_update_reg_state_unlocked(struct 
i915_gem_context *ctx,
 
                CTX_REG(reg_state, state_offset, flex_regs[i], value);
        }
+
+       CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
+               gen8_make_rpcs(&INTEL_INFO(dev_priv)->sseu, sseu));
 }
 
 /*
@@ -1754,6 +1758,8 @@ static int gen8_configure_all_contexts(struct 
drm_i915_private *dev_priv,
                                       const struct i915_oa_config *oa_config)
 {
        struct intel_engine_cs *engine = dev_priv->engine[RCS];
+       union intel_sseu default_sseu =
+               intel_sseu_from_device_sseu(&INTEL_INFO(dev_priv)->sseu);
        struct i915_gem_context *ctx;
        int ret;
        unsigned int wait_flags = I915_WAIT_LOCKED;
@@ -1798,7 +1804,8 @@ static int gen8_configure_all_contexts(struct 
drm_i915_private *dev_priv,
                ce->state->obj->mm.dirty = true;
                regs += LRC_STATE_PN * PAGE_SIZE / sizeof(*regs);
 
-               gen8_update_reg_state_unlocked(ctx, regs, oa_config);
+               gen8_update_reg_state_unlocked(ctx, regs, oa_config,
+                                              oa_config ? default_sseu : 
ce->sseu);
 
                i915_gem_object_unpin_map(ce->state->obj);
        }
@@ -2170,14 +2177,21 @@ void i915_oa_init_reg_state(struct intel_engine_cs 
*engine,
                            struct i915_gem_context *ctx,
                            u32 *reg_state)
 {
+       struct drm_i915_private *dev_priv = engine->i915;
        struct i915_perf_stream *stream;
 
        if (engine->id != RCS)
                return;
 
-       stream = engine->i915->perf.oa.exclusive_stream;
-       if (stream)
-               gen8_update_reg_state_unlocked(ctx, reg_state, 
stream->oa_config);
+       stream = dev_priv->perf.oa.exclusive_stream;
+       if (stream) {
+               union intel_sseu default_sseu =
+                       
intel_sseu_from_device_sseu(&INTEL_INFO(dev_priv)->sseu);
+
+               gen8_update_reg_state_unlocked(ctx, reg_state,
+                                              stream->oa_config,
+                                              default_sseu);
+       }
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_request.c 
b/drivers/gpu/drm/i915/i915_request.c
index 8928894dd9c7..dd0b37e0a85c 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -786,6 +786,8 @@ i915_request_alloc(struct intel_engine_cs *engine, struct 
i915_gem_context *ctx)
        rq->capture_list = NULL;
        rq->waitboost = false;
 
+       rq->sseu = ctx->__engine[engine->id].sseu;
+
        /*
         * Reserve space in the ring buffer for all the commands required to
         * eventually emit this request. This is to guarantee that the
diff --git a/drivers/gpu/drm/i915/i915_request.h 
b/drivers/gpu/drm/i915/i915_request.h
index beb312ac9aa0..b4191d382145 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -162,6 +162,17 @@ struct i915_request {
        /** Preallocate space in the ring for the emitting the request */
        u32 reserved_space;
 
+       /*
+        * Position in the ring batchbuffer to where the i915/perf NOA
+        * reprogramming can be inserted just before HW submission.
+        */
+       u32 perf_prog;
+
+       /*
+        * Powergating configuration associated with this request.
+        */
+       union intel_sseu sseu;
+
        /** Batch buffer related to this request if any (used for
         * error state dump only).
         */
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c
index 6bfd7e3ed152..9f86e40f22a7 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -493,6 +493,8 @@ void intel_engine_setup_common(struct intel_engine_cs 
*engine)
 {
        i915_timeline_init(engine->i915, &engine->timeline, engine->name);
 
+       memset(&engine->last_sseu, 0, sizeof(engine->last_sseu));
+
        intel_engine_init_execlist(engine);
        intel_engine_init_hangcheck(engine);
        intel_engine_init_batch_pool(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index f188ba1b5608..0e93ad90d039 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1956,10 +1956,26 @@ static int gen8_emit_bb_start(struct i915_request *rq,
                rq->ctx->ppgtt->pd_dirty_rings &= 
~intel_engine_flag(rq->engine);
        }
 
-       cs = intel_ring_begin(rq, 6);
+       cs = intel_ring_begin(rq, rq->engine->id == RCS ? 10 : 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);
 
+       if (rq->engine->id == RCS) {
+               /*
+                * Leave some instructions to be written with an
+                * MI_BATCH_BUFFER_START to the i915/perf NOA reprogramming
+                * batchbuffer. We only turn those MI_NOOP into
+                * MI_BATCH_BUFFER_START when we detect a SSEU powergating
+                * configuration change that might affect NOA. This is only
+                * for the RCS.
+                */
+               rq->perf_prog = intel_ring_offset(rq, cs);
+               *cs++ = MI_NOOP;
+               *cs++ = MI_NOOP;
+               *cs++ = MI_NOOP;
+               *cs++ = MI_NOOP; /* Aligning to 2 dwords */
+       }
+
        /*
         * WaDisableCtxRestoreArbitration:bdw,chv
         *
@@ -2392,8 +2408,8 @@ int logical_xcs_ring_init(struct intel_engine_cs *engine)
        return logical_ring_init(engine);
 }
 
-static u32 make_rpcs(const struct sseu_dev_info *sseu,
-                    union intel_sseu ctx_sseu)
+u32 gen8_make_rpcs(const struct sseu_dev_info *sseu,
+                  union intel_sseu ctx_sseu)
 {
        u32 rpcs = 0;
 
@@ -2543,8 +2559,8 @@ static void execlists_init_reg_state(u32 *regs,
        if (rcs) {
                regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
                CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
-                       make_rpcs(&INTEL_INFO(dev_priv)->sseu,
-                                 ctx->__engine[engine->id].sseu));
+                       gen8_make_rpcs(&INTEL_INFO(dev_priv)->sseu,
+                                      ctx->__engine[engine->id].sseu));
 
                i915_oa_init_reg_state(engine, ctx, regs);
        }
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 4ec7d8dd13c8..f53e704a21b8 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -104,6 +104,9 @@ struct i915_gem_context;
 
 void intel_lr_context_resume(struct drm_i915_private *dev_priv);
 
+u32 gen8_make_rpcs(const struct sseu_dev_info *sseu,
+                  union intel_sseu ctx_sseu);
+
 static inline uint64_t
 intel_lr_context_descriptor(struct i915_gem_context *ctx,
                            struct intel_engine_cs *engine)
-- 
2.17.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to