On 13/07/17 03:27, Chris Wilson wrote:
The engine provides a mirror of the CSB in the HWSP. If we use the
cacheable reads from the HWSP, we can shave off a few mmio reads per
context-switch interrupt (which are quite frequent!). Just removing a
couple of mmio reads is not enough to actually reduce any latency, but it
does give a small reduction in overall cpu usage.

Much appreciation for Ben dropping the bombshell that the CSB was in the
HWSP and for Michel in digging out the details.

v2: Don't be lazy, add the defines for the indices.
v3: Include the HWSP in debugfs/i915_engine_info
v4: Check for GVT-g, it currently depends on intercepting CSB mmio

Suggested-by: Ben Widawsky <benjamin.widaw...@intel.com>
Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
Cc: Michel Thierry <michel.thie...@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
Cc: Mika Kuoppala <mika.kuopp...@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospu...@intel.com>
Cc: Zhenyu Wang <zhen...@linux.intel.com>
Cc: Zhi Wang <zhi.a.w...@intel.com>
Acked-by: Michel Thierry <michel.thie...@intel.com>
---
  drivers/gpu/drm/i915/i915_debugfs.c     |  7 +++++--
  drivers/gpu/drm/i915/intel_lrc.c        | 16 +++++++++++-----
  drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
  3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 620c9218d1c1..5fd01c14a3ec 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3384,6 +3384,7 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
                           upper_32_bits(addr), lower_32_bits(addr));
if (i915.enable_execlists) {
+                       const u32 *hws = 
&engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
                        u32 ptr, read, write;
                        unsigned int idx;
@@ -3404,10 +3405,12 @@ static int i915_engine_info(struct seq_file *m, void *unused)
                                write += GEN8_CSB_ENTRIES;
                        while (read < write) {
                                idx = ++read % GEN8_CSB_ENTRIES;
-                               seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: 
%d\n",
+                               seq_printf(m, "\tExeclist CSB[%d]: 0x%08x [0x%08x in 
hwsp], context: %d [%d in hwsp]\n",
                                           idx,
                                           
I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
-                                          
I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, idx)));
+                                          hws[idx * 2],
+                                          
I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, idx)),
+                                          hws[idx * 2 + 1]);
                        }
rcu_read_lock();
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 3469badedbe0..5b721f65d232 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -547,10 +547,17 @@ static void intel_lrc_irq_handler(unsigned long data)
        while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
                u32 __iomem *csb_mmio =
                        dev_priv->regs + 
i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
-               u32 __iomem *buf =
-                       dev_priv->regs + 
i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
+               /* The HWSP contains a (cacheable) mirror of the CSB */
+               const u32 *buf =
+                       &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
                unsigned int head, tail;
+ /* However GVT emulation depends upon intercepting CSB mmio */
+               if (unlikely(intel_vgpu_active(dev_priv))) {
+                       buf = (u32 * __force)
+                               (dev_priv->regs + 
i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));

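Shouldn't this be RING_CONTEXT_STATUS_BUF_LO(engine, 0) instead of
RING_CONTEXT_STATUS_PTR(engine)? The mmio code being replaced pointed buf at
the first CSB entry, not at the CSB pointer register.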

+               }
+
                /* The write will be ordered by the uncached read (itself
                 * a memory barrier), so we do not need another in the form
                 * of a locked instruction. The race between the interrupt
@@ -590,13 +597,12 @@ static void intel_lrc_irq_handler(unsigned long data)
                         * status notifier.
                         */
- status = readl(buf + 2 * head);
+                       status = buf[2 * head];
                        if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
                                continue;
/* Check the context/desc id for this event matches */
-                       GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
-                                        port->context_id);
+                       GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
rq = port_unpack(port, &count);
                        GEM_BUG_ON(count == 0);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
b/drivers/gpu/drm/i915/intel_ringbuffer.h
index d33c93444c0d..2c55cfa14fb5 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -496,6 +496,8 @@ intel_write_status_page(struct intel_engine_cs *engine, int 
reg, u32 value)
  #define I915_GEM_HWS_SCRATCH_INDEX    0x40
  #define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH_INDEX << 
MI_STORE_DWORD_INDEX_SHIFT)
+#define I915_HWS_CSB_BUF0_INDEX 0x10
+
  struct intel_ring *
  intel_engine_create_ring(struct intel_engine_cs *engine, int size);
  int intel_ring_pin(struct intel_ring *ring,

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to