From: Tvrtko Ursulin <[email protected]>

GCC cannot optimize well calculations hidden in macros and
assigned to temporary structures. We can cache the register in
ELSP write, and refactor reading of the CSB a bit to enable
it to do a better job.

Code is still equally readable but the generated body of the
CSB read loop is 30% smaller, and since that loop runs at
least once per interrupt, which in turn can fire in tens or
hundreds thousands times per second, must be of some value.

Signed-off-by: Tvrtko Ursulin <[email protected]>
---
 drivers/gpu/drm/i915/intel_lrc.c | 26 +++++++++++++-------------
 drivers/gpu/drm/i915/intel_lrc.h | 11 ++++++++---
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 3a23b9549f7b..67592f8354d6 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -361,8 +361,8 @@ static void execlists_elsp_write(struct 
drm_i915_gem_request *rq0,
 {
 
        struct intel_engine_cs *engine = rq0->engine;
-       struct drm_device *dev = engine->dev;
-       struct drm_i915_private *dev_priv = dev->dev_private;
+       struct drm_i915_private *dev_priv = rq0->i915;
+       i915_reg_t elsp_reg = RING_ELSP(engine);
        uint64_t desc[2];
 
        if (rq1) {
@@ -376,12 +376,12 @@ static void execlists_elsp_write(struct 
drm_i915_gem_request *rq0,
        rq0->elsp_submitted++;
 
        /* You must always write both descriptors in the order below. */
-       I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
-       I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
+       I915_WRITE_FW(elsp_reg, upper_32_bits(desc[1]));
+       I915_WRITE_FW(elsp_reg, lower_32_bits(desc[1]));
 
-       I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
+       I915_WRITE_FW(elsp_reg, upper_32_bits(desc[0]));
        /* The context is automatically loaded after the following */
-       I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
+       I915_WRITE_FW(elsp_reg, lower_32_bits(desc[0]));
 
        /* ELSP is a wo register, use another nearby reg for posting */
        POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
@@ -517,21 +517,19 @@ execlists_check_remove_request(struct intel_engine_cs 
*engine, u32 request_id)
 }
 
 static u32
-get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
-                  u32 *context_id)
+get_context_status(struct drm_i915_private *dev_priv, u32 csb_base,
+                  unsigned int read_pointer, u32 *context_id)
 {
-       struct drm_i915_private *dev_priv = engine->dev->dev_private;
        u32 status;
 
        read_pointer %= GEN8_CSB_ENTRIES;
 
-       status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
+       status = I915_READ_FW(RING_CSB_LO(csb_base, read_pointer));
 
        if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
                return 0;
 
-       *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine,
-                                                             read_pointer));
+       *context_id = I915_READ_FW(RING_CSB_HI(csb_base, read_pointer));
 
        return status;
 }
@@ -548,6 +546,7 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine)
        struct drm_i915_private *dev_priv = engine->dev->dev_private;
        u32 status_pointer;
        unsigned int read_pointer, write_pointer;
+       u32 csb_base = RING_CSB_BASE(engine);
        u32 csb[GEN8_CSB_ENTRIES][2];
        unsigned int csb_read = 0, i;
        unsigned int submit_contexts = 0;
@@ -565,7 +564,8 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine)
        while (read_pointer < write_pointer) {
                if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES))
                        break;
-               csb[csb_read][0] = get_context_status(engine, ++read_pointer,
+               csb[csb_read][0] = get_context_status(dev_priv, csb_base,
+                                                     ++read_pointer,
                                                      &csb[csb_read][1]);
                csb_read++;
        }
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index a17cb12221ba..6690d93d603f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -34,9 +34,14 @@
 #define          CTX_CTRL_INHIBIT_SYN_CTX_SWITCH       (1 << 3)
 #define          CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT   (1 << 0)
 #define   CTX_CTRL_RS_CTX_ENABLE                (1 << 1)
-#define RING_CONTEXT_STATUS_BUF_LO(ring, i)    _MMIO((ring)->mmio_base + 0x370 
+ (i) * 8)
-#define RING_CONTEXT_STATUS_BUF_HI(ring, i)    _MMIO((ring)->mmio_base + 0x370 
+ (i) * 8 + 4)
-#define RING_CONTEXT_STATUS_PTR(ring)          _MMIO((ring)->mmio_base + 0x3a0)
+
+#define RING_CSB_BASE(ring)            ((ring)->mmio_base + 0x370)
+#define RING_CSB_LO(csb_base, i)       _MMIO((csb_base) + (i) * 8)
+#define RING_CSB_HI(csb_base, i)       _MMIO((csb_base) + (i) * 8 + 4)
+
+#define RING_CONTEXT_STATUS_BUF_LO(ring, i)  RING_CSB_LO(RING_CSB_BASE(ring), 
i)
+#define RING_CONTEXT_STATUS_BUF_HI(ring, i)  RING_CSB_HI(RING_CSB_BASE(ring), 
i)
+#define RING_CONTEXT_STATUS_PTR(ring)       _MMIO((ring)->mmio_base + 0x3a0)
 
 /* The docs specify that the write pointer wraps around after 5h, "After status
  * is written out to the last available status QW at offset 5h, this pointer
-- 
1.9.1

_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to