On 03/05/2017 12:37, Chris Wilson wrote:
add/remove: 1/1 grow/shrink: 5/4 up/down: 391/-578 (-187)
function                                     old     new   delta
execlists_submit_ports                       262     471    +209
port_assign.isra                               -     136    +136
capture                                     6344    6359     +15
reset_common_ring                            438     452     +14
execlists_submit_request                     228     238     +10
gen8_init_common_ring                        334     341      +7
intel_engine_is_idle                         106     105      -1
i915_engine_info                            2314    2290     -24
__i915_gem_set_wedged_BKL                    485     411     -74
intel_lrc_irq_handler                       1789    1604    -185
execlists_update_context                     294       -    -294

The most important change there is the improve to the
intel_lrc_irq_handler and excclist_submit_ports (net improvement since
execlists_update_context is now inlined).

v2: Use the port_api() for guc as well (even though currently we do not
pack any counters in there, yet) and hide all port->request_count inside
the helpers.

Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuopp...@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |  32 ++++----
 drivers/gpu/drm/i915/i915_gem.c            |   6 +-
 drivers/gpu/drm/i915/i915_gpu_error.c      |  13 +++-
 drivers/gpu/drm/i915/i915_guc_submission.c |  32 +++++---
 drivers/gpu/drm/i915/intel_engine_cs.c     |   2 +-
 drivers/gpu/drm/i915/intel_lrc.c           | 117 ++++++++++++++++-------------
 drivers/gpu/drm/i915/intel_ringbuffer.h    |  10 ++-
 7 files changed, 122 insertions(+), 90 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 870c470177b5..0b5d7142d8d9 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3315,6 +3315,7 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
                if (i915.enable_execlists) {
                        u32 ptr, read, write;
                        struct rb_node *rb;
+                       unsigned int idx;

                        seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
                                   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
@@ -3332,8 +3333,7 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
                        if (read > write)
                                write += GEN8_CSB_ENTRIES;
                        while (read < write) {
-                               unsigned int idx = ++read % GEN8_CSB_ENTRIES;
-
+                               idx = ++read % GEN8_CSB_ENTRIES;
                                seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: 
%d\n",
                                           idx,
                                           
I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
@@ -3341,21 +3341,19 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
                        }

                        rcu_read_lock();
-                       rq = READ_ONCE(engine->execlist_port[0].request);
-                       if (rq) {
-                               seq_printf(m, "\t\tELSP[0] count=%d, ",
-                                          engine->execlist_port[0].count);
-                               print_request(m, rq, "rq: ");
-                       } else {
-                               seq_printf(m, "\t\tELSP[0] idle\n");
-                       }
-                       rq = READ_ONCE(engine->execlist_port[1].request);
-                       if (rq) {
-                               seq_printf(m, "\t\tELSP[1] count=%d, ",
-                                          engine->execlist_port[1].count);
-                               print_request(m, rq, "rq: ");
-                       } else {
-                               seq_printf(m, "\t\tELSP[1] idle\n");
+                       for (idx = 0; idx < ARRAY_SIZE(engine->execlist_port); 
idx++) {
+                               unsigned int count;
+
+                               rq = port_unpack(&engine->execlist_port[idx],
+                                                &count);
+                               if (rq) {
+                                       seq_printf(m, "\t\tELSP[%d] count=%d, ",
+                                                  idx, count);
+                                       print_request(m, rq, "rq: ");
+                               } else {
+                                       seq_printf(m, "\t\tELSP[%d] idle\n",
+                                                  idx);
+                               }
                        }
                        rcu_read_unlock();

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f79079f2e265..c1df4b9d2661 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3038,12 +3038,14 @@ static void engine_set_wedged(struct intel_engine_cs 
*engine)
         */

        if (i915.enable_execlists) {
+               struct execlist_port *port = engine->execlist_port;
                unsigned long flags;
+               unsigned int n;

                spin_lock_irqsave(&engine->timeline->lock, flags);

-               i915_gem_request_put(engine->execlist_port[0].request);
-               i915_gem_request_put(engine->execlist_port[1].request);
+               for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
+                       i915_gem_request_put(port_request(&port[n]));
                memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
                engine->execlist_queue = RB_ROOT;
                engine->execlist_first = NULL;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index ec526d92f908..e18f350bc364 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1324,12 +1324,17 @@ static void engine_record_requests(struct 
intel_engine_cs *engine,
 static void error_record_engine_execlists(struct intel_engine_cs *engine,
                                          struct drm_i915_error_engine *ee)
 {
+       const struct execlist_port *port = engine->execlist_port;
        unsigned int n;

-       for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
-               if (engine->execlist_port[n].request)
-                       record_request(engine->execlist_port[n].request,
-                                      &ee->execlist[n]);
+       for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
+               struct drm_i915_gem_request *rq = port_request(&port[n]);
+
+               if (!rq)
+                       break;
+
+               record_request(rq, &ee->execlist[n]);
+       }
 }

 static void record_context(struct drm_i915_error_context *e,
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c 
b/drivers/gpu/drm/i915/i915_guc_submission.c
index 7e85b5ab8ae2..62d3831a8a0d 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -653,10 +653,22 @@ static void nested_enable_signaling(struct 
drm_i915_gem_request *rq)
        spin_unlock(&rq->lock);
 }

+static void port_assign(struct execlist_port *port,
+                       struct drm_i915_gem_request *rq)
+{
+       GEM_BUG_ON(rq == port_request(port));
+
+       if (port_isset(port))
+               i915_gem_request_put(port_request(port));
+
+       port_set(port, i915_gem_request_get(rq));
+       nested_enable_signaling(rq);
+}
+
 static bool i915_guc_dequeue(struct intel_engine_cs *engine)
 {
        struct execlist_port *port = engine->execlist_port;
-       struct drm_i915_gem_request *last = port[0].request;
+       struct drm_i915_gem_request *last = port_request(&port[0]);
        struct rb_node *rb;
        bool submit = false;

@@ -670,8 +682,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
                        if (port != engine->execlist_port)
                                break;

-                       i915_gem_request_assign(&port->request, last);
-                       nested_enable_signaling(last);
+                       port_assign(port, last);
                        port++;
                }

@@ -686,8 +697,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
                submit = true;
        }
        if (submit) {
-               i915_gem_request_assign(&port->request, last);
-               nested_enable_signaling(last);
+               port_assign(port, last);
                engine->execlist_first = rb;
        }
        spin_unlock_irq(&engine->timeline->lock);
@@ -703,17 +713,19 @@ static void i915_guc_irq_handler(unsigned long data)
        bool submit;

        do {
-               rq = port[0].request;
+               rq = port_request(&port[0]);
                while (rq && i915_gem_request_completed(rq)) {
                        trace_i915_gem_request_out(rq);
                        i915_gem_request_put(rq);
-                       port[0].request = port[1].request;
-                       port[1].request = NULL;
-                       rq = port[0].request;
+
+                       port[0] = port[1];
+                       memset(&port[1], 0, sizeof(port[1]));
+
+                       rq = port_request(&port[0]);
                }

                submit = false;
-               if (!port[1].request)
+               if (!port_count(&port[1]))
                        submit = i915_guc_dequeue(engine);
        } while (submit);
 }
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c
index 6d3d83876da9..24659788e7a3 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1230,7 +1230,7 @@ bool intel_engine_is_idle(struct intel_engine_cs *engine)
                return false;

        /* Both ports drained, no more ELSP submission? */
-       if (engine->execlist_port[0].request)
+       if (port_request(&engine->execlist_port[0]))
                return false;

        /* Ring stopped? */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 0909549ad320..9f7b6112c53a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -337,39 +337,32 @@ static u64 execlists_update_context(struct 
drm_i915_gem_request *rq)

 static void execlists_submit_ports(struct intel_engine_cs *engine)
 {
-       struct drm_i915_private *dev_priv = engine->i915;
        struct execlist_port *port = engine->execlist_port;
        u32 __iomem *elsp =
-               dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
-       u64 desc[2];
-
-       GEM_BUG_ON(port[0].count > 1);
-       if (!port[0].count)
-               execlists_context_status_change(port[0].request,
-                                               INTEL_CONTEXT_SCHEDULE_IN);
-       desc[0] = execlists_update_context(port[0].request);
-       GEM_DEBUG_EXEC(port[0].context_id = upper_32_bits(desc[0]));
-       port[0].count++;
-
-       if (port[1].request) {
-               GEM_BUG_ON(port[1].count);
-               execlists_context_status_change(port[1].request,
-                                               INTEL_CONTEXT_SCHEDULE_IN);
-               desc[1] = execlists_update_context(port[1].request);
-               GEM_DEBUG_EXEC(port[1].context_id = upper_32_bits(desc[1]));
-               port[1].count = 1;
-       } else {
-               desc[1] = 0;
-       }
-       GEM_BUG_ON(desc[0] == desc[1]);
+               engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+       unsigned int n;

-       /* You must always write both descriptors in the order below. */
-       writel(upper_32_bits(desc[1]), elsp);
-       writel(lower_32_bits(desc[1]), elsp);
+       for (n = ARRAY_SIZE(engine->execlist_port); n--; ) {
+               struct drm_i915_gem_request *rq;
+               unsigned int count;
+               u64 desc;
+
+               rq = port_unpack(&port[n], &count);
+               if (rq) {
+                       GEM_BUG_ON(count > !n);
+                       if (!count++)
+                               execlists_context_status_change(rq, 
INTEL_CONTEXT_SCHEDULE_IN);
+                       port_set(&port[n], port_pack(rq, count));
+                       desc = execlists_update_context(rq);
+                       GEM_DEBUG_EXEC(port[n].context_id = 
upper_32_bits(desc));
+               } else {
+                       GEM_BUG_ON(!n);
+                       desc = 0;
+               }

-       writel(upper_32_bits(desc[0]), elsp);
-       /* The context is automatically loaded after the following */
-       writel(lower_32_bits(desc[0]), elsp);
+               writel(upper_32_bits(desc), elsp);
+               writel(lower_32_bits(desc), elsp);
+       }
 }

 static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
@@ -390,6 +383,17 @@ static bool can_merge_ctx(const struct i915_gem_context 
*prev,
        return true;
 }

+static void port_assign(struct execlist_port *port,
+                       struct drm_i915_gem_request *rq)
+{
+       GEM_BUG_ON(rq == port_request(port));
+
+       if (port_isset(port))
+               i915_gem_request_put(port_request(port));
+
+       port_set(port, port_pack(i915_gem_request_get(rq), port_count(port)));
+}

Reason for not having one implementation of port_assign with enable_nested_signalling outside it in the guc case?

+
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
        struct drm_i915_gem_request *last;
@@ -397,7 +401,7 @@ static void execlists_dequeue(struct intel_engine_cs 
*engine)
        struct rb_node *rb;
        bool submit = false;

-       last = port->request;
+       last = port_request(port);
        if (last)
                /* WaIdleLiteRestore:bdw,skl
                 * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
@@ -407,7 +411,7 @@ static void execlists_dequeue(struct intel_engine_cs 
*engine)
                 */
                last->tail = last->wa_tail;

-       GEM_BUG_ON(port[1].request);
+       GEM_BUG_ON(port_isset(&port[1]));

        /* Hardware submission is through 2 ports. Conceptually each port
         * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
@@ -464,7 +468,8 @@ static void execlists_dequeue(struct intel_engine_cs 
*engine)

                        GEM_BUG_ON(last->ctx == cursor->ctx);

-                       i915_gem_request_assign(&port->request, last);
+                       if (submit)
+                               port_assign(port, last);

Optimisation? GEM_BUG_ON(last != port_request(port))?

                        port++;
                }

@@ -479,7 +484,7 @@ static void execlists_dequeue(struct intel_engine_cs 
*engine)
                submit = true;
        }
        if (submit) {
-               i915_gem_request_assign(&port->request, last);
+               port_assign(port, last);
                engine->execlist_first = rb;
        }
        spin_unlock_irq(&engine->timeline->lock);
@@ -488,16 +493,11 @@ static void execlists_dequeue(struct intel_engine_cs 
*engine)
                execlists_submit_ports(engine);
 }

-static bool execlists_elsp_idle(struct intel_engine_cs *engine)
-{
-       return !engine->execlist_port[0].request;
-}
-
 static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
 {
        const struct execlist_port *port = engine->execlist_port;

-       return port[0].count + port[1].count < 2;
+       return port_count(&port[0]) + port_count(&port[1]) < 2;
 }

 /*
@@ -547,7 +547,9 @@ static void intel_lrc_irq_handler(unsigned long data)
                tail = GEN8_CSB_WRITE_PTR(head);
                head = GEN8_CSB_READ_PTR(head);
                while (head != tail) {
+                       struct drm_i915_gem_request *rq;
                        unsigned int status;
+                       unsigned int count;

                        if (++head == GEN8_CSB_ENTRIES)
                                head = 0;
@@ -577,20 +579,24 @@ static void intel_lrc_irq_handler(unsigned long data)
                        GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
                                         port[0].context_id);

-                       GEM_BUG_ON(port[0].count == 0);
-                       if (--port[0].count == 0) {
+                       rq = port_unpack(&port[0], &count);
+                       GEM_BUG_ON(count == 0);
+                       if (--count == 0) {

If you changed this to be:

count--;
port_set(...);
if (count > 0)
        continue;

It would be perhaps a bit more readable, but a potentially useless port_set on the other hand. Not sure.

                                GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
-                               
GEM_BUG_ON(!i915_gem_request_completed(port[0].request));
-                               execlists_context_status_change(port[0].request,
-                                                               
INTEL_CONTEXT_SCHEDULE_OUT);
+                               GEM_BUG_ON(!i915_gem_request_completed(rq));
+                               execlists_context_status_change(rq, 
INTEL_CONTEXT_SCHEDULE_OUT);
+
+                               trace_i915_gem_request_out(rq);
+                               i915_gem_request_put(rq);

-                               trace_i915_gem_request_out(port[0].request);
-                               i915_gem_request_put(port[0].request);
                                port[0] = port[1];
                                memset(&port[1], 0, sizeof(port[1]));
+                       } else {
+                               port_set(&port[0], port_pack(rq, count));
                        }

-                       GEM_BUG_ON(port[0].count == 0 &&
+                       /* After the final element, the hw should be idle */
+                       GEM_BUG_ON(port_count(&port[0]) == 0 &&
                                   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
                }

@@ -1148,6 +1154,7 @@ static int gen8_init_common_ring(struct intel_engine_cs 
*engine)
        struct drm_i915_private *dev_priv = engine->i915;
        struct execlist_port *port = engine->execlist_port;
        unsigned int n;
+       bool submit;
        int ret;

        ret = intel_mocs_init_engine(engine);
@@ -1169,19 +1176,21 @@ static int gen8_init_common_ring(struct intel_engine_cs 
*engine)
        /* After a GPU reset, we may have requests to replay */
        clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);

+       submit = false;
        for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
-               if (!port[n].request)
+               if (!port_isset(&port[n]))
                        break;

                DRM_DEBUG_DRIVER("Restarting %s:%d from 0x%x\n",
                                 engine->name, n,
-                                port[n].request->global_seqno);
+                                port_request(&port[n])->global_seqno);

                /* Discard the current inflight count */
-               port[n].count = 0;
+               port_set(&port[n], port_request(&port[n]));
+               submit = true;
        }

-       if (!i915.enable_guc_submission && !execlists_elsp_idle(engine))
+       if (submit && !i915.enable_guc_submission)
                execlists_submit_ports(engine);

        return 0;
@@ -1259,13 +1268,13 @@ static void reset_common_ring(struct intel_engine_cs 
*engine,
        intel_ring_update_space(request->ring);

        /* Catch up with any missed context-switch interrupts */
-       if (request->ctx != port[0].request->ctx) {
-               i915_gem_request_put(port[0].request);
+       if (request->ctx != port_request(&port[0])->ctx) {
+               i915_gem_request_put(port_request(&port[0]));
                port[0] = port[1];
                memset(&port[1], 0, sizeof(port[1]));
        }

-       GEM_BUG_ON(request->ctx != port[0].request->ctx);
+       GEM_BUG_ON(request->ctx != port_request(&port[0])->ctx);

        /* Reset WaIdleLiteRestore:bdw,skl as well */
        request->tail =
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 4a599e3480a9..68765d45116b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -368,8 +368,14 @@ struct intel_engine_cs {
        /* Execlists */
        struct tasklet_struct irq_tasklet;
        struct execlist_port {
-               struct drm_i915_gem_request *request;
-               unsigned int count;
+               struct drm_i915_gem_request *request_count;
+#define EXECLIST_COUNT_BITS 2
+#define port_request(p) ptr_mask_bits((p)->request_count, EXECLIST_COUNT_BITS)
+#define port_count(p) ptr_unmask_bits((p)->request_count, EXECLIST_COUNT_BITS)
+#define port_pack(rq, count) ptr_pack_bits(rq, count, EXECLIST_COUNT_BITS)
+#define port_unpack(p, count) ptr_unpack_bits((p)->request_count, count, 
EXECLIST_COUNT_BITS)
+#define port_set(p, packed) ((p)->request_count = (packed))
+#define port_isset(p) ((p)->request_count)
                GEM_DEBUG_DECL(u32 context_id);
        } execlist_port[2];
        struct rb_root execlist_queue;


Looks correct but I am still having trouble accepting the structure shrink and code savings are worth having less readable code.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to