Re: [Intel-gfx] [PATCH 3/9] drm/i915/execlists: Pack the count into the low bits of the port.request
On 05/05/2017 12:16, Chris Wilson wrote: On Fri, May 05, 2017 at 11:49:21AM +0100, Tvrtko Ursulin wrote: On 03/05/2017 12:37, Chris Wilson wrote: +static void port_assign(struct execlist_port *port, + struct drm_i915_gem_request *rq) +{ + GEM_BUG_ON(rq == port_request(port)); + + if (port_isset(port)) + i915_gem_request_put(port_request(port)); + + port_set(port, port_pack(i915_gem_request_get(rq), port_count(port))); +} Reason for not having one implementation of port_assign with enable_nested_signalling outside it in the guc case? The handling of port.count is a big difference between guc and ordinary execlists. For execlists we count contexts, for guc we count requests. Bah missed it, scrolling up and down and relying on memory is not a substitute for split screen.. static void execlists_dequeue(struct intel_engine_cs *engine) { struct drm_i915_gem_request *last; @@ -397,7 +401,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) struct rb_node *rb; bool submit = false; - last = port->request; + last = port_request(port); if (last) /* WaIdleLiteRestore:bdw,skl * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL @@ -407,7 +411,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) */ last->tail = last->wa_tail; - GEM_BUG_ON(port[1].request); + GEM_BUG_ON(port_isset([1])); /* Hardware submission is through 2 ports. Conceptually each port * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is @@ -464,7 +468,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine) GEM_BUG_ON(last->ctx == cursor->ctx); - i915_gem_request_assign(>request, last); + if (submit) + port_assign(port, last); Optimisation? GEM_BUG_ON(last != port_request(port))? Kind of, it is so both paths look identical and yes so we do meet the criteria that last != port_request(port) (because it is silly to the do the request_count dance if the goal is not to change request_count). Ok. @@ -479,7 +484,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) submit = true; } if (submit) { - i915_gem_request_assign(>request, last); + port_assign(port, last); engine->execlist_first = rb; } spin_unlock_irq(>timeline->lock); @@ -488,16 +493,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine) execlists_submit_ports(engine); } -static bool execlists_elsp_idle(struct intel_engine_cs *engine) -{ - return !engine->execlist_port[0].request; -} - static bool execlists_elsp_ready(const struct intel_engine_cs *engine) { const struct execlist_port *port = engine->execlist_port; - return port[0].count + port[1].count < 2; + return port_count([0]) + port_count([1]) < 2; } /* @@ -547,7 +547,9 @@ static void intel_lrc_irq_handler(unsigned long data) tail = GEN8_CSB_WRITE_PTR(head); head = GEN8_CSB_READ_PTR(head); while (head != tail) { + struct drm_i915_gem_request *rq; unsigned int status; + unsigned int count; if (++head == GEN8_CSB_ENTRIES) head = 0; @@ -577,20 +579,24 @@ static void intel_lrc_irq_handler(unsigned long data) GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) != port[0].context_id); - GEM_BUG_ON(port[0].count == 0); - if (--port[0].count == 0) { + rq = port_unpack([0], ); + GEM_BUG_ON(count == 0); + if (--count == 0) { If you changed this to be: count--; port_set(...); if (count > 0) continue; It would be perhaps a bit more readable, but a potentially useless port_set on the other hand. Not sure. We expect to overwrite port in the common path, or at least that's my expectation. Decrementing count for lite-restore should be the exception. Part of the intention is to do the minimal amount of work (especially avoiding the appearance of setting port.count = 0 prematurely) and pass the later port.count == 0 assert. I've seen the mode where we just append and append and append with no requests out coming out in a while :), but I agree it is not the typical case. So as I said I am fine with this as it is. GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED); - GEM_BUG_ON(!i915_gem_request_completed(port[0].request)); - execlists_context_status_change(port[0].request, -
Re: [Intel-gfx] [PATCH 3/9] drm/i915/execlists: Pack the count into the low bits of the port.request
On Fri, May 05, 2017 at 11:49:21AM +0100, Tvrtko Ursulin wrote: > > On 03/05/2017 12:37, Chris Wilson wrote: > >+static void port_assign(struct execlist_port *port, > >+struct drm_i915_gem_request *rq) > >+{ > >+GEM_BUG_ON(rq == port_request(port)); > >+ > >+if (port_isset(port)) > >+i915_gem_request_put(port_request(port)); > >+ > >+port_set(port, port_pack(i915_gem_request_get(rq), port_count(port))); > >+} > > Reason for not having one implementation of port_assign with > enable_nested_signalling outside it in the guc case? The handling of port.count is a big difference between guc and ordinary execlists. For execlists we count contexts, for guc we count requests. > > static void execlists_dequeue(struct intel_engine_cs *engine) > > { > > struct drm_i915_gem_request *last; > >@@ -397,7 +401,7 @@ static void execlists_dequeue(struct intel_engine_cs > >*engine) > > struct rb_node *rb; > > bool submit = false; > > > >-last = port->request; > >+last = port_request(port); > > if (last) > > /* WaIdleLiteRestore:bdw,skl > > * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL > >@@ -407,7 +411,7 @@ static void execlists_dequeue(struct intel_engine_cs > >*engine) > > */ > > last->tail = last->wa_tail; > > > >-GEM_BUG_ON(port[1].request); > >+GEM_BUG_ON(port_isset([1])); > > > > /* Hardware submission is through 2 ports. Conceptually each port > > * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is > >@@ -464,7 +468,8 @@ static void execlists_dequeue(struct intel_engine_cs > >*engine) > > > > GEM_BUG_ON(last->ctx == cursor->ctx); > > > >-i915_gem_request_assign(>request, last); > >+if (submit) > >+port_assign(port, last); > > Optimisation? GEM_BUG_ON(last != port_request(port))? Kind of, it is so both paths look identical and yes so we do meet the criteria that last != port_request(port) (because it is silly to the do the request_count dance if the goal is not to change request_count). > >@@ -479,7 +484,7 @@ static void execlists_dequeue(struct intel_engine_cs > >*engine) > > submit = true; > > } > > if (submit) { > >-i915_gem_request_assign(>request, last); > >+port_assign(port, last); > > engine->execlist_first = rb; > > } > > spin_unlock_irq(>timeline->lock); > >@@ -488,16 +493,11 @@ static void execlists_dequeue(struct intel_engine_cs > >*engine) > > execlists_submit_ports(engine); > > } > > > >-static bool execlists_elsp_idle(struct intel_engine_cs *engine) > >-{ > >-return !engine->execlist_port[0].request; > >-} > >- > > static bool execlists_elsp_ready(const struct intel_engine_cs *engine) > > { > > const struct execlist_port *port = engine->execlist_port; > > > >-return port[0].count + port[1].count < 2; > >+return port_count([0]) + port_count([1]) < 2; > > } > > > > /* > >@@ -547,7 +547,9 @@ static void intel_lrc_irq_handler(unsigned long data) > > tail = GEN8_CSB_WRITE_PTR(head); > > head = GEN8_CSB_READ_PTR(head); > > while (head != tail) { > >+struct drm_i915_gem_request *rq; > > unsigned int status; > >+unsigned int count; > > > > if (++head == GEN8_CSB_ENTRIES) > > head = 0; > >@@ -577,20 +579,24 @@ static void intel_lrc_irq_handler(unsigned long data) > > GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) != > > port[0].context_id); > > > >-GEM_BUG_ON(port[0].count == 0); > >-if (--port[0].count == 0) { > >+rq = port_unpack([0], ); > >+GEM_BUG_ON(count == 0); > >+if (--count == 0) { > > If you changed this to be: > > count--; > port_set(...); > if (count > 0) > continue; > > It would be perhaps a bit more readable, but a potentially useless > port_set on the other hand. Not sure. We expect to overwrite port in the common path, or at least that's my expectation. Decrementing count for lite-restore should be the exception. Part of the intention is to do the minimal amount of work (especially avoiding the appearance of setting port.count = 0 prematurely) and pass the later port.count == 0 assert. > > GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED); > >- > >GEM_BUG_ON(!i915_gem_request_completed(port[0].request)); > >-execlists_context_status_change(port[0].request, > >- > >INTEL_CONTEXT_SCHEDULE_OUT); > >+GEM_BUG_ON(!i915_gem_request_completed(rq)); > >+
Re: [Intel-gfx] [PATCH 3/9] drm/i915/execlists: Pack the count into the low bits of the port.request
On 03/05/2017 12:37, Chris Wilson wrote: add/remove: 1/1 grow/shrink: 5/4 up/down: 391/-578 (-187) function old new delta execlists_submit_ports 262 471+209 port_assign.isra - 136+136 capture 63446359 +15 reset_common_ring438 452 +14 execlists_submit_request 228 238 +10 gen8_init_common_ring334 341 +7 intel_engine_is_idle 106 105 -1 i915_engine_info23142290 -24 __i915_gem_set_wedged_BKL485 411 -74 intel_lrc_irq_handler 17891604-185 execlists_update_context 294 --294 The most important change there is the improve to the intel_lrc_irq_handler and excclist_submit_ports (net improvement since execlists_update_context is now inlined). v2: Use the port_api() for guc as well (even though currently we do not pack any counters in there, yet) and hide all port->request_count inside the helpers. Signed-off-by: Chris WilsonCc: Mika Kuoppala Cc: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_debugfs.c| 32 drivers/gpu/drm/i915/i915_gem.c| 6 +- drivers/gpu/drm/i915/i915_gpu_error.c | 13 +++- drivers/gpu/drm/i915/i915_guc_submission.c | 32 +--- drivers/gpu/drm/i915/intel_engine_cs.c | 2 +- drivers/gpu/drm/i915/intel_lrc.c | 117 - drivers/gpu/drm/i915/intel_ringbuffer.h| 10 ++- 7 files changed, 122 insertions(+), 90 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 870c470177b5..0b5d7142d8d9 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -3315,6 +3315,7 @@ static int i915_engine_info(struct seq_file *m, void *unused) if (i915.enable_execlists) { u32 ptr, read, write; struct rb_node *rb; + unsigned int idx; seq_printf(m, "\tExeclist status: 0x%08x %08x\n", I915_READ(RING_EXECLIST_STATUS_LO(engine)), @@ -3332,8 +,7 @@ static int i915_engine_info(struct seq_file *m, void *unused) if (read > write) write += GEN8_CSB_ENTRIES; while (read < write) { - unsigned int idx = ++read % GEN8_CSB_ENTRIES; - + idx = ++read % GEN8_CSB_ENTRIES; seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n", idx, I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)), @@ -3341,21 +3341,19 @@ static int i915_engine_info(struct seq_file *m, void *unused) } rcu_read_lock(); - rq = READ_ONCE(engine->execlist_port[0].request); - if (rq) { - seq_printf(m, "\t\tELSP[0] count=%d, ", - engine->execlist_port[0].count); - print_request(m, rq, "rq: "); - } else { - seq_printf(m, "\t\tELSP[0] idle\n"); - } - rq = READ_ONCE(engine->execlist_port[1].request); - if (rq) { - seq_printf(m, "\t\tELSP[1] count=%d, ", - engine->execlist_port[1].count); - print_request(m, rq, "rq: "); - } else { - seq_printf(m, "\t\tELSP[1] idle\n"); + for (idx = 0; idx < ARRAY_SIZE(engine->execlist_port); idx++) { + unsigned int count; + + rq = port_unpack(>execlist_port[idx], +); + if (rq) { + seq_printf(m, "\t\tELSP[%d] count=%d, ", + idx, count); + print_request(m, rq, "rq: "); + } else { + seq_printf(m, "\t\tELSP[%d] idle\n", + idx); + } } rcu_read_unlock(); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index
[Intel-gfx] [PATCH 3/9] drm/i915/execlists: Pack the count into the low bits of the port.request
add/remove: 1/1 grow/shrink: 5/4 up/down: 391/-578 (-187) function old new delta execlists_submit_ports 262 471+209 port_assign.isra - 136+136 capture 63446359 +15 reset_common_ring438 452 +14 execlists_submit_request 228 238 +10 gen8_init_common_ring334 341 +7 intel_engine_is_idle 106 105 -1 i915_engine_info23142290 -24 __i915_gem_set_wedged_BKL485 411 -74 intel_lrc_irq_handler 17891604-185 execlists_update_context 294 --294 The most important change there is the improve to the intel_lrc_irq_handler and excclist_submit_ports (net improvement since execlists_update_context is now inlined). v2: Use the port_api() for guc as well (even though currently we do not pack any counters in there, yet) and hide all port->request_count inside the helpers. Signed-off-by: Chris WilsonCc: Mika Kuoppala Cc: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_debugfs.c| 32 drivers/gpu/drm/i915/i915_gem.c| 6 +- drivers/gpu/drm/i915/i915_gpu_error.c | 13 +++- drivers/gpu/drm/i915/i915_guc_submission.c | 32 +--- drivers/gpu/drm/i915/intel_engine_cs.c | 2 +- drivers/gpu/drm/i915/intel_lrc.c | 117 - drivers/gpu/drm/i915/intel_ringbuffer.h| 10 ++- 7 files changed, 122 insertions(+), 90 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 870c470177b5..0b5d7142d8d9 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -3315,6 +3315,7 @@ static int i915_engine_info(struct seq_file *m, void *unused) if (i915.enable_execlists) { u32 ptr, read, write; struct rb_node *rb; + unsigned int idx; seq_printf(m, "\tExeclist status: 0x%08x %08x\n", I915_READ(RING_EXECLIST_STATUS_LO(engine)), @@ -3332,8 +,7 @@ static int i915_engine_info(struct seq_file *m, void *unused) if (read > write) write += GEN8_CSB_ENTRIES; while (read < write) { - unsigned int idx = ++read % GEN8_CSB_ENTRIES; - + idx = ++read % GEN8_CSB_ENTRIES; seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n", idx, I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)), @@ -3341,21 +3341,19 @@ static int i915_engine_info(struct seq_file *m, void *unused) } rcu_read_lock(); - rq = READ_ONCE(engine->execlist_port[0].request); - if (rq) { - seq_printf(m, "\t\tELSP[0] count=%d, ", - engine->execlist_port[0].count); - print_request(m, rq, "rq: "); - } else { - seq_printf(m, "\t\tELSP[0] idle\n"); - } - rq = READ_ONCE(engine->execlist_port[1].request); - if (rq) { - seq_printf(m, "\t\tELSP[1] count=%d, ", - engine->execlist_port[1].count); - print_request(m, rq, "rq: "); - } else { - seq_printf(m, "\t\tELSP[1] idle\n"); + for (idx = 0; idx < ARRAY_SIZE(engine->execlist_port); idx++) { + unsigned int count; + + rq = port_unpack(>execlist_port[idx], +); + if (rq) { + seq_printf(m, "\t\tELSP[%d] count=%d, ", + idx, count); + print_request(m, rq, "rq: "); + } else { + seq_printf(m, "\t\tELSP[%d] idle\n", + idx); + } } rcu_read_unlock(); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index f79079f2e265..c1df4b9d2661 100644 ---