Re: [Intel-gfx] [PATCH 3/9] drm/i915/execlists: Pack the count into the low bits of the port.request

2017-05-05 Thread Tvrtko Ursulin


On 05/05/2017 12:16, Chris Wilson wrote:

On Fri, May 05, 2017 at 11:49:21AM +0100, Tvrtko Ursulin wrote:


On 03/05/2017 12:37, Chris Wilson wrote:

+static void port_assign(struct execlist_port *port,
+   struct drm_i915_gem_request *rq)
+{
+   GEM_BUG_ON(rq == port_request(port));
+
+   if (port_isset(port))
+   i915_gem_request_put(port_request(port));
+
+   port_set(port, port_pack(i915_gem_request_get(rq), port_count(port)));
+}


Reason for not having one implementation of port_assign with
enable_nested_signalling outside it in the guc case?


The handling of port.count is a big difference between guc and
ordinary execlists. For execlists we count contexts, for guc we count
requests.


Bah missed it, scrolling up and down and relying on memory is not a 
substitute for split screen..



static void execlists_dequeue(struct intel_engine_cs *engine)
{
struct drm_i915_gem_request *last;
@@ -397,7 +401,7 @@ static void execlists_dequeue(struct intel_engine_cs 
*engine)
struct rb_node *rb;
bool submit = false;

-   last = port->request;
+   last = port_request(port);
if (last)
/* WaIdleLiteRestore:bdw,skl
 * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
@@ -407,7 +411,7 @@ static void execlists_dequeue(struct intel_engine_cs 
*engine)
 */
last->tail = last->wa_tail;

-   GEM_BUG_ON(port[1].request);
+   GEM_BUG_ON(port_isset([1]));

/* Hardware submission is through 2 ports. Conceptually each port
 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
@@ -464,7 +468,8 @@ static void execlists_dequeue(struct intel_engine_cs 
*engine)

GEM_BUG_ON(last->ctx == cursor->ctx);

-   i915_gem_request_assign(>request, last);
+   if (submit)
+   port_assign(port, last);


Optimisation? GEM_BUG_ON(last != port_request(port))?


Kind of, it is so both paths look identical and yes so we do meet the
criteria that last != port_request(port) (because it is silly to the do
the request_count dance if the goal is not to change request_count).


Ok.


@@ -479,7 +484,7 @@ static void execlists_dequeue(struct intel_engine_cs 
*engine)
submit = true;
}
if (submit) {
-   i915_gem_request_assign(>request, last);
+   port_assign(port, last);
engine->execlist_first = rb;
}
spin_unlock_irq(>timeline->lock);
@@ -488,16 +493,11 @@ static void execlists_dequeue(struct intel_engine_cs 
*engine)
execlists_submit_ports(engine);
}

-static bool execlists_elsp_idle(struct intel_engine_cs *engine)
-{
-   return !engine->execlist_port[0].request;
-}
-
static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
{
const struct execlist_port *port = engine->execlist_port;

-   return port[0].count + port[1].count < 2;
+   return port_count([0]) + port_count([1]) < 2;
}

/*
@@ -547,7 +547,9 @@ static void intel_lrc_irq_handler(unsigned long data)
tail = GEN8_CSB_WRITE_PTR(head);
head = GEN8_CSB_READ_PTR(head);
while (head != tail) {
+   struct drm_i915_gem_request *rq;
unsigned int status;
+   unsigned int count;

if (++head == GEN8_CSB_ENTRIES)
head = 0;
@@ -577,20 +579,24 @@ static void intel_lrc_irq_handler(unsigned long data)
GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
 port[0].context_id);

-   GEM_BUG_ON(port[0].count == 0);
-   if (--port[0].count == 0) {
+   rq = port_unpack([0], );
+   GEM_BUG_ON(count == 0);
+   if (--count == 0) {


If you changed this to be:

count--;
port_set(...);
if (count > 0)
continue;

It would be perhaps a bit more readable, but a potentially useless
port_set on the other hand. Not sure.


We expect to overwrite port in the common path, or at least that's my
expectation. Decrementing count for lite-restore should be the
exception. Part of the intention is to do the minimal amount of work
(especially avoiding the appearance of setting port.count = 0
prematurely) and pass the later port.count == 0 assert.


I've seen the mode where we just append and append and append with no 
requests out coming out in a while :), but I agree it is not the typical 
case. So as I said I am fine with this as it is.





GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
-   
GEM_BUG_ON(!i915_gem_request_completed(port[0].request));
-   execlists_context_status_change(port[0].request,
-  

Re: [Intel-gfx] [PATCH 3/9] drm/i915/execlists: Pack the count into the low bits of the port.request

2017-05-05 Thread Chris Wilson
On Fri, May 05, 2017 at 11:49:21AM +0100, Tvrtko Ursulin wrote:
> 
> On 03/05/2017 12:37, Chris Wilson wrote:
> >+static void port_assign(struct execlist_port *port,
> >+struct drm_i915_gem_request *rq)
> >+{
> >+GEM_BUG_ON(rq == port_request(port));
> >+
> >+if (port_isset(port))
> >+i915_gem_request_put(port_request(port));
> >+
> >+port_set(port, port_pack(i915_gem_request_get(rq), port_count(port)));
> >+}
> 
> Reason for not having one implementation of port_assign with
> enable_nested_signalling outside it in the guc case?

The handling of port.count is a big difference between guc and
ordinary execlists. For execlists we count contexts, for guc we count
requests.

> > static void execlists_dequeue(struct intel_engine_cs *engine)
> > {
> > struct drm_i915_gem_request *last;
> >@@ -397,7 +401,7 @@ static void execlists_dequeue(struct intel_engine_cs 
> >*engine)
> > struct rb_node *rb;
> > bool submit = false;
> >
> >-last = port->request;
> >+last = port_request(port);
> > if (last)
> > /* WaIdleLiteRestore:bdw,skl
> >  * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
> >@@ -407,7 +411,7 @@ static void execlists_dequeue(struct intel_engine_cs 
> >*engine)
> >  */
> > last->tail = last->wa_tail;
> >
> >-GEM_BUG_ON(port[1].request);
> >+GEM_BUG_ON(port_isset([1]));
> >
> > /* Hardware submission is through 2 ports. Conceptually each port
> >  * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
> >@@ -464,7 +468,8 @@ static void execlists_dequeue(struct intel_engine_cs 
> >*engine)
> >
> > GEM_BUG_ON(last->ctx == cursor->ctx);
> >
> >-i915_gem_request_assign(>request, last);
> >+if (submit)
> >+port_assign(port, last);
> 
> Optimisation? GEM_BUG_ON(last != port_request(port))?

Kind of, it is so both paths look identical and yes so we do meet the
criteria that last != port_request(port) (because it is silly to the do
the request_count dance if the goal is not to change request_count).

> >@@ -479,7 +484,7 @@ static void execlists_dequeue(struct intel_engine_cs 
> >*engine)
> > submit = true;
> > }
> > if (submit) {
> >-i915_gem_request_assign(>request, last);
> >+port_assign(port, last);
> > engine->execlist_first = rb;
> > }
> > spin_unlock_irq(>timeline->lock);
> >@@ -488,16 +493,11 @@ static void execlists_dequeue(struct intel_engine_cs 
> >*engine)
> > execlists_submit_ports(engine);
> > }
> >
> >-static bool execlists_elsp_idle(struct intel_engine_cs *engine)
> >-{
> >-return !engine->execlist_port[0].request;
> >-}
> >-
> > static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
> > {
> > const struct execlist_port *port = engine->execlist_port;
> >
> >-return port[0].count + port[1].count < 2;
> >+return port_count([0]) + port_count([1]) < 2;
> > }
> >
> > /*
> >@@ -547,7 +547,9 @@ static void intel_lrc_irq_handler(unsigned long data)
> > tail = GEN8_CSB_WRITE_PTR(head);
> > head = GEN8_CSB_READ_PTR(head);
> > while (head != tail) {
> >+struct drm_i915_gem_request *rq;
> > unsigned int status;
> >+unsigned int count;
> >
> > if (++head == GEN8_CSB_ENTRIES)
> > head = 0;
> >@@ -577,20 +579,24 @@ static void intel_lrc_irq_handler(unsigned long data)
> > GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
> >  port[0].context_id);
> >
> >-GEM_BUG_ON(port[0].count == 0);
> >-if (--port[0].count == 0) {
> >+rq = port_unpack([0], );
> >+GEM_BUG_ON(count == 0);
> >+if (--count == 0) {
> 
> If you changed this to be:
> 
> count--;
> port_set(...);
> if (count > 0)
>   continue;
> 
> It would be perhaps a bit more readable, but a potentially useless
> port_set on the other hand. Not sure.

We expect to overwrite port in the common path, or at least that's my
expectation. Decrementing count for lite-restore should be the
exception. Part of the intention is to do the minimal amount of work
(especially avoiding the appearance of setting port.count = 0
prematurely) and pass the later port.count == 0 assert.

> > GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> >-
> >GEM_BUG_ON(!i915_gem_request_completed(port[0].request));
> >-execlists_context_status_change(port[0].request,
> >-
> >INTEL_CONTEXT_SCHEDULE_OUT);
> >+GEM_BUG_ON(!i915_gem_request_completed(rq));
> >+

Re: [Intel-gfx] [PATCH 3/9] drm/i915/execlists: Pack the count into the low bits of the port.request

2017-05-05 Thread Tvrtko Ursulin


On 03/05/2017 12:37, Chris Wilson wrote:

add/remove: 1/1 grow/shrink: 5/4 up/down: 391/-578 (-187)
function old new   delta
execlists_submit_ports   262 471+209
port_assign.isra   - 136+136
capture 63446359 +15
reset_common_ring438 452 +14
execlists_submit_request 228 238 +10
gen8_init_common_ring334 341  +7
intel_engine_is_idle 106 105  -1
i915_engine_info23142290 -24
__i915_gem_set_wedged_BKL485 411 -74
intel_lrc_irq_handler   17891604-185
execlists_update_context 294   --294

The most important change there is the improve to the
intel_lrc_irq_handler and excclist_submit_ports (net improvement since
execlists_update_context is now inlined).

v2: Use the port_api() for guc as well (even though currently we do not
pack any counters in there, yet) and hide all port->request_count inside
the helpers.

Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/i915_debugfs.c|  32 
 drivers/gpu/drm/i915/i915_gem.c|   6 +-
 drivers/gpu/drm/i915/i915_gpu_error.c  |  13 +++-
 drivers/gpu/drm/i915/i915_guc_submission.c |  32 +---
 drivers/gpu/drm/i915/intel_engine_cs.c |   2 +-
 drivers/gpu/drm/i915/intel_lrc.c   | 117 -
 drivers/gpu/drm/i915/intel_ringbuffer.h|  10 ++-
 7 files changed, 122 insertions(+), 90 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 870c470177b5..0b5d7142d8d9 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3315,6 +3315,7 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
if (i915.enable_execlists) {
u32 ptr, read, write;
struct rb_node *rb;
+   unsigned int idx;

seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
@@ -3332,8 +,7 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
if (read > write)
write += GEN8_CSB_ENTRIES;
while (read < write) {
-   unsigned int idx = ++read % GEN8_CSB_ENTRIES;
-
+   idx = ++read % GEN8_CSB_ENTRIES;
seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: 
%d\n",
   idx,
   
I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
@@ -3341,21 +3341,19 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
}

rcu_read_lock();
-   rq = READ_ONCE(engine->execlist_port[0].request);
-   if (rq) {
-   seq_printf(m, "\t\tELSP[0] count=%d, ",
-  engine->execlist_port[0].count);
-   print_request(m, rq, "rq: ");
-   } else {
-   seq_printf(m, "\t\tELSP[0] idle\n");
-   }
-   rq = READ_ONCE(engine->execlist_port[1].request);
-   if (rq) {
-   seq_printf(m, "\t\tELSP[1] count=%d, ",
-  engine->execlist_port[1].count);
-   print_request(m, rq, "rq: ");
-   } else {
-   seq_printf(m, "\t\tELSP[1] idle\n");
+   for (idx = 0; idx < ARRAY_SIZE(engine->execlist_port); 
idx++) {
+   unsigned int count;
+
+   rq = port_unpack(>execlist_port[idx],
+);
+   if (rq) {
+   seq_printf(m, "\t\tELSP[%d] count=%d, ",
+  idx, count);
+   print_request(m, rq, "rq: ");
+   } else {
+   seq_printf(m, "\t\tELSP[%d] idle\n",
+  idx);
+   }
}
rcu_read_unlock();

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 

[Intel-gfx] [PATCH 3/9] drm/i915/execlists: Pack the count into the low bits of the port.request

2017-05-03 Thread Chris Wilson
add/remove: 1/1 grow/shrink: 5/4 up/down: 391/-578 (-187)
function old new   delta
execlists_submit_ports   262 471+209
port_assign.isra   - 136+136
capture 63446359 +15
reset_common_ring438 452 +14
execlists_submit_request 228 238 +10
gen8_init_common_ring334 341  +7
intel_engine_is_idle 106 105  -1
i915_engine_info23142290 -24
__i915_gem_set_wedged_BKL485 411 -74
intel_lrc_irq_handler   17891604-185
execlists_update_context 294   --294

The most important change there is the improve to the
intel_lrc_irq_handler and excclist_submit_ports (net improvement since
execlists_update_context is now inlined).

v2: Use the port_api() for guc as well (even though currently we do not
pack any counters in there, yet) and hide all port->request_count inside
the helpers.

Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/i915_debugfs.c|  32 
 drivers/gpu/drm/i915/i915_gem.c|   6 +-
 drivers/gpu/drm/i915/i915_gpu_error.c  |  13 +++-
 drivers/gpu/drm/i915/i915_guc_submission.c |  32 +---
 drivers/gpu/drm/i915/intel_engine_cs.c |   2 +-
 drivers/gpu/drm/i915/intel_lrc.c   | 117 -
 drivers/gpu/drm/i915/intel_ringbuffer.h|  10 ++-
 7 files changed, 122 insertions(+), 90 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 870c470177b5..0b5d7142d8d9 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3315,6 +3315,7 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
if (i915.enable_execlists) {
u32 ptr, read, write;
struct rb_node *rb;
+   unsigned int idx;
 
seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
@@ -3332,8 +,7 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
if (read > write)
write += GEN8_CSB_ENTRIES;
while (read < write) {
-   unsigned int idx = ++read % GEN8_CSB_ENTRIES;
-
+   idx = ++read % GEN8_CSB_ENTRIES;
seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, 
context: %d\n",
   idx,
   
I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
@@ -3341,21 +3341,19 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
}
 
rcu_read_lock();
-   rq = READ_ONCE(engine->execlist_port[0].request);
-   if (rq) {
-   seq_printf(m, "\t\tELSP[0] count=%d, ",
-  engine->execlist_port[0].count);
-   print_request(m, rq, "rq: ");
-   } else {
-   seq_printf(m, "\t\tELSP[0] idle\n");
-   }
-   rq = READ_ONCE(engine->execlist_port[1].request);
-   if (rq) {
-   seq_printf(m, "\t\tELSP[1] count=%d, ",
-  engine->execlist_port[1].count);
-   print_request(m, rq, "rq: ");
-   } else {
-   seq_printf(m, "\t\tELSP[1] idle\n");
+   for (idx = 0; idx < ARRAY_SIZE(engine->execlist_port); 
idx++) {
+   unsigned int count;
+
+   rq = port_unpack(>execlist_port[idx],
+);
+   if (rq) {
+   seq_printf(m, "\t\tELSP[%d] count=%d, ",
+  idx, count);
+   print_request(m, rq, "rq: ");
+   } else {
+   seq_printf(m, "\t\tELSP[%d] idle\n",
+  idx);
+   }
}
rcu_read_unlock();
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f79079f2e265..c1df4b9d2661 100644
---