Re: [Intel-gfx] [PATCH v4] drm/i915/scheduler: emulate a scheduler for guc

2017-01-11 Thread Chris Wilson
On Thu, Jan 12, 2017 at 07:14:54AM +0000, Chris Wilson wrote:
> On Thu, Jan 12, 2017 at 07:02:56AM +0000, Tvrtko Ursulin wrote:
> > 
> > On 11/01/2017 21:24, Chris Wilson wrote:
> > > [snip]
> > >
> > >+static void nested_enable_signaling(struct drm_i915_gem_request *rq)
> > >+{
> > >+  if (test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> > >+   &rq->fence.flags))
> > >+  return;
> > 
> > You didn't like the idea of duplicating the tracepoint from
> > dma_fence_enable_sw_signaling here?
> 
> No, I was just forgetful and worrying about the S3 hangs.

The verdict is that guc + S3 == death atm. From a trybot run that simply
enables the guc for CI: https://intel-gfx-ci.01.org/CI/Trybot_468/
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


Re: [Intel-gfx] [PATCH v4] drm/i915/scheduler: emulate a scheduler for guc

2017-01-11 Thread Chris Wilson
On Thu, Jan 12, 2017 at 07:02:56AM +0000, Tvrtko Ursulin wrote:
> 
> On 11/01/2017 21:24, Chris Wilson wrote:
> > [snip]
> >
> >+static void nested_enable_signaling(struct drm_i915_gem_request *rq)
> >+{
> >+if (test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> >+ &rq->fence.flags))
> >+return;
> 
> You didn't like the idea of duplicating the tracepoint from
> dma_fence_enable_sw_signaling here?

No, I was just forgetful and worrying about the S3 hangs.

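A sketch of what duplicating that tracepoint might look like; this is not
part of the posted patch, and it assumes trace_dma_fence_enable_signal()
from include/trace/events/dma_fence.h (the event that
dma_fence_enable_sw_signaling() itself emits) is visible here:

static void nested_enable_signaling(struct drm_i915_gem_request *rq)
{
	if (test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
			     &rq->fence.flags))
		return;

	/* We bypass dma_fence_enable_sw_signaling(), so emit the
	 * tracepoint it would have emitted (assumed available here). */
	trace_dma_fence_enable_signal(&rq->fence);

	GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags));

	spin_lock_nested(&rq->lock, SINGLE_DEPTH_NESTING);
	intel_engine_enable_signaling(rq);
	spin_unlock(&rq->lock);
}
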
> > /* Replay the current set of previously submitted requests */
> >+spin_lock_irqsave(&engine->timeline->lock, flags);
> > list_for_each_entry(rq, &engine->timeline->requests, link) {
> > client->wq_rsvd += sizeof(struct guc_wq_item);
> 
> Strictly speaking this needs the guc client wq_lock as well, so maybe
> just put a comment here saying it is not needed, or simply take it.

True.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
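As an aside on the wq_rsvd exchange above, a sketch of the "just take it"
option; the helper name guc_client_replay_reserve is made up for
illustration. Interrupts are already disabled by the irqsave on the
timeline lock, so the plain spin_lock() on client->wq_lock suffices:

static void guc_client_replay_reserve(struct intel_engine_cs *engine,
				      struct i915_guc_client *client)
{
	struct drm_i915_gem_request *rq;
	unsigned long flags;

	/* Replay the current set of previously submitted requests */
	spin_lock_irqsave(&engine->timeline->lock, flags);
	list_for_each_entry(rq, &engine->timeline->requests, link) {
		/* wq_rsvd is otherwise guarded by client->wq_lock, so take
		 * the lock here as well rather than relying on a comment. */
		spin_lock(&client->wq_lock);
		client->wq_rsvd += sizeof(struct guc_wq_item);
		spin_unlock(&client->wq_lock);

		__i915_guc_submit(rq);
	}
	spin_unlock_irqrestore(&engine->timeline->lock, flags);
}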


Re: [Intel-gfx] [PATCH v4] drm/i915/scheduler: emulate a scheduler for guc

2017-01-11 Thread Tvrtko Ursulin


On 11/01/2017 21:24, Chris Wilson wrote:

This emulates execlists on top of the GuC in order to defer submission of
requests to the hardware. This deferral allows time for high-priority
requests to gazump their way to the head of the queue; however, it nerfs
the GuC by converting it back into a simple execlist (where the CPU has
to wake up after every request to feed new commands into the GuC).

v2: Drop hack status - though iirc there is still a lockdep inversion
between fence and engine->timeline->lock (which is impossible as the
nesting only occurs on different fences - hopefully just requires some
judicious lockdep annotation)
v3: Apply lockdep nesting to enabling signaling on the request, using
the pattern we already have in __i915_gem_request_submit();
v4: Replaying requests after a hang also now needs the timeline
spinlock, to disable the interrupts at least

Signed-off-by: Chris Wilson 
Cc: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 95 +++---
 drivers/gpu/drm/i915/i915_irq.c            |  4 +-
 drivers/gpu/drm/i915/intel_lrc.c           |  5 +-
 3 files changed, 92 insertions(+), 12 deletions(-)
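
For context on the v2/v3 lockdep notes above: enabling signaling takes the
request's lock while engine->timeline->lock is already held, and lockdep
cannot distinguish two locks of the same class, so the inner acquisition
has to be annotated as deliberate nesting. A generic, self-contained
illustration of the spin_lock_nested() pattern (struct item and transfer()
are made-up names, not i915 code):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item {
	spinlock_t lock;
	int value;
};

static struct item *item_create(gfp_t gfp)
{
	struct item *it = kzalloc(sizeof(*it), gfp);

	if (it)
		spin_lock_init(&it->lock); /* one init site, one lockdep class */
	return it;
}

static void transfer(struct item *a, struct item *b)
{
	/* a stable lock order (by address) avoids real ABBA deadlocks */
	if (a > b)
		swap(a, b);

	spin_lock(&a->lock);
	/* b->lock is in the same class as a->lock; without the annotation
	 * lockdep would report this as recursive locking */
	spin_lock_nested(&b->lock, SINGLE_DEPTH_NESTING);
	b->value += a->value;
	a->value = 0;
	spin_unlock(&b->lock);
	spin_unlock(&a->lock);
}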

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 913d87358972..2f0a853f820a 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -350,7 +350,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 	u32 freespace;
 	int ret;
 
-	spin_lock(&client->wq_lock);
+	spin_lock_irq(&client->wq_lock);
 	freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
 	freespace -= client->wq_rsvd;
 	if (likely(freespace >= wqi_size)) {
@@ -360,7 +360,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 		client->no_wq_space++;
 		ret = -EAGAIN;
 	}
-	spin_unlock(&client->wq_lock);
+	spin_unlock_irq(&client->wq_lock);
 
 	return ret;
 }
@@ -372,9 +372,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
 
 	GEM_BUG_ON(READ_ONCE(client->wq_rsvd) < wqi_size);
 
-	spin_lock(&client->wq_lock);
+	spin_lock_irq(&client->wq_lock);
 	client->wq_rsvd -= wqi_size;
-	spin_unlock(&client->wq_lock);
+	spin_unlock_irq(&client->wq_lock);
 }
 
 /* Construct a Work Item and append it to the GuC's Work Queue */
@@ -534,10 +534,87 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq)
 
 static void i915_guc_submit(struct drm_i915_gem_request *rq)
 {
-	i915_gem_request_submit(rq);
+	__i915_gem_request_submit(rq);
 	__i915_guc_submit(rq);
 }
 
+static void nested_enable_signaling(struct drm_i915_gem_request *rq)
+{
+	if (test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+			     &rq->fence.flags))
+		return;


You didn't like the idea of duplicating the tracepoint from
dma_fence_enable_sw_signaling here?



+
+	GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags));
+
+	spin_lock_nested(&rq->lock, SINGLE_DEPTH_NESTING);
+	intel_engine_enable_signaling(rq);
+	spin_unlock(&rq->lock);
+}
+
+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
+{
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *last = port[0].request;
+	unsigned long flags;
+	struct rb_node *rb;
+	bool submit = false;
+
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
+		if (last && cursor->ctx != last->ctx) {
+			if (port != engine->execlist_port)
+				break;
+
+			i915_gem_request_assign(&port->request, last);
+			nested_enable_signaling(last);
+			port++;
+		}
+
+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
+		i915_guc_submit(cursor);
+		last = cursor;
+		submit = true;
+	}
+	if (submit) {
+		i915_gem_request_assign(&port->request, last);
+		nested_enable_signaling(last);
+		engine->execlist_first = rb;
+	}
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+
+	return submit;
+}
+
+static void i915_guc_irq_handler(unsigned long data)
+{
+	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *rq;
+	bool submit;
+
+	do {
+		rq = port[0].request;
+		while (rq && i915_gem_request_completed(rq)) {

[Intel-gfx] [PATCH v4] drm/i915/scheduler: emulate a scheduler for guc

2017-01-11 Thread Chris Wilson
This emulates execlists on top of the GuC in order to defer submission of
requests to the hardware. This deferral allows time for high-priority
requests to gazump their way to the head of the queue; however, it nerfs
the GuC by converting it back into a simple execlist (where the CPU has
to wake up after every request to feed new commands into the GuC).

v2: Drop hack status - though iirc there is still a lockdep inversion
between fence and engine->timeline->lock (which is impossible as the
nesting only occurs on different fences - hopefully just requires some
judicious lockdep annotation)
v3: Apply lockdep nesting to enabling signaling on the request, using
the pattern we already have in __i915_gem_request_submit();
v4: Replaying requests after a hang also now needs the timeline
spinlock, to disable the interrupts at least

Signed-off-by: Chris Wilson 
Cc: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 95 +++---
 drivers/gpu/drm/i915/i915_irq.c            |  4 +-
 drivers/gpu/drm/i915/intel_lrc.c           |  5 +-
 3 files changed, 92 insertions(+), 12 deletions(-)

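A rough userspace model of the two-port coalescing rule implemented by
i915_guc_dequeue() in the patch below (toy types and values, not driver
code): consecutive requests from one context share a port, a context
switch advances to the second port, and dequeuing stops once a third
context would be needed:

#include <stdio.h>

struct toy_request {
	int ctx;	/* context id */
	int seqno;
};

#define NPORTS 2

int main(void)
{
	/* already in priority order, as the rbtree walk would produce */
	const struct toy_request queue[] = {
		{ 1, 1 }, { 1, 2 }, { 2, 3 }, { 2, 4 }, { 3, 5 },
	};
	const struct toy_request *last = NULL;
	int port = 0;

	for (unsigned int i = 0; i < sizeof(queue) / sizeof(queue[0]); i++) {
		const struct toy_request *cursor = &queue[i];

		if (last && cursor->ctx != last->ctx) {
			if (port == NPORTS - 1)
				break;	/* both ports in use: stop */
			port++;		/* close port 0, move to port 1 */
		}
		printf("submit ctx=%d seqno=%d via port %d\n",
		       cursor->ctx, cursor->seqno, port);
		last = cursor;
	}
	/* ctx 3 stays queued for the next dequeue pass */
	return 0;
}
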
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 913d87358972..2f0a853f820a 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -350,7 +350,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 	u32 freespace;
 	int ret;
 
-	spin_lock(&client->wq_lock);
+	spin_lock_irq(&client->wq_lock);
 	freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
 	freespace -= client->wq_rsvd;
 	if (likely(freespace >= wqi_size)) {
@@ -360,7 +360,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 		client->no_wq_space++;
 		ret = -EAGAIN;
 	}
-	spin_unlock(&client->wq_lock);
+	spin_unlock_irq(&client->wq_lock);
 
 	return ret;
 }
@@ -372,9 +372,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
 
 	GEM_BUG_ON(READ_ONCE(client->wq_rsvd) < wqi_size);
 
-	spin_lock(&client->wq_lock);
+	spin_lock_irq(&client->wq_lock);
 	client->wq_rsvd -= wqi_size;
-	spin_unlock(&client->wq_lock);
+	spin_unlock_irq(&client->wq_lock);
 }
 
 /* Construct a Work Item and append it to the GuC's Work Queue */
@@ -534,10 +534,87 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq)
 
 static void i915_guc_submit(struct drm_i915_gem_request *rq)
 {
-	i915_gem_request_submit(rq);
+	__i915_gem_request_submit(rq);
 	__i915_guc_submit(rq);
 }
 
+static void nested_enable_signaling(struct drm_i915_gem_request *rq)
+{
+	if (test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+			     &rq->fence.flags))
+		return;
+
+	GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags));
+
+	spin_lock_nested(&rq->lock, SINGLE_DEPTH_NESTING);
+	intel_engine_enable_signaling(rq);
+	spin_unlock(&rq->lock);
+}
+
+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
+{
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *last = port[0].request;
+	unsigned long flags;
+	struct rb_node *rb;
+	bool submit = false;
+
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
+		if (last && cursor->ctx != last->ctx) {
+			if (port != engine->execlist_port)
+				break;
+
+			i915_gem_request_assign(&port->request, last);
+			nested_enable_signaling(last);
+			port++;
+		}
+
+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
+		i915_guc_submit(cursor);
+		last = cursor;
+		submit = true;
+	}
+	if (submit) {
+		i915_gem_request_assign(&port->request, last);
+		nested_enable_signaling(last);
+		engine->execlist_first = rb;
+	}
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+
+	return submit;
+}
+
+static void i915_guc_irq_handler(unsigned long data)
+{
+	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *rq;
+	bool submit;
+
+	do {
+		rq = port[0].request;
+		while (rq && i915_gem_request_completed(rq)) {
+			i915_gem_request_put(rq);
+			rq = port[1].request;
+