Re: [Intel-gfx] [PATCH 09/12] HACK drm/i915/scheduler: emulate a scheduler for guc

2016-11-03 Thread Tvrtko Ursulin


On 02/11/2016 17:50, Chris Wilson wrote:

This emulates execlists on top of the GuC in order to defer submission of
requests to the hardware. This deferral allows time for high-priority
requests to gazump their way to the head of the queue; however, it nerfs
the GuC by converting it back into a simple execlist (where the CPU has
to wake up after every request to feed new commands into the GuC).
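
To make the deferral concrete: instead of writing each request into the GuC
work queue the moment it is ready, submission parks it in a priority-ordered
software queue, and a later pass (the tasklet added below) drains that queue
into the firmware, so a high-priority request that arrives during the window
overtakes earlier low-priority work. A minimal userspace sketch of the idea
(structure and names are illustrative, not the driver's):

/*
 * Toy model of priority-deferred submission: requests are parked in a
 * queue sorted by priority instead of going straight to "hardware",
 * so a late high-priority request can overtake earlier work.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_request {
    int seqno;
    int priority;
    struct toy_request *next;
};

static struct toy_request *pending;

/* Defer: insert in priority order (FIFO within equal priority). */
static void toy_submit(struct toy_request *rq)
{
    struct toy_request **p = &pending;

    while (*p && (*p)->priority >= rq->priority)
        p = &(*p)->next;
    rq->next = *p;
    *p = rq;
}

/* Later, a tasklet-like pass feeds the head of the queue onward. */
static void toy_dequeue_all(void)
{
    while (pending) {
        struct toy_request *rq = pending;

        pending = rq->next;
        printf("submitting seqno=%d prio=%d\n", rq->seqno, rq->priority);
        free(rq);
    }
}

static void toy_queue(int seqno, int priority)
{
    struct toy_request *rq = malloc(sizeof(*rq));

    rq->seqno = seqno;
    rq->priority = priority;
    toy_submit(rq);
}

int main(void)
{
    toy_queue(1, 0);
    toy_queue(2, 0);
    toy_queue(3, 1024);    /* arrives last, but... */
    toy_dequeue_all();     /* ...is handed to the "GuC" first */
    return 0;
}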


How big is the performance hit? :)

Regards,

Tvrtko


---
 drivers/gpu/drm/i915/i915_guc_submission.c | 83 ++
 drivers/gpu/drm/i915/i915_irq.c            |  4 +-
 drivers/gpu/drm/i915/intel_lrc.c           |  3 --
 3 files changed, 75 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index bab0c2fc3bce..601b8777d3fd 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -469,7 +469,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
u32 freespace;
int ret;

-   spin_lock(&gc->wq_lock);
+   spin_lock_irq(&gc->wq_lock);
freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
freespace -= gc->wq_rsvd;
if (likely(freespace >= wqi_size)) {
@@ -479,7 +479,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
gc->no_wq_space++;
ret = -EAGAIN;
}
-   spin_unlock(&gc->wq_lock);
+   spin_unlock_irq(&gc->wq_lock);

return ret;
 }
@@ -491,9 +491,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)

GEM_BUG_ON(READ_ONCE(gc->wq_rsvd) < wqi_size);

-   spin_lock(&gc->wq_lock);
+   spin_lock_irq(&gc->wq_lock);
gc->wq_rsvd -= wqi_size;
-   spin_unlock(&gc->wq_lock);
+   spin_unlock_irq(&gc->wq_lock);
 }

 /* Construct a Work Item and append it to the GuC's Work Queue */
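
The switch from spin_lock() to spin_lock_irq() above is presumably required
because the reserve/unreserve paths can now race with the new tasklet, which
runs in softirq context. The reservation arithmetic itself is a standard
circular-buffer space check. A minimal userspace model of it, assuming (as
the driver does) a power-of-two queue size so the kernel's CIRC_* macros
from linux/circ_buf.h apply; locking and the real GuC structures are
omitted, and all names here are mine:

#include <stdio.h>

/* As defined in linux/circ_buf.h. */
#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

struct toy_wq {
    unsigned int tail;    /* next byte the CPU will write */
    unsigned int head;    /* next byte the GuC will consume */
    unsigned int size;    /* must be a power of two */
    unsigned int rsvd;    /* space promised to requests not yet written */
};

/* Mirrors the logic of i915_guc_wq_reserve(): count outstanding
 * reservations against the free space so over-commit is impossible. */
static int toy_wq_reserve(struct toy_wq *wq, unsigned int wqi_size)
{
    unsigned int freespace = CIRC_SPACE(wq->tail, wq->head, wq->size);

    freespace -= wq->rsvd;
    if (freespace >= wqi_size) {
        wq->rsvd += wqi_size;
        return 0;
    }
    return -1;    /* -EAGAIN in the driver: retry after the GuC drains */
}

int main(void)
{
    struct toy_wq wq = { .tail = 0, .head = 0, .size = 4096, .rsvd = 0 };
    int n = 0;

    while (toy_wq_reserve(&wq, 64) == 0)
        n++;
    printf("reserved %d work items before filling the queue\n", n);
    return 0;
}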
@@ -658,6 +658,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
	spin_unlock(&client->wq_lock);
 }

+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
+{
+   struct execlist_port *port = engine->execlist_port;
+   struct drm_i915_gem_request *last = port[0].request;
+   unsigned long flags;
+   struct rb_node *rb;
+   bool submit = false;
+
+   spin_lock_irqsave(&engine->timeline->lock, flags);
+   rb = engine->execlist_first;
+   while (rb) {
+   struct drm_i915_gem_request *cursor =
+   rb_entry(rb, typeof(*cursor), priotree.node);
+
+   if (last && cursor->ctx != last->ctx) {
+   if (port != engine->execlist_port)
+   break;
+
+   i915_gem_request_assign(&port->request, last);
+   dma_fence_enable_sw_signaling(&last->fence);
+   port++;
+   }
+
+   rb = rb_next(rb);
+   rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+   RB_CLEAR_NODE(>priotree.node);
+   cursor->priotree.priority = INT_MAX;
+
+   i915_guc_submit(cursor);
+   last = cursor;
+   submit = true;
+   }
+   if (submit) {
+   i915_gem_request_assign(&port->request, last);
+   dma_fence_enable_sw_signaling(&last->fence);
+   engine->execlist_first = rb;
+   }
+   spin_unlock_irqrestore(&engine->timeline->lock, flags);
+
+   return submit;
+}
+
+static void i915_guc_irq_handler(unsigned long data)
+{
+   struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+   struct execlist_port *port = engine->execlist_port;
+   struct drm_i915_gem_request *rq;
+   bool submit;
+
+   do {
+   rq = port[0].request;
+   while (rq && i915_gem_request_completed(rq)) {
+   i915_gem_request_put(rq);
+   rq = port[1].request;
+   port[0].request = rq;
+   port[1].request = NULL;
+   }
+
+   submit = false;
+   if (!port[1].request)
+   submit = i915_guc_dequeue(engine);
+   } while (submit);
+}
+
 /*
  * Everything below here is concerned with setup & teardown, and is
  * therefore not part of the somewhat time-critical batch-submission
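
Taken together, the two new functions above are the execlists emulation:
i915_guc_dequeue() drains the priority tree into the two execlist ports
(starting the second port only at a context boundary), and the tasklet
retires completed requests from port[0] before refilling. A stripped-down
userspace model of just the port bookkeeping, ignoring contexts, priorities
and locking (all names illustrative):

#include <stdbool.h>
#include <stdio.h>

struct toy_request {
    int id;
    bool completed;
};

static struct toy_request *port[2];    /* port[0] = oldest in flight */
static struct toy_request backlog[] = { { .id = 1 }, { .id = 2 }, { .id = 3 } };
static unsigned int backlog_head;

/* Fill any free port from the pending queue, as i915_guc_dequeue() does. */
static bool toy_dequeue(void)
{
    bool submit = false;

    while (!port[1] && backlog_head < sizeof(backlog) / sizeof(backlog[0])) {
        struct toy_request *rq = &backlog[backlog_head++];

        if (!port[0])
            port[0] = rq;
        else
            port[1] = rq;
        printf("submitted request %d\n", rq->id);
        submit = true;
    }
    return submit;
}

/* The tasklet body: retire completed work from the head port, then refill. */
static void toy_irq_handler(void)
{
    bool submit;

    do {
        while (port[0] && port[0]->completed) {
            printf("retired request %d\n", port[0]->id);
            port[0] = port[1];
            port[1] = NULL;
        }

        submit = false;
        if (!port[1])
            submit = toy_dequeue();
    } while (submit);
}

int main(void)
{
    toy_irq_handler();              /* fills both ports: 1 and 2 */
    backlog[0].completed = true;    /* "hardware" completes request 1 */
    toy_irq_handler();              /* retires 1, promotes 2, submits 3 */
    return 0;
}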
@@ -1524,16 +1588,13 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)

/* Take over from manual control of ELSP (execlists) */
for_each_engine(engine, dev_priv, id) {
-   engine->submit_request = i915_guc_submit;
-   engine->schedule = NULL;
+   tasklet_init(&engine->irq_tasklet,
+                i915_guc_irq_handler,
+                (unsigned long)engine);

/* Replay the current set of previously submitted requests */
-   list_for_each_entry(request,
-                       &engine->timeline->requests, link) {
+   list_for_each_entry(request, &engine->timeline->requests, link)
        client->wq_rsvd += sizeof(struct guc_wq_item);
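
The remaining hunks (i915_irq.c and intel_lrc.c, per the diffstat) are cut
off above; judging by its size, the i915_irq.c change presumably just
schedules the new irq_tasklet from the command-streamer interrupt. For
reference, the general init/schedule pairing behind tasklet_init() looks
like this hypothetical minimal module, using the old-style tasklet API
that the patch uses:

#include <linux/module.h>
#include <linux/interrupt.h>

static struct tasklet_struct demo_tasklet;

/* Runs in softirq context, like i915_guc_irq_handler() above. */
static void demo_handler(unsigned long data)
{
    pr_info("tasklet ran with data %lu\n", data);
}

static int __init demo_init(void)
{
    tasklet_init(&demo_tasklet, demo_handler, 42UL);
    tasklet_schedule(&demo_tasklet);    /* an irq handler would do this */
    return 0;
}

static void __exit demo_exit(void)
{
    tasklet_kill(&demo_tasklet);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");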

[Intel-gfx] [PATCH 09/12] HACK drm/i915/scheduler: emulate a scheduler for guc

2016-11-02 Thread Chris Wilson