Re: [Intel-gfx] [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities

2016-11-04 Thread Chris Wilson
On Thu, Nov 03, 2016 at 07:47:39PM +0000, Chris Wilson wrote:
> On Thu, Nov 03, 2016 at 04:21:25PM +0000, Tvrtko Ursulin wrote:
> > >+static void update_priorities(struct i915_priotree *pt, int prio)
> > >+{
> > >+  struct drm_i915_gem_request *request =
> > >+  container_of(pt, struct drm_i915_gem_request, priotree);
> > >+  struct intel_engine_cs *engine = request->engine;
> > >+  struct i915_dependency *dep;
> > >+
> > >+  if (prio <= READ_ONCE(pt->priority))
> > >+  return;
> > >+
> > >+  /* Recursively bump all dependent priorities to match the new request */
> > >+  list_for_each_entry(dep, &pt->pre_list, pre_link)
> > >+  update_priorities(dep->signal, prio);
> > 
> > John got in trouble from recursion in his scheduler, used for the
> > same thing AFAIR. Or was it the priority bumping? Either way, it
> > could be imperative to avoid it.

Spent some time tuning (but not very well) for very deep pipelines:

static struct intel_engine_cs *
pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
{
        struct intel_engine_cs *engine;

        engine = container_of(pt,
                              struct drm_i915_gem_request,
                              priotree)->engine;
        if (engine != locked) {
                if (locked)
                        spin_unlock_irq(&locked->timeline->lock);
                spin_lock_irq(&engine->timeline->lock);
        }

        return engine;
}

static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
{
        struct intel_engine_cs *engine = NULL;
        struct i915_dependency *dep, *p;
        struct i915_dependency stack;
        LIST_HEAD(dfs);

        if (prio <= READ_ONCE(request->priotree.priority))
                return;

        /* Need BKL in order to use the temporary link inside i915_dependency */
        lockdep_assert_held(&request->i915->drm.struct_mutex);

        stack.signal = &request->priotree;
        list_add(&stack.dfs_link, &dfs);

        /* Recursively bump all dependent priorities to match the new request */
        list_for_each_entry_safe(dep, p, &dfs, dfs_link) {
                struct i915_priotree *pt = dep->signal;

                list_for_each_entry(p, &pt->pre_list, pre_link)
                        if (prio > READ_ONCE(p->signal->priority))
                                list_move_tail(&p->dfs_link, &dfs);

                p = list_first_entry(&dep->dfs_link, typeof(*p), dfs_link);
                if (!RB_EMPTY_NODE(&pt->node))
                        continue;

                engine = pt_lock_engine(pt, engine);

                if (prio > pt->priority && RB_EMPTY_NODE(&pt->node)) {
                        pt->priority = prio;
                        list_del_init(&dep->dfs_link);
                }
        }

        /* Fifo and depth-first replacement ensure our deps execute before us */
        list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
                struct i915_priotree *pt = dep->signal;

                INIT_LIST_HEAD(&dep->dfs_link);

                engine = pt_lock_engine(pt, engine);

                if (prio <= pt->priority)
                        continue;

                GEM_BUG_ON(RB_EMPTY_NODE(&pt->node));

                pt->priority = prio;
                rb_erase(&pt->node, &engine->execlist_queue);
                if (insert_request(pt, &engine->execlist_queue))
                        engine->execlist_first = &pt->node;
        }

        if (engine)
                spin_unlock_irq(&engine->timeline->lock);

        /* XXX Do we need to preempt to make room for us and our deps? */
}

But as always, any linear list scales poorly. It is just fortunate that
we typically don't see tens of thousands of requests in the pipeline that
need PI.
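For reference, insert_request() (not shown in this excerpt) is just the
priority-sorted rbtree insert used above; a minimal sketch along these
lines (a reconstruction, not quoted from the patch):

static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
{
        struct rb_node **p, *rb;
        bool first = true;

        /* Most positive priority is scheduled first; for equal priority,
         * walk right so that earlier submissions keep their fifo order.
         */
        rb = NULL;
        p = &root->rb_node;
        while (*p) {
                struct i915_priotree *pos;

                rb = *p;
                pos = rb_entry(rb, typeof(*pos), node);
                if (pt->priority > pos->priority) {
                        p = &rb->rb_left;
                } else {
                        p = &rb->rb_right;
                        first = false;
                }
        }
        rb_link_node(&pt->node, rb, p);
        rb_insert_color(&pt->node, root);

        return first; /* caller updates engine->execlist_first */
}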
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


Re: [Intel-gfx] [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities

2016-11-03 Thread Chris Wilson
On Thu, Nov 03, 2016 at 04:21:25PM +0000, Tvrtko Ursulin wrote:
> >+static void update_priorities(struct i915_priotree *pt, int prio)
> >+{
> >+struct drm_i915_gem_request *request =
> >+container_of(pt, struct drm_i915_gem_request, priotree);
> >+struct intel_engine_cs *engine = request->engine;
> >+struct i915_dependency *dep;
> >+
> >+if (prio <= READ_ONCE(pt->priority))
> >+return;
> >+
> >+/* Recursively bump all dependent priorities to match the new request */
> >+list_for_each_entry(dep, &pt->pre_list, pre_link)
> >+update_priorities(dep->signal, prio);
> 
> John got in trouble from recursion in his scheduler, used for the
> same thing AFAIR. Or was it the priority bumping? Either way, it
> could be imperative to avoid it.

static void update_priority(struct i915_priotree *pt, int prio)
{
        struct drm_i915_gem_request *request =
                container_of(pt, struct drm_i915_gem_request, priotree);
        struct intel_engine_cs *engine = request->engine;

        spin_lock_irq(&engine->timeline->lock);
        if (prio > pt->priority) {
                pt->priority = prio;
                if (!RB_EMPTY_NODE(&pt->node)) {
                        rb_erase(&pt->node, &engine->execlist_queue);
                        if (insert_request(pt, &engine->execlist_queue))
                                engine->execlist_first = &pt->node;
                }
        }
        spin_unlock_irq(&engine->timeline->lock);
}

static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
{
        struct i915_dependency *dep, *p;
        LIST_HEAD(dfs);

        if (prio <= READ_ONCE(request->priotree.priority))
                return;

        /* Need BKL in order to use the temporary link inside i915_dependency */
        lockdep_assert_held(&request->i915->drm.struct_mutex);

        /* Recursively bump all dependent priorities to match the new request */
        list_for_each_entry(dep, &request->priotree.pre_list, pre_link)
                if (prio > READ_ONCE(dep->signal->priority))
                        list_add_tail(&dep->dfs_link, &dfs);

        list_for_each_entry(dep, &dfs, dfs_link) {
                list_for_each_entry(p, &dep->signal->pre_list, pre_link) {
                        GEM_BUG_ON(p == dep);
                        if (prio > READ_ONCE(p->signal->priority))
                                list_move_tail(&p->dfs_link, &dfs);
                }
        }

        /* Fifo and depth-first replacement ensure our deps execute before us */
        list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
                update_priority(dep->signal, prio);
                INIT_LIST_HEAD(&dep->dfs_link);
        }
        update_priority(&request->priotree, prio);

        /* XXX Do we need to preempt to make room for us and our deps? */
}

Still ugh. Its only saving grace, I think, is that we don't chase beyond
the priorities we need to bump.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


Re: [Intel-gfx] [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities

2016-11-03 Thread Chris Wilson
On Thu, Nov 03, 2016 at 04:21:25PM +0000, Tvrtko Ursulin wrote:
> >+rb = rb_next(rb);
> >+rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> >+RB_CLEAR_NODE(&cursor->priotree.node);
> >+cursor->priotree.priority = INT_MAX;
> >+
> >+/* We keep the previous context alive until we retire the
> >+ * following request. This ensures that the context object
> >+ * is still pinned for any residual writes the HW makes into
> >+ * it on the context switch into the next object following the
> >+ * breadcrumb. Otherwise, we may retire the context too early.
> >+ */
> >+cursor->previous_context = engine->last_context;
> >+engine->last_context = cursor->ctx;
> >+
> 
> Will we need a knob to control the amount of context merging?
> Until preemption that is, similar to the GuC queue depth "nerfing" from
> the other patch.

Right, timeslicing + preemption is perhaps the best answer here. A
knob here would be one of those policy decisions that we want to avoid
:) Not quite sure how we would go about autotuning it. A GL workload
looks quite different to a VK workload, and quite different again to a
CL workload. One can only imagine the horrors on the transcode side :)

Making those decisions is the real heart of a scheduler.

This code should be more like:

while (req = scheduler.next_request()) {
        if (!is_compatible(port, req)) {
                if (port++)
                        break;
        }

        finish_request(req);
        scheduler.remove_request(req);

        port = req;
}
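(The port being juggled here appears to correspond to the two ELSP
submission slots -- execlist_port[0]/[1] in the diff below: keep folding
compatible requests into the current port, move to the second on an
incompatible one, and stop once both are claimed.)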

Given that at least this block of code is duplicated for the GuC, maybe
it is worth starting on. I was trying to avoid doing that because, in my
mind, it is the preemption that is the complex part and should drive the
next steps.

> >+static void update_priorities(struct i915_priotree *pt, int prio)
> >+{
> >+struct drm_i915_gem_request *request =
> >+container_of(pt, struct drm_i915_gem_request, priotree);
> >+struct intel_engine_cs *engine = request->engine;
> >+struct i915_dependency *dep;
> >+
> >+if (prio <= READ_ONCE(pt->priority))
> >+return;
> >+
> >+/* Recursively bump all dependent priorities to match the new request */
> >+list_for_each_entry(dep, &pt->pre_list, pre_link)
> >+update_priorities(dep->signal, prio);
> 
> John got in trouble from recursion in his scheduler, used for the
> same thing AFAIR. Or was it the priority bumping? Either way, it
> could be imperative to avoid it.

Yup. I haven't thrown 100,000 requests at it for this very reason.

Hmm, if I put an additional struct list_head into i915_dependency, we
can build a search list using it (praise be to the BKL).
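Roughly, something like this (a sketch using this series' field names;
the exact layout is an assumption, not quoted from a patch):

struct i915_dependency {
        struct i915_priotree *signal;   /* the request we must wait upon */
        struct list_head pre_link, post_link;
        struct list_head dfs_link;      /* scratch link for the search list;
                                         * only valid under struct_mutex */
};

With dfs_link only ever touched under struct_mutex, the recursive walk
becomes an iterative worklist -- which is what the follow-ups above do.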
 
> >+/* Fifo and depth-first replacement ensure our deps execute before us */
> >+spin_lock_irq(&engine->timeline->lock);
> >+pt->priority = prio;
> >+if (!RB_EMPTY_NODE(&pt->node)) {
> >+rb_erase(&pt->node, &engine->execlist_queue);
> >+if (insert_request(pt, &engine->execlist_queue))
> >+engine->execlist_first = &pt->node;
> >+}
> >+spin_unlock_irq(&engine->timeline->lock);
> >+}
> >+
> >+static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
> >+{
> >+/* Make sure that our entire dependency chain shares our prio */
> >+update_priorities(&request->priotree, prio);
> >+
> >+/* XXX Do we need to preempt to make room for us and our deps? */
> >+}
> 
> Hm, why isn't scheduling backend agnostic? Makes it much simpler to
> do the first implementation like this?

The scheduling is? The scheduling is just the policy. But we do need
entry points to call into the scheduler, such as a change in the request
tree (here), completion of a request (context-switch interrupt) or
completion of a time slice. engine->schedule() is not the best name,
perhaps schedule_request(). Using engine is mostly a placeholder (a
convenient place to store a vfunc), but I do think we will end up with
different interactions with the scheduler based on the backend (in
particular feeding the GuC with dependency information). So, yes, this
is the easy route that should evolve as the need changes; we just pencil
in the different callbacks as entry points where the backend and
scheduler should collude.
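As a sketch of that wiring (engine->schedule is the name used in this
patch; the surrounding layout is illustrative only):

/* intel_ringbuffer.h: the backend-provided scheduling entry point */
struct intel_engine_cs {
        /* ... */
        void (*schedule)(struct drm_i915_gem_request *request, int prio);
};

/* intel_lrc.c: the execlists backend installs its policy hook */
engine->schedule = execlists_schedule;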
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


Re: [Intel-gfx] [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities

2016-11-03 Thread Tvrtko Ursulin


Hi,

1st pass comments only for now.

On 02/11/2016 17:50, Chris Wilson wrote:

Track the priority of each request and use it to determine the order in
which we submit requests to the hardware via execlists.

The priority of the request is determined by the user (eventually via
the context) but may be overridden at any time by the driver. When we set
the priority of the request, we bump the priority of all of its
dependencies to match - so that a high priority drawing operation is not
stuck behind a background task.

When the request is ready to execute (i.e. we have signaled the submit
fence following completion of all its dependencies, including third
party fences), we put the request into a priority sorted rbtree to be
submitted to the hardware. If the request is higher priority than all
pending requests, it will be submitted on the next context-switch
interrupt as soon as the hardware has completed the current request. We
do not currently preempt any current execution to immediately run a very
high priority request, at least not yet.

One more limitation is that this first implementation is for
execlists only, so it is currently limited to gen8/gen9.

Signed-off-by: Chris Wilson 
---
 drivers/gpu/drm/i915/i915_debugfs.c|   7 +-
 drivers/gpu/drm/i915/i915_gem.c|   3 +-
 drivers/gpu/drm/i915/i915_gem_request.c|   4 ++
 drivers/gpu/drm/i915/i915_gem_request.h|   6 +-
 drivers/gpu/drm/i915/i915_guc_submission.c |   4 ++
 drivers/gpu/drm/i915/intel_engine_cs.c |   3 +-
 drivers/gpu/drm/i915/intel_lrc.c   | 104 -
 drivers/gpu/drm/i915/intel_ringbuffer.h|   3 +-
 8 files changed, 109 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 3cb96d260dfb..dac435680e98 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -631,8 +631,9 @@ static void print_request(struct seq_file *m,
  struct drm_i915_gem_request *rq,
  const char *prefix)
 {
-   seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix,
+   seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix,


Noticed this is quite unreadable due to the range of INT_MIN to INT_MAX.
Do we need such granularity, or could we use something smaller so it
looks nicer here? I know, the argument is quite poor. :)



   rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno,
+  rq->priotree.priority,
   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
   rq->timeline->common->name);
 }
@@ -3218,6 +3219,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)

if (i915.enable_execlists) {
u32 ptr, read, write;
+   struct rb_node *rb;

seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
@@ -3257,7 +3259,8 @@ static int i915_engine_info(struct seq_file *m, void *unused)
rcu_read_unlock();

spin_lock_irq(&engine->timeline->lock);
-   list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
+   for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
+   rq = rb_entry(rb, typeof(*rq), priotree.node);
print_request(m, rq, "\t\tQ ");
}
spin_unlock_irq(&engine->timeline->lock);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index cf28666021f8..4697848ecfd9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2687,10 +2687,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)

if (i915.enable_execlists) {
spin_lock_irq(&engine->timeline->lock);
-   INIT_LIST_HEAD(&engine->execlist_queue);
i915_gem_request_put(engine->execlist_port[0].request);
i915_gem_request_put(engine->execlist_port[1].request);
memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
+   engine->execlist_queue = RB_ROOT;
+   engine->execlist_first = NULL;
spin_unlock_irq(&engine->timeline->lock);
}
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 13090f226203..5f0068fac3e9 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -141,6 +141,8 @@ i915_priotree_fini(struct i915_priotree *pt)
 {
struct i915_dependency *dep, *next;

+   GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
+
/* Everyone we depended upon should retire before us and remove
 * themselves from our list. However, retirement is run independently
 * on each timeline and so we may be called out-of-order.

[Intel-gfx] [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities

2016-11-02 Thread Chris Wilson
Track the priority of each request and use it to determine the order in
which we submit requests to the hardware via execlists.

The priority of the request is determined by the user (eventually via
the context) but may be overridden at any time by the driver. When we set
the priority of the request, we bump the priority of all of its
dependencies to match - so that a high priority drawing operation is not
stuck behind a background task.

When the request is ready to execute (i.e. we have signaled the submit
fence following completion of all its dependencies, including third
party fences), we put the request into a priority sorted rbtree to be
submitted to the hardware. If the request is higher priority than all
pending requests, it will be submitted on the next context-switch
interrupt as soon as the hardware has completed the current request. We
do not currently preempt any current execution to immediately run a very
high priority request, at least not yet.

One more limitation is that this first implementation is for
execlists only, so it is currently limited to gen8/gen9.

Signed-off-by: Chris Wilson 
---
 drivers/gpu/drm/i915/i915_debugfs.c|   7 +-
 drivers/gpu/drm/i915/i915_gem.c|   3 +-
 drivers/gpu/drm/i915/i915_gem_request.c|   4 ++
 drivers/gpu/drm/i915/i915_gem_request.h|   6 +-
 drivers/gpu/drm/i915/i915_guc_submission.c |   4 ++
 drivers/gpu/drm/i915/intel_engine_cs.c |   3 +-
 drivers/gpu/drm/i915/intel_lrc.c   | 104 -
 drivers/gpu/drm/i915/intel_ringbuffer.h|   3 +-
 8 files changed, 109 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 3cb96d260dfb..dac435680e98 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -631,8 +631,9 @@ static void print_request(struct seq_file *m,
  struct drm_i915_gem_request *rq,
  const char *prefix)
 {
-   seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix,
+   seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix,
   rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno,
+  rq->priotree.priority,
   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
   rq->timeline->common->name);
 }
@@ -3218,6 +3219,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 
if (i915.enable_execlists) {
u32 ptr, read, write;
+   struct rb_node *rb;
 
seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
@@ -3257,7 +3259,8 @@ static int i915_engine_info(struct seq_file *m, void *unused)
rcu_read_unlock();
 
spin_lock_irq(&engine->timeline->lock);
-   list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
+   for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
+   rq = rb_entry(rb, typeof(*rq), priotree.node);
print_request(m, rq, "\t\tQ ");
}
spin_unlock_irq(&engine->timeline->lock);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index cf28666021f8..4697848ecfd9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2687,10 +2687,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
 
if (i915.enable_execlists) {
spin_lock_irq(&engine->timeline->lock);
-   INIT_LIST_HEAD(&engine->execlist_queue);
i915_gem_request_put(engine->execlist_port[0].request);
i915_gem_request_put(engine->execlist_port[1].request);
memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
+   engine->execlist_queue = RB_ROOT;
+   engine->execlist_first = NULL;
spin_unlock_irq(&engine->timeline->lock);
}
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 13090f226203..5f0068fac3e9 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -141,6 +141,8 @@ i915_priotree_fini(struct i915_priotree *pt)
 {
struct i915_dependency *dep, *next;
 
+   GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
+
/* Everyone we depended upon should retire before us and remove
 * themselves from our list. However, retirement is run independently
 * on each timeline and so we may be called out-of-order.
@@ -164,6 +166,8 @@ i915_priotree_init(struct i915_priotree *pt)
 {
+   INIT_LIST_HEAD(&pt->pre_list);
+   INIT_LIST_HEAD(&pt->post_list);
+   RB_CLEAR_NODE(&pt->node);
+   pt->priority = INT_MIN;
 }
 
 void