Re: [Intel-gfx] [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities
On Thu, Nov 03, 2016 at 07:47:39PM +0000, Chris Wilson wrote: > On Thu, Nov 03, 2016 at 04:21:25PM +0000, Tvrtko Ursulin wrote: > > >+static void update_priorities(struct i915_priotree *pt, int prio) > > >+{ > > >+ struct drm_i915_gem_request *request = > > >+ container_of(pt, struct drm_i915_gem_request, priotree); > > >+ struct intel_engine_cs *engine = request->engine; > > >+ struct i915_dependency *dep; > > >+ > > >+ if (prio <= READ_ONCE(pt->priority)) > > >+ return; > > >+ > > >+ /* Recursively bump all dependent priorities to match the new request */ > > >+ list_for_each_entry(dep, &pt->pre_list, pre_link) > > >+ update_priorities(dep->signal, prio); > > > > John got in trouble from recursion in his scheduler, used for the > > same thing AFAIR. Or was it the priority bumping? Either way, it > > could be imperative to avoid it. Spent some time tuning (but not very well) for very deep pipelines: static struct intel_engine_cs * pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked) { struct intel_engine_cs *engine; engine = container_of(pt, struct drm_i915_gem_request, priotree)->engine; if (engine != locked) { if (locked) spin_unlock_irq(&locked->timeline->lock); spin_lock_irq(&engine->timeline->lock); } return engine; } static void execlists_schedule(struct drm_i915_gem_request *request, int prio) { struct intel_engine_cs *engine = NULL; struct i915_dependency *dep, *p; struct i915_dependency stack; LIST_HEAD(dfs); if (prio <= READ_ONCE(request->priotree.priority)) return; /* Need BKL in order to use the temporary link inside i915_dependency */ lockdep_assert_held(&request->i915->drm.struct_mutex); stack.signal = &request->priotree; list_add(&stack.dfs_link, &dfs); /* Recursively bump all dependent priorities to match the new request */ list_for_each_entry_safe(dep, p, &dfs, dfs_link) { struct i915_priotree *pt = dep->signal; list_for_each_entry(p, &pt->pre_list, pre_link) if (prio > READ_ONCE(p->signal->priority)) list_move_tail(&p->dfs_link, &dfs); p = list_first_entry(&dep->dfs_link, typeof(*p), dfs_link); if
(!RB_EMPTY_NODE(&pt->node)) continue; engine = pt_lock_engine(pt, engine); if (prio > pt->priority && RB_EMPTY_NODE(&pt->node)) { pt->priority = prio; list_del_init(&dep->dfs_link); } } /* Fifo and depth-first replacement ensure our deps execute before us */ list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) { struct i915_priotree *pt = dep->signal; INIT_LIST_HEAD(&dep->dfs_link); engine = pt_lock_engine(pt, engine); if (prio <= pt->priority) continue; GEM_BUG_ON(RB_EMPTY_NODE(&pt->node)); pt->priority = prio; rb_erase(&pt->node, &engine->execlist_queue); if (insert_request(pt, &engine->execlist_queue)) engine->execlist_first = &pt->node; } if (engine) spin_unlock_irq(&engine->timeline->lock); /* XXX Do we need to preempt to make room for us and our deps? */ } But as always any linear list scales poorly. It is just fortunate that typically we don't see 10,000s of requests in the pipeline that need PI. -Chris -- Chris Wilson, Intel Open Source Technology Centre ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities
On Thu, Nov 03, 2016 at 04:21:25PM +0000, Tvrtko Ursulin wrote: > >+static void update_priorities(struct i915_priotree *pt, int prio) > >+{ > >+struct drm_i915_gem_request *request = > >+container_of(pt, struct drm_i915_gem_request, priotree); > >+struct intel_engine_cs *engine = request->engine; > >+struct i915_dependency *dep; > >+ > >+if (prio <= READ_ONCE(pt->priority)) > >+return; > >+ > >+/* Recursively bump all dependent priorities to match the new request */ > >+list_for_each_entry(dep, &pt->pre_list, pre_link) > >+update_priorities(dep->signal, prio); > > John got in trouble from recursion in his scheduler, used for the > same thing AFAIR. Or was it the priority bumping? Either way, it > could be imperative to avoid it. static void update_priority(struct i915_priotree *pt, int prio) { struct drm_i915_gem_request *request = container_of(pt, struct drm_i915_gem_request, priotree); struct intel_engine_cs *engine = request->engine; spin_lock_irq(&engine->timeline->lock); if (prio > pt->priority) { pt->priority = prio; if (!RB_EMPTY_NODE(&pt->node)) { rb_erase(&pt->node, &engine->execlist_queue); if (insert_request(pt, &engine->execlist_queue)) engine->execlist_first = &pt->node; } } spin_unlock_irq(&engine->timeline->lock); } static void execlists_schedule(struct drm_i915_gem_request *request, int prio) { struct i915_dependency *dep, *p; LIST_HEAD(dfs); if (prio <= READ_ONCE(request->priotree.priority)) return; /* Need BKL in order to use the temporary link inside i915_dependency */ lockdep_assert_held(&request->i915->drm.struct_mutex); /* Recursively bump all dependent priorities to match the new request */ list_for_each_entry(dep, &request->priotree.pre_list, pre_link) if (prio > READ_ONCE(dep->signal->priority)) list_add_tail(&dep->dfs_link, &dfs); list_for_each_entry(dep, &dfs, dfs_link) { list_for_each_entry(p, &dep->signal->pre_list, pre_link) { GEM_BUG_ON(p == dep); if (prio > READ_ONCE(p->signal->priority)) list_move_tail(&p->dfs_link, &dfs); } } /* Fifo and depth-first replacement ensure our deps execute before us */
list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) { update_priority(dep->signal, prio); INIT_LIST_HEAD(&dep->dfs_link); } update_priority(&request->priotree, prio); /* XXX Do we need to preempt to make room for us and our deps? */ } Still ugh. I think that we don't chase beyond the priorities we need to bump is its only saving grace. -Chris -- Chris Wilson, Intel Open Source Technology Centre ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities
On Thu, Nov 03, 2016 at 04:21:25PM +0000, Tvrtko Ursulin wrote: > >+rb = rb_next(rb); > >+rb_erase(&cursor->priotree.node, &engine->execlist_queue); > >+RB_CLEAR_NODE(&cursor->priotree.node); > >+cursor->priotree.priority = INT_MAX; > >+ > >+/* We keep the previous context alive until we retire the > >+ * following request. This ensures that any the context object > >+ * is still pinned for any residual writes the HW makes into > >+ * it on the context switch into the next object following the > >+ * breadcrumb. Otherwise, we may retire the context too early. > >+ */ > >+cursor->previous_context = engine->last_context; > >+engine->last_context = cursor->ctx; > >+ > > Will we will need a knob to control the amount of context merging? > Until preemption that is, similar to GuC queue depth "nerfing" from > the other patch. Right, timeslicing + preemption is the perhaps the best answer here. A knob here would be one of those policy decisions that we want to avoid :) Not quite sure how we would go about autotuning it. A GL workload looks quite different to a VK workload and quite different again to a CL workload. One can only imagine the horrors on the transcode side :) Making those decisions is the real heart of a scheduler. This code should be more like: while (req = scheduler.next_request()) { if (!is_compatible(port, req)) { if (port++) break; } finish_request(req); scheduler.remove_request(req); port = req; } Given that at least this block of code is duplicated for the guc, maybe it is worth starting. I was trying to avoid doing that because in my mind it is the preemption that is the complex part and should drive the next steps.
> >+static void update_priorities(struct i915_priotree *pt, int prio) > >+{ > >+struct drm_i915_gem_request *request = > >+container_of(pt, struct drm_i915_gem_request, priotree); > >+struct intel_engine_cs *engine = request->engine; > >+struct i915_dependency *dep; > >+ > >+if (prio <= READ_ONCE(pt->priority)) > >+return; > >+ > >+/* Recursively bump all dependent priorities to match the new request */ > >+list_for_each_entry(dep, &pt->pre_list, pre_link) > >+update_priorities(dep->signal, prio); > > John got in trouble from recursion in his scheduler, used for the > same thing AFAIR. Or was it the priority bumping? Either way, it > could be imperative to avoid it. Yup. I haven't thrown a 100,000 requests at it for this very reason. Hmm, if I put an additional struct list_head into i915_dependency, we can build a search list using it (praise be to the BKL). > >+/* Fifo and depth-first replacement ensure our deps execute before us */ > >+spin_lock_irq(&engine->timeline->lock); > >+pt->priority = prio; > >+if (!RB_EMPTY_NODE(&pt->node)) { > >+rb_erase(&pt->node, &engine->execlist_queue); > >+if (insert_request(pt, &engine->execlist_queue)) > >+engine->execlist_first = &pt->node; > >+} > >+spin_unlock_irq(&engine->timeline->lock); > >+} > >+ > >+static void execlists_schedule(struct drm_i915_gem_request *request, int > >prio) > >+{ > >+/* Make sure that our entire dependency chain shares our prio */ > >+update_priorities(&request->priotree, prio); > >+ > >+/* XXX Do we need to preempt to make room for us and our deps? */ > >+} > > Hm, why isn't scheduling backend agnostic? Makes it much simpler to > do the first implementation like this? The scheduling is? The scheduling is just the policy. But we do need entry points to call into the scheduler, such as change in the request tree (here), completion of a request (context-switch interrupt) or completion of a time slice. engine->schedule() is not the best name, perhaps schedule_request().
Using engine is mostly a placeholder (a convenient place to store a vfunc), but I do think we will end up with different interactions with the scheduler based on the backend (in particular feeding the GuC with dependency information). So, yes, this is the easy route that should evolve as the need changes, and just pencil the different callbacks as entry points where the backend and scheduler should collude. -Chris -- Chris Wilson, Intel Open Source Technology Centre ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities
Hi, 1st pass comments only for now. On 02/11/2016 17:50, Chris Wilson wrote: Track the priority of each request and use it to determine the order in which we submit requests to the hardware via execlists. The priority of the request is determined by the user (eventually via the context) but may be overridden at any time by the driver. When we set the priority of the request, we bump the priority of all of its dependencies to match - so that a high priority drawing operation is not stuck behind a background task. When the request is ready to execute (i.e. we have signaled the submit fence following completion of all its dependencies, including third party fences), we put the request into a priority sorted rbtree to be submitted to the hardware. If the request is higher priority than all pending requests, it will be submitted on the next context-switch interrupt as soon as the hardware has completed the current request. We do not currently preempt any current execution to immediately run a very high priority request, at least not yet. One more limitation, is that this is first implementation is for execlists only so currently limited to gen8/gen9. 
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- drivers/gpu/drm/i915/i915_debugfs.c| 7 +- drivers/gpu/drm/i915/i915_gem.c| 3 +- drivers/gpu/drm/i915/i915_gem_request.c| 4 ++ drivers/gpu/drm/i915/i915_gem_request.h| 6 +- drivers/gpu/drm/i915/i915_guc_submission.c | 4 ++ drivers/gpu/drm/i915/intel_engine_cs.c | 3 +- drivers/gpu/drm/i915/intel_lrc.c | 104 - drivers/gpu/drm/i915/intel_ringbuffer.h| 3 +- 8 files changed, 109 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 3cb96d260dfb..dac435680e98 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -631,8 +631,9 @@ static void print_request(struct seq_file *m, struct drm_i915_gem_request *rq, const char *prefix) { - seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix, + seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix, Noticed this is quite unreadable due the range of INT_MIN to INT_MAX. Do we need such granularity or could use something smaller so it looks nicer here? I know, the argument is quite poor.
:) rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno, + rq->priotree.priority, jiffies_to_msecs(jiffies - rq->emitted_jiffies), rq->timeline->common->name); } @@ -3218,6 +3219,7 @@ static int i915_engine_info(struct seq_file *m, void *unused) if (i915.enable_execlists) { u32 ptr, read, write; + struct rb_node *rb; seq_printf(m, "\tExeclist status: 0x%08x %08x\n", I915_READ(RING_EXECLIST_STATUS_LO(engine)), @@ -3257,7 +3259,8 @@ static int i915_engine_info(struct seq_file *m, void *unused) rcu_read_unlock(); spin_lock_irq(&engine->timeline->lock); - list_for_each_entry(rq, &engine->execlist_queue, execlist_link) { + for (rb = engine->execlist_first; rb; rb = rb_next(rb)) { + rq = rb_entry(rb, typeof(*rq), priotree.node); print_request(m, rq, "\t\tQ "); } spin_unlock_irq(&engine->timeline->lock); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index cf28666021f8..4697848ecfd9 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2687,10 +2687,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine) if (i915.enable_execlists) { spin_lock_irq(&engine->timeline->lock); - INIT_LIST_HEAD(&engine->execlist_queue); i915_gem_request_put(engine->execlist_port[0].request); i915_gem_request_put(engine->execlist_port[1].request); memset(engine->execlist_port, 0, sizeof(engine->execlist_port)); + engine->execlist_queue = RB_ROOT; + engine->execlist_first = NULL; spin_unlock_irq(&engine->timeline->lock); } } diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 13090f226203..5f0068fac3e9 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -141,6 +141,8 @@ i915_priotree_fini(struct i915_priotree *pt) { struct i915_dependency *dep, *next; + GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node)); + /* Everyone we depended upon should retire before us and remove * themselves from our list. However, retirement is run independently * on each
[Intel-gfx] [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities
Track the priority of each request and use it to determine the order in which we submit requests to the hardware via execlists. The priority of the request is determined by the user (eventually via the context) but may be overridden at any time by the driver. When we set the priority of the request, we bump the priority of all of its dependencies to match - so that a high priority drawing operation is not stuck behind a background task. When the request is ready to execute (i.e. we have signaled the submit fence following completion of all its dependencies, including third party fences), we put the request into a priority sorted rbtree to be submitted to the hardware. If the request is higher priority than all pending requests, it will be submitted on the next context-switch interrupt as soon as the hardware has completed the current request. We do not currently preempt any current execution to immediately run a very high priority request, at least not yet. One more limitation, is that this is first implementation is for execlists only so currently limited to gen8/gen9. 
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- drivers/gpu/drm/i915/i915_debugfs.c| 7 +- drivers/gpu/drm/i915/i915_gem.c| 3 +- drivers/gpu/drm/i915/i915_gem_request.c| 4 ++ drivers/gpu/drm/i915/i915_gem_request.h| 6 +- drivers/gpu/drm/i915/i915_guc_submission.c | 4 ++ drivers/gpu/drm/i915/intel_engine_cs.c | 3 +- drivers/gpu/drm/i915/intel_lrc.c | 104 - drivers/gpu/drm/i915/intel_ringbuffer.h| 3 +- 8 files changed, 109 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 3cb96d260dfb..dac435680e98 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -631,8 +631,9 @@ static void print_request(struct seq_file *m, struct drm_i915_gem_request *rq, const char *prefix) { - seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix, + seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix, rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno, + rq->priotree.priority, jiffies_to_msecs(jiffies - rq->emitted_jiffies), rq->timeline->common->name); } @@ -3218,6 +3219,7 @@ static int i915_engine_info(struct seq_file *m, void *unused) if (i915.enable_execlists) { u32 ptr, read, write; + struct rb_node *rb; seq_printf(m, "\tExeclist status: 0x%08x %08x\n", I915_READ(RING_EXECLIST_STATUS_LO(engine)), @@ -3257,7 +3259,8 @@ static int i915_engine_info(struct seq_file *m, void *unused) rcu_read_unlock(); spin_lock_irq(&engine->timeline->lock); - list_for_each_entry(rq, &engine->execlist_queue, execlist_link) { + for (rb = engine->execlist_first; rb; rb = rb_next(rb)) { + rq = rb_entry(rb, typeof(*rq), priotree.node); print_request(m, rq, "\t\tQ "); } spin_unlock_irq(&engine->timeline->lock); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index cf28666021f8..4697848ecfd9 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2687,10 +2687,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine) if (i915.enable_execlists) {
spin_lock_irq(&engine->timeline->lock); - INIT_LIST_HEAD(&engine->execlist_queue); i915_gem_request_put(engine->execlist_port[0].request); i915_gem_request_put(engine->execlist_port[1].request); memset(engine->execlist_port, 0, sizeof(engine->execlist_port)); + engine->execlist_queue = RB_ROOT; + engine->execlist_first = NULL; spin_unlock_irq(&engine->timeline->lock); } } diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 13090f226203..5f0068fac3e9 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -141,6 +141,8 @@ i915_priotree_fini(struct i915_priotree *pt) { struct i915_dependency *dep, *next; + GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node)); + /* Everyone we depended upon should retire before us and remove * themselves from our list. However, retirement is run independently * on each timeline and so we may be called out-of-order. @@ -164,6 +166,8 @@ i915_priotree_init(struct i915_priotree *pt) { INIT_LIST_HEAD(&pt->pre_list); INIT_LIST_HEAD(&pt->post_list); + RB_CLEAR_NODE(&pt->node); + pt->priority = INT_MIN; } void