On Fri, 2025-10-24 at 17:07 +0100, Tvrtko Ursulin wrote:
> The FAIR scheduling policy is built upon the same concepts as the
> well-known CFS CPU scheduler - the entity run queue is sorted by the
> virtual GPU time consumed by entities, such that the entity with the
> least vruntime runs first.
>
> It is able to avoid total priority starvation, which is one of the
> problems with FIFO, and it also does not need per-priority run queues.
> Because it scales the actual GPU runtime by an exponential factor as the
> priority decreases, the virtual runtime for low priority entities grows
> faster than for normal priority ones, pushing them further down the
> run queue order for the same real GPU time spent.
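(For a concrete feel of the numbers, using the shift table further down in
the patch: charging the same 1 ms of real GPU time adds 2 ms of vruntime at
kernel priority, 4 ms at high, 16 ms at normal and 128 ms at low, so a low
priority entity sorts behind the others much sooner for the same GPU usage.)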
>
> Apart from this fundamental fairness, the fair policy is especially
> strong under oversubscription, where it is able to give more GPU time to
> short and bursty workloads running in parallel with GPU-heavy clients
> that submit deep job queues.
>
> Signed-off-by: Tvrtko Ursulin <[email protected]>
> Cc: Christian König <[email protected]>
> Cc: Danilo Krummrich <[email protected]>
> Cc: Matthew Brost <[email protected]>
> Cc: Philipp Stanner <[email protected]>
> Cc: Pierre-Eric Pelloux-Prayer <[email protected]>
> ---
> drivers/gpu/drm/scheduler/sched_entity.c | 28 ++--
> drivers/gpu/drm/scheduler/sched_internal.h | 5 +
> drivers/gpu/drm/scheduler/sched_main.c | 11 +-
> drivers/gpu/drm/scheduler/sched_rq.c | 182 ++++++++++++++++++++-
> include/drm/gpu_scheduler.h | 17 +-
> 5 files changed, 224 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 565eddebb667..4144a97702a5 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -107,6 +107,8 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
> entity->guilty = guilty;
> entity->num_sched_list = num_sched_list;
> entity->priority = priority;
> + entity->rq_priority = drm_sched_policy == DRM_SCHED_POLICY_FAIR ?
> + DRM_SCHED_PRIORITY_KERNEL : priority;
> /*
> * It's perfectly valid to initialize an entity without having a valid
> * scheduler attached. It's just not valid to use the scheduler before it
> @@ -123,17 +125,23 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
> */
> pr_warn("%s: called with uninitialized scheduler\n", __func__);
> } else if (num_sched_list) {
> - /* The "priority" of an entity cannot exceed the number of run-queues of a
> - * scheduler. Protect against num_rqs being 0, by converting to signed. Choose
> - * the lowest priority available.
> + enum drm_sched_priority p = entity->priority;
> +
> + /*
> + * The "priority" of an entity cannot exceed the number of
> + * run-queues of a scheduler. Protect against num_rqs being 0,
> + * by converting to signed. Choose the lowest priority
> + * available.
> */
> - if (entity->priority >= sched_list[0]->num_rqs) {
> - dev_err(sched_list[0]->dev, "entity has out-of-bounds priority: %u. num_rqs: %u\n",
> - entity->priority, sched_list[0]->num_rqs);
> - entity->priority = max_t(s32, (s32) sched_list[0]->num_rqs - 1,
> - (s32) DRM_SCHED_PRIORITY_KERNEL);
> + if (p >= sched_list[0]->num_user_rqs) {
> + dev_err(sched_list[0]->dev, "entity with out-of-bounds priority:%u num_user_rqs:%u\n",
> + p, sched_list[0]->num_user_rqs);
> + p = max_t(s32,
> + (s32)sched_list[0]->num_user_rqs - 1,
> + (s32)DRM_SCHED_PRIORITY_KERNEL);
> + entity->priority = p;
> }
> - entity->rq = sched_list[0]->sched_rq[entity->priority];
> + entity->rq = sched_list[0]->sched_rq[entity->rq_priority];
> }
>
> init_completion(&entity->entity_idle);
> @@ -566,7 +574,7 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
>
> spin_lock(&entity->lock);
> sched = drm_sched_pick_best(entity->sched_list, entity->num_sched_list);
> - rq = sched ? sched->sched_rq[entity->priority] : NULL;
> + rq = sched ? sched->sched_rq[entity->rq_priority] : NULL;
> if (rq != entity->rq) {
> drm_sched_rq_remove_entity(entity->rq, entity);
> entity->rq = rq;
> diff --git a/drivers/gpu/drm/scheduler/sched_internal.h b/drivers/gpu/drm/scheduler/sched_internal.h
> index 9adad48ec084..593e380a2d59 100644
> --- a/drivers/gpu/drm/scheduler/sched_internal.h
> +++ b/drivers/gpu/drm/scheduler/sched_internal.h
> @@ -12,6 +12,8 @@
> * @kref: reference count for the object.
> * @lock: lock guarding the @runtime updates.
> * @runtime: time entity spent on the GPU.
> + * @prev_runtime: previous @runtime used to get the runtime delta.
> + * @vruntime: virtual runtime as accumulated by the fair algorithm.
> *
> * Because jobs and entities have decoupled lifetimes, ie. we cannot access the
> * entity once the job is completed and we know how much time it took on the
> @@ -22,6 +24,8 @@ struct drm_sched_entity_stats {
> struct kref kref;
> spinlock_t lock;
> ktime_t runtime;
> + ktime_t prev_runtime;
> + ktime_t vruntime;
> };
>
> /* Used to choose between FIFO and RR job-scheduling */
> @@ -29,6 +33,7 @@ extern int drm_sched_policy;
>
> #define DRM_SCHED_POLICY_RR 0
> #define DRM_SCHED_POLICY_FIFO 1
> +#define DRM_SCHED_POLICY_FAIR 2
>
> bool drm_sched_can_queue(struct drm_gpu_scheduler *sched,
> struct drm_sched_entity *entity);
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 0c5f7a0594bf..261886b1e18f 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -90,7 +90,7 @@ int drm_sched_policy = DRM_SCHED_POLICY_FIFO;
> * DOC: sched_policy (int)
> * Used to override default entities scheduling policy in a run queue.
> */
> -MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default).");
> +MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default), " __stringify(DRM_SCHED_POLICY_FAIR) " = Fair.");
> module_param_named(sched_policy, drm_sched_policy, int, 0444);
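As a small usage note (assuming the scheduler module keeps its usual
gpu-sched name), the new policy would then be selected at boot time with
something like:

  gpu_sched.sched_policy=2

on the kernel command line, since the parameter is 0444 and therefore not
writable at runtime.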
>
> static u32 drm_sched_available_credits(struct drm_gpu_scheduler *sched)
> @@ -1133,11 +1133,14 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, const struct drm_sched_init_
> sched->own_submit_wq = true;
> }
>
> - sched->sched_rq = kmalloc_array(args->num_rqs, sizeof(*sched->sched_rq),
> + sched->num_user_rqs = args->num_rqs;
> + sched->num_rqs = drm_sched_policy != DRM_SCHED_POLICY_FAIR ?
> + args->num_rqs : 1;
> + sched->sched_rq = kmalloc_array(sched->num_rqs, sizeof(*sched->sched_rq),
> GFP_KERNEL | __GFP_ZERO);
> if (!sched->sched_rq)
> goto Out_check_own;
> - sched->num_rqs = args->num_rqs;
> +
> for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
> sched->sched_rq[i] = kzalloc(sizeof(*sched->sched_rq[i]),
> GFP_KERNEL);
> if (!sched->sched_rq[i])
> @@ -1279,7 +1282,7 @@ void drm_sched_increase_karma(struct drm_sched_job *bad)
> if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
> atomic_inc(&bad->karma);
>
> - for (i = DRM_SCHED_PRIORITY_HIGH; i < sched->num_rqs; i++) {
> + for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
> struct drm_sched_rq *rq = sched->sched_rq[i];
>
> spin_lock(&rq->lock);
> diff --git a/drivers/gpu/drm/scheduler/sched_rq.c b/drivers/gpu/drm/scheduler/sched_rq.c
> index 2d1f579d8352..5df10f9cbb7f 100644
> --- a/drivers/gpu/drm/scheduler/sched_rq.c
> +++ b/drivers/gpu/drm/scheduler/sched_rq.c
> @@ -16,6 +16,35 @@ drm_sched_entity_compare_before(struct rb_node *a, const struct rb_node *b)
> return ktime_before(ea->oldest_job_waiting, eb->oldest_job_waiting);
> }
>
> +static void drm_sched_rq_update_prio(struct drm_sched_rq *rq)
> +{
> + enum drm_sched_priority prio = DRM_SCHED_PRIORITY_INVALID;
> + struct rb_node *rb;
> +
> + lockdep_assert_held(&rq->lock);
> +
> + rb = rb_first_cached(&rq->rb_tree_root);
> + if (rb) {
> + struct drm_sched_entity *entity =
> + rb_entry(rb, typeof(*entity), rb_tree_node);
> +
> + /*
> + * The normal locking order is entity then run-queue so taking
> + * the entity lock here would be a locking inversion for the
> + * case when the current head of the run-queue is different from
> + * the one we already have locked. The unlocked read is fine
> + * though, because if the priority had just changed it is no big
> + * deal for our algorithm, but just a transient reachable only
> + * by drivers with userspace dynamic priority changes API. Equal
> + * in effect to the priority change becoming visible a few
> + * instructions later.
> + */
> + prio = READ_ONCE(entity->priority);
I want to take a deep look into the entire locking situation, too.
Unfortunately I currently have to complete another important milestone
and can only dive into that around mid-November.
> + }
> +
> + rq->head_prio = prio;
> +}
> +
> static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity,
> struct drm_sched_rq *rq)
> {
> @@ -25,6 +54,7 @@ static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity,
> if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
> rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
> RB_CLEAR_NODE(&entity->rb_tree_node);
> + drm_sched_rq_update_prio(rq);
> }
> }
>
> @@ -46,6 +76,7 @@ static void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity,
>
> rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root,
> drm_sched_entity_compare_before);
> + drm_sched_rq_update_prio(rq);
> }
>
> /**
> @@ -62,6 +93,138 @@ void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
> INIT_LIST_HEAD(&rq->entities);
> rq->rb_tree_root = RB_ROOT_CACHED;
> rq->sched = sched;
> + rq->head_prio = DRM_SCHED_PRIORITY_INVALID;
> +}
> +
> +static ktime_t
> +drm_sched_rq_get_min_vruntime(struct drm_sched_rq *rq)
> +{
> + ktime_t vruntime = 0;
> + struct rb_node *rb;
> +
> + lockdep_assert_held(&rq->lock);
> +
> + for (rb = rb_first_cached(&rq->rb_tree_root); rb; rb = rb_next(rb)) {
> + struct drm_sched_entity *entity =
> + rb_entry(rb, typeof(*entity), rb_tree_node);
> + struct drm_sched_entity_stats *stats = entity->stats;
> +
> + /*
> + * We only need the spin lock here on platforms where access to
> + * 64-bit ktime_t can tear but for simplicity we take it un-
> + * conditionally.
> + */
> + spin_lock(&stats->lock);
> + vruntime = stats->vruntime;
> + spin_unlock(&stats->lock);
The code here is plainly correct and locking is the right thing to do,
so the comment above is not necessary.
> + }
> +
> + return vruntime;
> +}
> +
> +static void
> +drm_sched_entity_save_vruntime(struct drm_sched_entity *entity,
> + ktime_t min_vruntime)
> +{
> + struct drm_sched_entity_stats *stats = entity->stats;
> + ktime_t vruntime;
> +
> + spin_lock(&stats->lock);
> + vruntime = stats->vruntime;
> + if (min_vruntime && vruntime > min_vruntime)
> + vruntime = ktime_sub(vruntime, min_vruntime);
> + else
> + vruntime = 0;
> + stats->vruntime = vruntime;
> + spin_unlock(&stats->lock);
> +}
> +
> +static ktime_t
> +drm_sched_entity_restore_vruntime(struct drm_sched_entity *entity,
> + ktime_t min_vruntime,
> + enum drm_sched_priority rq_prio)
> +{
> + struct drm_sched_entity_stats *stats = entity->stats;
> + enum drm_sched_priority prio = entity->priority;
> + ktime_t vruntime;
> +
> + BUILD_BUG_ON(DRM_SCHED_PRIORITY_NORMAL < DRM_SCHED_PRIORITY_HIGH);
> +
> + spin_lock(&stats->lock);
> + vruntime = stats->vruntime;
> +
> + /*
> + * Special handling for entities which were picked from the top of the
> + * queue and are now re-joining the top with another one already there.
> + */
> + if (!vruntime && min_vruntime) {
> + if (prio > rq_prio) {
> + /*
> + * Lower priority should not overtake higher when re-
> + * joining at the top of the queue.
> + */
> + vruntime = us_to_ktime(prio - rq_prio);
> + } else if (prio < rq_prio) {
> + /*
> + * Higher priority can go first.
> + */
> + vruntime = -us_to_ktime(rq_prio - prio);
> + }
> + }
> +
> + /*
> + * Restore saved relative position in the queue.
> + */
> + vruntime = ktime_add(min_vruntime, vruntime);
> +
> + stats->vruntime = vruntime;
> + spin_unlock(&stats->lock);
> +
> + return vruntime;
> +}
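To double-check my reading of the save/restore pair, here is a minimal
userspace sketch (ktime_t reduced to a plain s64 of nanoseconds, the
priority nudge and locking left out) of what the round trip appears to do:
an entity keeps its relative lag behind the queue head, but is not charged
for the time its queue was empty.

  #include <stdio.h>

  typedef long long s64;

  /* Leaving the queue: keep only the offset from the current head. */
  static s64 save_vruntime(s64 vruntime, s64 min_vruntime)
  {
          return (min_vruntime && vruntime > min_vruntime) ?
                  vruntime - min_vruntime : 0;
  }

  /* Re-joining the queue: re-base the saved offset on the new head. */
  static s64 restore_vruntime(s64 saved, s64 min_vruntime)
  {
          return min_vruntime + saved;
  }

  int main(void)
  {
          /* Entity leaves the queue 500ns behind the then-current head. */
          s64 saved = save_vruntime(1500, 1000);

          /* It re-joins 500ns behind the new head, i.e. this prints 4500. */
          printf("%lld\n", restore_vruntime(saved, 4000));

          return 0;
  }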
> +
> +static ktime_t drm_sched_entity_update_vruntime(struct drm_sched_entity *entity)
> +{
> + /*
> + * Core part of the CFS-like algorithm is that the virtual runtime of
> + * lower priority tasks should grow quicker than the higher priority
> + * ones, so that when we then schedule entities with the aim of keeping
> + * their accumulated virtual time balanced, we can approach fair
> + * distribution of GPU time.
> + *
> + * For converting the real GPU time into virtual we pick some
> + * multipliers with the idea to achieve the following GPU time
> + * distribution:
> + *
> + * - Kernel priority gets roughly 2x GPU time compared to high.
> + * - High gets ~4x relative to normal.
> + * - Normal gets ~8x relative to low.
> + */
I can make sense of 2 and 4, but how does 7 match to 8?
P.
> + static const unsigned int shift[] = {
> + [DRM_SCHED_PRIORITY_KERNEL] = 1,
> + [DRM_SCHED_PRIORITY_HIGH] = 2,
> + [DRM_SCHED_PRIORITY_NORMAL] = 4,
> + [DRM_SCHED_PRIORITY_LOW] = 7,
> + };
> + struct drm_sched_entity_stats *stats = entity->stats;
> + ktime_t runtime, prev;
> +
> + spin_lock(&stats->lock);
> + prev = stats->prev_runtime;
> + runtime = stats->runtime;
> + stats->prev_runtime = runtime;
> + runtime = ktime_add_ns(stats->vruntime,
> + ktime_to_ns(ktime_sub(runtime, prev)) <<
> + shift[entity->priority]);
> + stats->vruntime = runtime;
> + spin_unlock(&stats->lock);
> +
> + return runtime;
> +}
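If I am reading the shifts right, the claimed ratios come from the
differences between them: a runtime delta is charged as delta << shift, so
for equal vruntime growth the GPU time shares are proportional to 2^-shift,
i.e. 1/2 : 1/4 : 1/16 : 1/128 = 64 : 32 : 8 : 1 for kernel : high : normal :
low. That gives kernel ~2x high (64 vs 32), high ~4x normal (32 vs 8) and
normal ~8x low (8 vs 1), with the 8x coming from 2^(7 - 4).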
> +
> +static ktime_t drm_sched_entity_get_job_ts(struct drm_sched_entity *entity)
> +{
> + return drm_sched_entity_update_vruntime(entity);
> }
>
> /**
> @@ -98,8 +261,14 @@ drm_sched_rq_add_entity(struct drm_sched_entity *entity, ktime_t ts)
> list_add_tail(&entity->list, &rq->entities);
> }
>
> - if (drm_sched_policy == DRM_SCHED_POLICY_RR)
> + if (drm_sched_policy == DRM_SCHED_POLICY_FAIR) {
> + ts = drm_sched_rq_get_min_vruntime(rq);
> + ts = drm_sched_entity_restore_vruntime(entity, ts,
> + rq->head_prio);
> + } else if (drm_sched_policy == DRM_SCHED_POLICY_RR) {
> ts = entity->rr_ts;
> + }
> +
> drm_sched_rq_update_fifo_locked(entity, rq, ts);
>
> spin_unlock(&rq->lock);
> @@ -171,7 +340,9 @@ void drm_sched_rq_pop_entity(struct drm_sched_entity *entity)
> if (next_job) {
> ktime_t ts;
>
> - if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
> + if (drm_sched_policy == DRM_SCHED_POLICY_FAIR)
> + ts = drm_sched_entity_get_job_ts(entity);
> + else if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
> ts = next_job->submit_ts;
> else
> ts = drm_sched_rq_next_rr_ts(rq, entity);
> @@ -179,6 +350,13 @@ void drm_sched_rq_pop_entity(struct drm_sched_entity *entity)
> drm_sched_rq_update_fifo_locked(entity, rq, ts);
> } else {
> drm_sched_rq_remove_fifo_locked(entity, rq);
> +
> + if (drm_sched_policy == DRM_SCHED_POLICY_FAIR) {
> + ktime_t min_vruntime;
> +
> + min_vruntime = drm_sched_rq_get_min_vruntime(rq);
> + drm_sched_entity_save_vruntime(entity, min_vruntime);
> + }
> }
> spin_unlock(&rq->lock);
> spin_unlock(&entity->lock);
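If I follow the overall flow correctly, the FAIR lifecycle is then: when an
entity (re-)joins the run queue, drm_sched_rq_add_entity() re-bases its
saved vruntime on the current queue minimum and uses the result as the
rb-tree key; whenever a job is popped while more jobs remain,
drm_sched_entity_update_vruntime() charges the runtime delta scaled by the
priority shift and the entity is re-sorted; and when the last job is popped
the entity leaves the tree with only its offset from the queue minimum
saved for the next time it joins.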
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index be382cacabb5..f1d1a27b970a 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -63,6 +63,7 @@ struct drm_file;
> * to an array, and as such should start at 0.
> */
> enum drm_sched_priority {
> + DRM_SCHED_PRIORITY_INVALID = -1, /* Internal marker - do not use. */
> DRM_SCHED_PRIORITY_KERNEL,
> DRM_SCHED_PRIORITY_HIGH,
> DRM_SCHED_PRIORITY_NORMAL,
> @@ -150,6 +151,11 @@ struct drm_sched_entity {
> */
> enum drm_sched_priority priority;
>
> + /**
> + * @rq_priority: Run-queue priority
> + */
> + enum drm_sched_priority rq_priority;
> +
> /**
> * @rr_ts:
> *
> @@ -254,10 +260,11 @@ struct drm_sched_entity {
> * struct drm_sched_rq - queue of entities to be scheduled.
> *
> * @sched: the scheduler to which this rq belongs to.
> - * @lock: protects @entities, @rb_tree_root and @rr_ts.
> + * @lock: protects @entities, @rb_tree_root, @rr_ts and @head_prio.
> * @rr_ts: monotonically incrementing fake timestamp for RR mode.
> * @entities: list of the entities to be scheduled.
> * @rb_tree_root: root of time based priority queue of entities for FIFO scheduling
> + * @head_prio: priority of the top tree element.
> *
> * Run queue is a set of entities scheduling command submissions for
> * one specific ring. It implements the scheduling policy that selects
> @@ -271,6 +278,7 @@ struct drm_sched_rq {
> ktime_t rr_ts;
> struct list_head entities;
> struct rb_root_cached rb_tree_root;
> + enum drm_sched_priority head_prio;
> };
>
> /**
> @@ -563,8 +571,10 @@ struct drm_sched_backend_ops {
> * @credit_count: the current credit count of this scheduler
> * @timeout: the time after which a job is removed from the scheduler.
> * @name: name of the ring for which this scheduler is being used.
> - * @num_rqs: Number of run-queues. This is at most DRM_SCHED_PRIORITY_COUNT,
> - * as there's usually one run-queue per priority, but could be less.
> + * @num_user_rqs: Number of run-queues. This is at most
> + * DRM_SCHED_PRIORITY_COUNT, as there's usually one run-queue per
> + * priority, but could be less.
> + * @num_rqs: Equal to @num_user_rqs for FIFO and RR and 1 for the FAIR policy.
> * @sched_rq: An allocated array of run-queues of size @num_rqs;
> * @job_scheduled: once drm_sched_entity_flush() is called the scheduler
> * waits on this wait queue until all the scheduled jobs are
> @@ -597,6 +607,7 @@ struct drm_gpu_scheduler {
> long timeout;
> const char *name;
> u32 num_rqs;
> + u32 num_user_rqs;
> struct drm_sched_rq **sched_rq;
> wait_queue_head_t job_scheduled;
> atomic64_t job_id_count;