deadline: Hierarchical scheduling with DL on top of RT

Alessio Balsini Fri, 31 Mar 2017 11:46:33 -0700

The runtime of RT tasks controlled by CGroups is enforced by the
SCHED_DEADLINE scheduling class, based on the runtime and period
parameters (the deadline is set equal to the period).


A sched_dl_entity may also represent a group of RT tasks, in which case it
provides an rt_rq.

Signed-off-by: Andrea Parri <[email protected]>
Signed-off-by: Luca Abeni <[email protected]>
Cc: Tommaso Cucinotta <[email protected]>
Cc: Juri Lelli <[email protected]>
Cc: Daniel Bristot de Oliveira <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Signed-off-by: Alessio Balsini <[email protected]>
---
 include/linux/sched.h    |  13 +-
 kernel/sched/autogroup.c |   4 +-
 kernel/sched/core.c      |  86 ++++++++--
 kernel/sched/deadline.c  | 174 ++++++++++++++++----
 kernel/sched/rt.c        | 407 ++++++++++++++++++++++++++++++-----------------
 kernel/sched/sched.h     | 142 +++++++++++++++--
 6 files changed, 611 insertions(+), 215 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d67eee8..fdd62f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -402,7 +402,7 @@ struct sched_rt_entity {
 
        struct sched_rt_entity          *back;
 #ifdef CONFIG_RT_GROUP_SCHED
-       struct sched_rt_entity          *parent;
+       struct sched_dl_entity          *parent;
        /* rq on which this entity is (to be) queued: */
        struct rt_rq                    *rt_rq;
        /* rq "owned" by this entity/group: */
@@ -455,6 +455,17 @@ struct sched_dl_entity {
         * own bandwidth to be enforced, thus we need one timer per task.
         */
        struct hrtimer                  dl_timer;
+
+/*
+ * An instance of a sched_dl_entity may represent a group of tasks, therefore
+ * it requires:
+ * - dl_rq: the dl_rq on which this entity is queued;
+ * - my_q: the rt_rq owned by this entity.
+ */
+#ifdef CONFIG_RT_GROUP_SCHED
+       struct dl_rq                    *dl_rq;
+       struct rt_rq                    *my_q;
+#endif
 };
 
 union rcu_special {
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index da39489..e14acb4 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -30,7 +30,7 @@ static inline void autogroup_destroy(struct kref *kref)
 
 #ifdef CONFIG_RT_GROUP_SCHED
        /* We've redirected RT tasks to the root task group... */
-       ag->tg->rt_se = NULL;
+       ag->tg->dl_se = NULL;
        ag->tg->rt_rq = NULL;
 #endif
        sched_offline_group(ag->tg);
@@ -88,7 +88,7 @@ static inline struct autogroup *autogroup_create(void)
         * the policy change to proceed.
         */
        free_rt_sched_group(tg);
-       tg->rt_se = root_task_group.rt_se;
+       tg->dl_se = root_task_group.dl_se;
        tg->rt_rq = root_task_group.rt_rq;
 #endif
        tg->autogroup = ag;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d4cce4..b139719 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -902,6 +902,9 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
        const struct sched_class *class;
 
+       if (is_dl_group(rt_rq_of_se(&p->rt)) && task_has_rt_policy(p))
+               resched_curr(rq);
+
        if (p->sched_class == rq->curr->sched_class) {
                rq->curr->sched_class->check_preempt_curr(rq, p, flags);
        } else {
@@ -2165,6 +2168,9 @@ void __dl_clear_params(struct task_struct *p)
 
        dl_se->dl_throttled = 0;
        dl_se->dl_yielded = 0;
+#ifdef CONFIG_RT_GROUP_SCHED
+       dl_se->my_q = NULL;
+#endif
 }
 
 /*
@@ -4261,7 +4267,8 @@ static int __sched_setscheduler(struct task_struct *p,
                 * Do not allow realtime tasks into groups that have no runtime
                 * assigned.
                 */
-               if (rt_policy(policy) &&
+               if (dl_bandwidth_enabled() && rt_policy(policy) &&
+                               task_group(p)->dl_bandwidth.dl_runtime == 0 &&
                                !task_group_is_autogroup(task_group(p))) {
                        task_rq_unlock(rq, p, &rf);
                        return -EPERM;
@@ -5987,7 +5994,7 @@ void __init sched_init(void)
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
-               root_task_group.rt_se = (struct sched_rt_entity **)ptr;
+               root_task_group.dl_se = (struct sched_dl_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
 
                root_task_group.rt_rq = (struct rt_rq **)ptr;
@@ -6010,6 +6017,10 @@ void __init sched_init(void)
        init_defrootdomain();
 #endif
 
+#ifdef CONFIG_RT_GROUP_SCHED
+       init_dl_bandwidth(&root_task_group.dl_bandwidth,
+                       global_rt_period(), global_rt_runtime());
+#endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_CGROUP_SCHED
        task_group_cache = KMEM_CACHE(task_group, 0);
@@ -6460,9 +6471,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
        struct task_group *child;
        unsigned long total, sum = 0;
        u64 period, runtime;
+       unsigned long flags;
 
-       period = 0;
-       runtime = 0;
+       period  = tg->dl_bandwidth.dl_period;
+       runtime = tg->dl_bandwidth.dl_runtime;
 
        if (tg == d->tg) {
                period = d->rt_period;
@@ -6478,7 +6490,7 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
        /*
         * Ensure we don't starve existing RT tasks.
         */
-       if (!runtime && tg_has_rt_tasks(tg))
+       if (dl_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
                return -EBUSY;
 
        total = to_ratio(period, runtime);
@@ -6489,12 +6501,27 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
        if (total > to_ratio(global_rt_period(), global_rt_runtime()))
                return -EINVAL;
 
+       if (tg == &root_task_group) {
+               int cpus = num_online_cpus();
+               struct dl_bw *dl_b = dl_bw_of(smp_processor_id());
+
+               raw_spin_lock_irqsave(&dl_b->lock, flags);
+
+               if (dl_b->bw != -1 &&
+                   dl_b->bw * cpus < dl_b->total_bw + total * cpus) {
+                       raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+                       return -EBUSY;
+               }
+
+               raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+       }
+
        /*
         * The sum of our children's runtime should not exceed our own.
         */
        list_for_each_entry_rcu(child, &tg->children, siblings) {
-               period = 0;
-               runtime = 0;
+               period  = child->dl_bandwidth.dl_period;
+               runtime = child->dl_bandwidth.dl_runtime;
 
                if (child == d->tg) {
                        period = d->rt_period;
@@ -6549,6 +6576,33 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
        if (err)
                goto unlock;
 
+       raw_spin_lock_irq(&tg->dl_bandwidth.dl_runtime_lock);
+       tg->dl_bandwidth.dl_period  = rt_period;
+       tg->dl_bandwidth.dl_runtime = rt_runtime;
+
+       if (tg == &root_task_group)
+               goto unlock_bandwidth;
+
+       for_each_possible_cpu(i) {
+               struct sched_dl_entity *dl_se = tg->dl_se[i];
+               struct rq *rq = container_of(dl_se->dl_rq, struct rq, dl);
+
+               raw_spin_lock_irq(&rq->lock);
+               dl_se->dl_runtime  = rt_runtime;
+               dl_se->dl_period   = rt_period;
+               dl_se->dl_deadline = dl_se->dl_period;
+               dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+
+               if (!((s64)(rt_period - rt_runtime) >= 0) ||
+                   !(rt_runtime >= (2 << (DL_SCALE - 1)))) {
+                       raw_spin_unlock_irq(&rq->lock);
+                       continue;
+               }
+
+               raw_spin_unlock_irq(&rq->lock);
+       }
+unlock_bandwidth:
+       raw_spin_unlock_irq(&tg->dl_bandwidth.dl_runtime_lock);
 unlock:
        read_unlock(&tasklist_lock);
        mutex_unlock(&rt_constraints_mutex);
@@ -6560,7 +6614,7 @@ static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
        u64 rt_runtime, rt_period;
 
-       rt_period = 0;
+       rt_period  = tg->dl_bandwidth.dl_period;
        rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
        if (rt_runtime_us < 0)
                rt_runtime = RUNTIME_INF;
@@ -6572,10 +6626,10 @@ static long sched_group_rt_runtime(struct task_group *tg)
 {
        u64 rt_runtime_us;
 
-       if (0 == RUNTIME_INF)
+       if (tg->dl_bandwidth.dl_runtime == RUNTIME_INF)
                return -1;
 
-       rt_runtime_us = 0;
+       rt_runtime_us = tg->dl_bandwidth.dl_runtime;
        do_div(rt_runtime_us, NSEC_PER_USEC);
        return rt_runtime_us;
 }
@@ -6585,7 +6639,7 @@ static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
        u64 rt_runtime, rt_period;
 
        rt_period = rt_period_us * NSEC_PER_USEC;
-       rt_runtime = 0;
+       rt_runtime = tg->dl_bandwidth.dl_runtime;
 
        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
@@ -6594,7 +6648,7 @@ static long sched_group_rt_period(struct task_group *tg)
 {
        u64 rt_period_us;
 
-       rt_period_us = 0;
+       rt_period_us = tg->dl_bandwidth.dl_period;
        do_div(rt_period_us, NSEC_PER_USEC);
        return rt_period_us;
 }
@@ -6617,7 +6671,7 @@ static int sched_rt_global_constraints(void)
 static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 {
        /* Don't accept realtime tasks when there is no way for them to run */
-       if (rt_task(tsk))
+       if (rt_task(tsk) && tg->dl_bandwidth.dl_runtime == 0)
                return 0;
 
        return 1;
@@ -6785,6 +6839,12 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
                return &root_task_group.css;
        }
 
+       /* Do not allow cpu_cgroup hierarchies with depth greater than 2. */
+#ifdef CONFIG_RT_GROUP_SCHED
+       if (parent != &root_task_group)
+               return ERR_PTR(-EINVAL);
+#endif
+
        tg = sched_create_group(parent);
        if (IS_ERR(tg))
                return ERR_PTR(-ENOMEM);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1af6219..9a1988b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -20,8 +20,17 @@
 
 struct dl_bandwidth def_dl_bandwidth;
 
+#ifdef CONFIG_RT_GROUP_SCHED
+#define dl_entity_is_task(dl_se) (!(dl_se)->my_q)
+#define rt_rq_of_dl_entity(dl_se) ((dl_se)->my_q)
+#else
+#define dl_entity_is_task(dl_se) (1)
+#define rt_rq_of_dl_entity(dl_se) (NULL)
+#endif
+
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
 {
+       BUG_ON(!dl_entity_is_task(dl_se));
        return container_of(dl_se, struct task_struct, dl);
 }
 
@@ -30,6 +39,14 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
        return container_of(dl_rq, struct rq, dl);
 }
 
+#ifdef CONFIG_RT_GROUP_SCHED
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+       return dl_se->dl_rq;
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
 static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
 {
        struct task_struct *p = dl_task_of(dl_se);
@@ -37,6 +54,7 @@ static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
 
        return &rq->dl;
 }
+#endif
 
 static inline int on_dl_rq(struct sched_dl_entity *dl_se)
 {
@@ -119,7 +137,11 @@ static inline void dl_clear_overload(struct rq *rq)
 
 static void update_dl_migration(struct dl_rq *dl_rq)
 {
+#ifdef CONFIG_RT_GROUP_SCHED
+       if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) {
+#else
        if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
+#endif
                if (!dl_rq->overloaded) {
                        dl_set_overload(rq_of_dl_rq(dl_rq));
                        dl_rq->overloaded = 1;
@@ -520,11 +542,11 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
  * actually started or not (i.e., the replenishment instant is in
  * the future or in the past).
  */
-static int start_dl_timer(struct task_struct *p)
+int start_dl_timer(struct sched_dl_entity *dl_se)
 {
-       struct sched_dl_entity *dl_se = &p->dl;
        struct hrtimer *timer = &dl_se->dl_timer;
-       struct rq *rq = task_rq(p);
+       struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+       struct rq *rq = rq_of_dl_rq(dl_rq);
        ktime_t now, act;
        s64 delta;
 
@@ -558,7 +580,11 @@ static int start_dl_timer(struct task_struct *p)
         * and observe our state.
         */
        if (!hrtimer_is_queued(timer)) {
-               get_task_struct(p);
+               if (dl_entity_is_task(dl_se)) {
+                       struct task_struct *p = dl_task_of(dl_se);
+
+                       get_task_struct(p);
+               }
                hrtimer_start(timer, act, HRTIMER_MODE_ABS);
        }
 
@@ -583,10 +609,43 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
        struct sched_dl_entity *dl_se = container_of(timer,
                                                     struct sched_dl_entity,
                                                     dl_timer);
-       struct task_struct *p = dl_task_of(dl_se);
+       struct task_struct *p;
        struct rq_flags rf;
        struct rq *rq;
 
+#ifdef CONFIG_RT_GROUP_SCHED
+       /* Replenish dl group and check for preemption. */
+       if (!dl_entity_is_task(dl_se)) {
+               struct rt_rq *rt_rq = rt_rq_of_dl_entity(dl_se);
+
+               rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+
+               raw_spin_lock(&rq->lock);
+
+
+               sched_clock_tick();
+               update_rq_clock(rq);
+
+               dl_se->dl_throttled = 0;
+               if (rt_rq->rt_nr_running) {
+                       enqueue_dl_entity(dl_se, dl_se, ENQUEUE_REPLENISH);
+
+                       resched_curr(rq);
+#ifdef CONFIG_SMP
+                       if (has_pushable_dl_tasks(rq))
+                               push_dl_task(rq);
+#endif
+               } else {
+                       replenish_dl_entity(dl_se, dl_se);
+               }
+
+               raw_spin_unlock(&rq->lock);
+
+               return HRTIMER_NORESTART;
+       }
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+       p  = dl_task_of(dl_se);
        rq = task_rq_lock(p, &rf);
 
        /*
@@ -720,13 +779,12 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
 
        if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
            dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
-               if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+               if (unlikely(dl_se->dl_boosted || !start_dl_timer(&p->dl)))
                        return;
                dl_se->dl_throttled = 1;
        }
 }
 
-static
 int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
 {
        return (dl_se->runtime <= 0);
@@ -780,7 +838,7 @@ static void update_curr_dl(struct rq *rq)
        if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
                dl_se->dl_throttled = 1;
                __dequeue_task_dl(rq, curr, 0);
-               if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
+               if (unlikely(dl_se->dl_boosted || !start_dl_timer(&curr->dl)))
                        enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
                if (!is_leftmost(curr, &rq->dl))
@@ -833,29 +891,39 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
 static inline
 void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
-       int prio = dl_task_of(dl_se)->prio;
        u64 deadline = dl_se->deadline;
 
-       WARN_ON(!dl_prio(prio));
-       dl_rq->dl_nr_running++;
-       add_nr_running(rq_of_dl_rq(dl_rq), 1);
+       if (dl_entity_is_task(dl_se)) {
+               dl_rq->dl_nr_running++;
+               add_nr_running(rq_of_dl_rq(dl_rq), 1);
+               inc_dl_migration(dl_se, dl_rq);
+       } else {
+               struct rt_rq *rt_rq = rt_rq_of_dl_entity(dl_se);
+
+               add_nr_running(rq_of_dl_rq(dl_rq), rt_rq->rt_nr_running);
+       }
 
        inc_dl_deadline(dl_rq, deadline);
-       inc_dl_migration(dl_se, dl_rq);
 }
 
 static inline
 void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
-       int prio = dl_task_of(dl_se)->prio;
+#ifdef CONFIG_RT_GROUP_SCHED
+       WARN_ON(!dl_rq->dl_nr_total);
+#endif
 
-       WARN_ON(!dl_prio(prio));
-       WARN_ON(!dl_rq->dl_nr_running);
-       dl_rq->dl_nr_running--;
-       sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+       if (dl_entity_is_task(dl_se)) {
+               dl_rq->dl_nr_running--;
+               sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+               dec_dl_migration(dl_se, dl_rq);
+       } else {
+               struct rt_rq *rt_rq = rt_rq_of_dl_entity(dl_se);
+
+               sub_nr_running(rq_of_dl_rq(dl_rq), rt_rq->rt_nr_running);
+       }
 
        dec_dl_deadline(dl_rq, dl_se->deadline);
-       dec_dl_migration(dl_se, dl_rq);
 }
 
 static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
@@ -884,7 +952,9 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
 
        rb_link_node(&dl_se->rb_node, parent, link);
        rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
-
+#ifdef CONFIG_RT_GROUP_SCHED
+       dl_rq->dl_nr_total++;
+#endif
        inc_dl_tasks(dl_se, dl_rq);
 }
 
@@ -906,9 +976,12 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
        RB_CLEAR_NODE(&dl_se->rb_node);
 
        dec_dl_tasks(dl_se, dl_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+       dl_rq->dl_nr_total--;
+#endif
 }
 
-static void
+void
 enqueue_dl_entity(struct sched_dl_entity *dl_se,
                  struct sched_dl_entity *pi_se, int flags)
 {
@@ -927,7 +1000,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
        __enqueue_dl_entity(dl_se);
 }
 
-static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
+void dequeue_dl_entity(struct sched_dl_entity *dl_se)
 {
        __dequeue_dl_entity(dl_se);
 }
@@ -1120,12 +1193,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
 }
 
 #ifdef CONFIG_SCHED_HRTICK
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
 {
-       hrtick_start(rq, p->dl.runtime);
+       hrtick_start(rq, dl_se->runtime);
 }
 #else /* !CONFIG_SCHED_HRTICK */
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
 {
 }
 #endif
@@ -1176,14 +1249,35 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        if (prev->sched_class == &dl_sched_class)
                update_curr_dl(rq);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+       if (unlikely(!dl_rq->dl_nr_total))
+               return NULL;
+#else
        if (unlikely(!dl_rq->dl_nr_running))
                return NULL;
-
-       put_prev_task(rq, prev);
+#endif
 
        dl_se = pick_next_dl_entity(rq, dl_rq);
        BUG_ON(!dl_se);
 
+       put_prev_task(rq, prev);
+
+       if (!dl_entity_is_task(dl_se)) {
+               struct rt_rq *rt_rq = rt_rq_of_dl_entity(dl_se);
+               struct sched_rt_entity *rt_se;
+
+               rt_se = pick_next_rt_entity(rq, rt_rq);
+               p = container_of(rt_se, struct task_struct, rt);
+               p->se.exec_start = rq_clock_task(rq);
+
+               dequeue_pushable_task(rt_rq_of_se(&p->rt), p);
+
+               if (hrtick_enabled(rq))
+                       start_hrtick_dl(rq, dl_se);
+
+               return p;
+       }
+
        p = dl_task_of(dl_se);
        p->se.exec_start = rq_clock_task(rq);
 
@@ -1191,7 +1285,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        dequeue_pushable_dl_task(rq, p);
 
        if (hrtick_enabled(rq))
-               start_hrtick_dl(rq, p);
+               start_hrtick_dl(rq, &p->dl);
 
        queue_push_tasks(rq);
 
@@ -1217,7 +1311,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
         */
        if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
            is_leftmost(p, &rq->dl))
-               start_hrtick_dl(rq, p);
+               start_hrtick_dl(rq, &p->dl);
 }
 
 static void task_fork_dl(struct task_struct *p)
@@ -1632,14 +1726,21 @@ static void pull_dl_task(struct rq *this_rq)
  */
 static void task_woken_dl(struct rq *rq, struct task_struct *p)
 {
-       if (!task_running(rq, p) &&
-           !test_tsk_need_resched(rq->curr) &&
-           p->nr_cpus_allowed > 1 &&
-           dl_task(rq->curr) &&
+       if (task_running(rq, p) ||
+           test_tsk_need_resched(rq->curr) ||
+           p->nr_cpus_allowed <= 1)
+               return;
+
+       if (dl_task(rq->curr) &&
            (rq->curr->nr_cpus_allowed < 2 ||
             !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
                push_dl_tasks(rq);
        }
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       if (rt_task(rq->curr) && is_dl_group(rq->curr->rt.rt_rq))
+               push_dl_tasks(rq);
+#endif
 }
 
 static void set_cpus_allowed_dl(struct task_struct *p,
@@ -1715,7 +1816,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
         * SCHED_DEADLINE until the deadline passes, the timer will reset the
         * task.
         */
-       if (!start_dl_timer(p))
+       if (!start_dl_timer(&p->dl))
                __dl_clear_params(p);
 
        /*
@@ -1723,10 +1824,15 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
         * this is the right place to try to pull some other one
         * from an overloaded cpu, if any.
         */
+#ifdef CONFIG_RT_GROUP_SCHED
+       if (!rq->dl.dl_nr_total)
+               queue_pull_task(rq);
+#else
        if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
                return;
 
        queue_pull_task(rq);
+#endif
 }
 
 /*
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e72ccb8..f38bd4b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -46,49 +46,37 @@ void init_rt_rq(struct rt_rq *rt_rq)
 
 #ifdef CONFIG_RT_GROUP_SCHED
 
-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
-{
-#ifdef CONFIG_SCHED_DEBUG
-       WARN_ON_ONCE(!rt_entity_is_task(rt_se));
-#endif
-       return container_of(rt_se, struct task_struct, rt);
-}
-
 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 {
        return rt_rq->rq;
 }
 
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
-       return rt_se->rt_rq;
-}
-
-static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
-{
-       struct rt_rq *rt_rq = rt_se->rt_rq;
-
-       return rt_rq->rq;
-}
-
 void free_rt_sched_group(struct task_group *tg)
 {
+       unsigned long flags;
        int i;
 
        for_each_possible_cpu(i) {
                if (tg->rt_rq)
                        kfree(tg->rt_rq[i]);
-               if (tg->rt_se)
-                       kfree(tg->rt_se[i]);
+               if (tg->dl_se) {
+                       raw_spin_lock_irqsave(&cpu_rq(i)->lock, flags);
+                       if (!tg->dl_se[i]->dl_throttled)
+                               dequeue_dl_entity(tg->dl_se[i]);
+                       raw_spin_unlock_irqrestore(&cpu_rq(i)->lock, flags);
+
+                       hrtimer_cancel(&tg->dl_se[i]->dl_timer);
+                       kfree(tg->dl_se[i]);
+               }
        }
 
        kfree(tg->rt_rq);
-       kfree(tg->rt_se);
+       kfree(tg->dl_se);
 }
 
 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-               struct sched_rt_entity *rt_se, int cpu,
-               struct sched_rt_entity *parent)
+               struct sched_dl_entity *dl_se, int cpu,
+               struct sched_dl_entity *parent)
 {
        struct rq *rq = cpu_rq(cpu);
 
@@ -97,47 +85,56 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
        rt_rq->tg = tg;
 
        tg->rt_rq[cpu] = rt_rq;
-       tg->rt_se[cpu] = rt_se;
+       tg->dl_se[cpu] = dl_se;
 
-       if (!rt_se)
+       if (!dl_se)
                return;
 
-       if (!parent)
-               rt_se->rt_rq = &rq->rt;
-       else
-               rt_se->rt_rq = parent->my_q;
-
-       rt_se->my_q = rt_rq;
-       rt_se->parent = parent;
-       INIT_LIST_HEAD(&rt_se->run_list);
+       dl_se->dl_rq = &rq->dl;
+       dl_se->my_q = rt_rq;
+       RB_CLEAR_NODE(&dl_se->rb_node);
 }
 
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
        struct rt_rq *rt_rq;
-       struct sched_rt_entity *rt_se;
+       struct sched_dl_entity *dl_se;
        int i;
 
-       tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
+       tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
        if (!tg->rt_rq)
                goto err;
-       tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
-       if (!tg->rt_se)
+       tg->dl_se = kcalloc(nr_cpu_ids, sizeof(dl_se), GFP_KERNEL);
+       if (!tg->dl_se)
                goto err;
 
+       init_dl_bandwidth(&tg->dl_bandwidth,
+                       def_dl_bandwidth.dl_period, 0);
+
        for_each_possible_cpu(i) {
                rt_rq = kzalloc_node(sizeof(struct rt_rq),
                                     GFP_KERNEL, cpu_to_node(i));
                if (!rt_rq)
                        goto err;
 
-               rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+               dl_se = kzalloc_node(sizeof(struct sched_dl_entity),
                                     GFP_KERNEL, cpu_to_node(i));
-               if (!rt_se)
+               if (!dl_se)
                        goto err_free_rq;
 
                init_rt_rq(rt_rq);
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
+               rt_rq->rq = cpu_rq(i);
+
+               init_dl_task_timer(dl_se);
+
+               dl_se->dl_runtime = tg->dl_bandwidth.dl_runtime;
+               dl_se->dl_period = tg->dl_bandwidth.dl_period;
+               dl_se->dl_deadline = dl_se->dl_period;
+               dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+
+               dl_se->dl_throttled = 0;
+
+               init_tg_rt_entry(tg, rt_rq, dl_se, i, parent->dl_se[i]);
        }
 
        return 1;
@@ -150,30 +147,11 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 
 #else /* CONFIG_RT_GROUP_SCHED */
 
-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
-{
-       return container_of(rt_se, struct task_struct, rt);
-}
-
 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 {
        return container_of(rt_rq, struct rq, rt);
 }
 
-static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
-{
-       struct task_struct *p = rt_task_of(rt_se);
-
-       return task_rq(p);
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
-       struct rq *rq = rq_of_rt_se(rt_se);
-
-       return &rq->rt;
-}
-
 void free_rt_sched_group(struct task_group *tg) { }
 
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
@@ -228,7 +206,7 @@ static inline void rt_clear_overload(struct rq *rq)
 
 static void update_rt_migration(struct rt_rq *rt_rq)
 {
-       if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
+       if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_running > 1) {
                if (!rt_rq->overloaded) {
                        rt_set_overload(rq_of_rt_rq(rt_rq));
                        rt_rq->overloaded = 1;
@@ -243,8 +221,6 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
        struct task_struct *p;
 
-       return;
-
        p = rt_task_of(rt_se);
 
        if (p->nr_cpus_allowed > 1)
@@ -258,18 +234,16 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
        struct task_struct *p;
 
        p = rt_task_of(rt_se);
-       rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 
-       rt_rq->rt_nr_total--;
        if (p->nr_cpus_allowed > 1)
                rt_rq->rt_nr_migratory--;
 
        update_rt_migration(rt_rq);
 }
 
-static inline int has_pushable_tasks(struct rq *rq)
+static inline int has_pushable_tasks(struct rt_rq *rt_rq)
 {
-       return !plist_head_empty(&rq->rt.pushable_tasks);
+       return !plist_head_empty(&rt_rq->pushable_tasks);
 }
 
 static DEFINE_PER_CPU(struct callback_head, rt_push_head);
@@ -280,7 +254,7 @@ static void pull_rt_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
-       if (!has_pushable_tasks(rq))
+       if (!has_pushable_tasks(&rq->rt))
                return;
 
        queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
@@ -291,37 +265,35 @@ static inline void queue_pull_task(struct rq *rq)
        queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 }
 
-static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+static void enqueue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
 {
-       plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+       plist_del(&p->pushable_tasks, &rt_rq->pushable_tasks);
        plist_node_init(&p->pushable_tasks, p->prio);
-       plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+       plist_add(&p->pushable_tasks, &rt_rq->pushable_tasks);
 
        /* Update the highest prio pushable task */
-       if (p->prio < rq->rt.highest_prio.next)
-               rq->rt.highest_prio.next = p->prio;
+       if (p->prio < rt_rq->highest_prio.next)
+               rt_rq->highest_prio.next = p->prio;
 }
 
-static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+#ifdef CONFIG_RT_GROUP_SCHED
+void dequeue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
 {
-       plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+       plist_del(&p->pushable_tasks, &rt_rq->pushable_tasks);
 
        /* Update the new highest prio pushable task */
-       if (has_pushable_tasks(rq)) {
-               p = plist_first_entry(&rq->rt.pushable_tasks,
+       if (has_pushable_tasks(rt_rq)) {
+               p = plist_first_entry(&rt_rq->pushable_tasks,
                                      struct task_struct, pushable_tasks);
-               rq->rt.highest_prio.next = p->prio;
+               rt_rq->highest_prio.next = p->prio;
        } else
-               rq->rt.highest_prio.next = MAX_RT_PRIO;
+               rt_rq->highest_prio.next = MAX_RT_PRIO;
 }
-
+#endif
 #else
 
-static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+static inline
+void enqueue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
 {
 }
 
@@ -347,33 +319,17 @@ static inline void pull_rt_task(struct rq *this_rq)
 static inline void queue_push_tasks(struct rq *rq)
 {
 }
+
+static inline void queue_pull_task(struct rq *rq)
+{
+}
 #endif /* CONFIG_SMP */
 
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
-       return rt_se->on_rq;
+       return !list_empty(&rt_se->run_list);
 }
 
-#ifdef CONFIG_RT_GROUP_SCHED
-
-#define for_each_sched_rt_entity(rt_se) \
-       for (; rt_se; rt_se = rt_se->parent)
-
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
-
-#else /* !CONFIG_RT_GROUP_SCHED */
-
-typedef struct rt_rq *rt_rq_iter_t;
-
-#define for_each_rt_rq(rt_rq, iter, rq) \
-       for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-
-#define for_each_sched_rt_entity(rt_se) \
-       for (; rt_se; rt_se = NULL)
-
-#endif /* CONFIG_RT_GROUP_SCHED */
-
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 {
        return rt_task_of(rt_se)->prio;
@@ -386,6 +342,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 static void update_curr_rt(struct rq *rq)
 {
        struct task_struct *curr = rq->curr;
+       struct rt_rq *rt_rq = rt_rq_of_se(&curr->rt);
        u64 delta_exec;
 
        if (curr->sched_class != &rt_sched_class)
@@ -408,6 +365,34 @@ static void update_curr_rt(struct rq *rq)
        cpuacct_charge(curr, delta_exec);
 
        sched_rt_avg_update(rq, delta_exec);
+
+       if (!dl_bandwidth_enabled())
+               return;
+
+       if (is_dl_group(rt_rq)) {
+               struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+               if (dl_se->dl_throttled) {
+                       resched_curr(rq);
+                       return;
+               }
+
+               BUG_ON(rt_rq->rt_nr_running > rq->nr_running);
+               dl_se->runtime -= delta_exec;
+
+               /* A group exhausts the budget. */
+               if (dl_runtime_exceeded(dl_se)) {
+                       dequeue_dl_entity(dl_se);
+
+                       if (likely(start_dl_timer(dl_se)))
+                               dl_se->dl_throttled = 1;
+                       else
+                               enqueue_dl_entity(dl_se, dl_se,
+                                                 ENQUEUE_REPLENISH);
+
+                       resched_curr(rq);
+               }
+       }
 }
 
 #if defined CONFIG_SMP
@@ -417,7 +402,7 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
        struct rq *rq = rq_of_rt_rq(rt_rq);
 
-       if (&rq->rt != rt_rq)
+       if (is_dl_group(rt_rq))
                return;
 
        if (rq->online && prio < prev_prio)
@@ -429,7 +414,7 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
        struct rq *rq = rq_of_rt_rq(rt_rq);
 
-       if (&rq->rt != rt_rq)
+       if (is_dl_group(rt_rq))
                return;
 
        if (rq->online && rt_rq->highest_prio.curr != prev_prio)
@@ -445,12 +430,15 @@ void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 
 #endif /* CONFIG_SMP */
 
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined(CONFIG_SMP)
 static void
 inc_rt_prio(struct rt_rq *rt_rq, int prio)
 {
        int prev_prio = rt_rq->highest_prio.curr;
 
+       if (is_dl_group(rt_rq))
+               return;
+
        if (prio < prev_prio)
                rt_rq->highest_prio.curr = prio;
 
@@ -462,6 +450,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
 {
        int prev_prio = rt_rq->highest_prio.curr;
 
+       if (is_dl_group(rt_rq))
+               return;
+
        if (rt_rq->rt_nr_running) {
 
                WARN_ON(prio < prev_prio);
@@ -488,7 +479,7 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
 
-#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
+#endif /* CONFIG_SMP */
 
 static inline
 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
@@ -516,6 +507,16 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
        rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
 
        inc_rt_prio(rt_rq, prio);
+
+       if (is_dl_group(rt_rq)) {
+               struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+               if (!dl_se->dl_throttled)
+                       add_nr_running(rq_of_rt_rq(rt_rq), 1);
+       } else {
+               add_nr_running(rq_of_rt_rq(rt_rq), 1);
+       }
+
        inc_rt_migration(rt_se, rt_rq);
 }
 
@@ -523,11 +524,18 @@ static inline
 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-       WARN_ON(!rt_rq->rt_nr_running);
        rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
        rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
 
        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+       if (is_dl_group(rt_rq)) {
+               struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+               if (!dl_se->dl_throttled)
+                       sub_nr_running(rq_of_rt_rq(rt_rq), 1);
+       } else {
+               sub_nr_running(rq_of_rt_rq(rt_rq), 1);
+       }
        dec_rt_migration(rt_se, rt_rq);
 }
 
@@ -596,24 +604,49 @@ static void
 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
        struct sched_rt_entity *rt_se = &p->rt;
+       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 
        if (flags & ENQUEUE_WAKEUP)
                rt_se->timeout = 0;
 
-       enqueue_rt_entity(rt_se, flags);
+       /* Task arriving in an idle group of tasks. */
+       if (is_dl_group(rt_rq) && (rt_rq->rt_nr_running == 0)) {
+               struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+               if (!dl_se->dl_throttled) {
+                       enqueue_dl_entity(dl_se, dl_se, flags);
+                       resched_curr(rq);
+               }
+       }
+
+       enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
 
        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
-               enqueue_pushable_task(rq, p);
+               enqueue_pushable_task(rt_rq, p);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
        struct sched_rt_entity *rt_se = &p->rt;
+       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 
        update_curr_rt(rq);
        dequeue_rt_entity(rt_se, flags);
 
-       dequeue_pushable_task(rq, p);
+       dequeue_pushable_task(rt_rq_of_se(rt_se), p);
+
+       /* Last task of the task group. */
+       if (is_dl_group(rt_rq) && !rt_rq->rt_nr_running) {
+               struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+#ifndef CONFIG_RT_GROUP_SCHED
+               queue_pull_task(rq);
+#endif
+               if (!rt_rq->rt_nr_running) {
+                       dequeue_dl_entity(dl_se);
+                       resched_curr(rq);
+               }
+       }
 }
 
 /*
@@ -639,10 +672,8 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
        struct sched_rt_entity *rt_se = &p->rt;
        struct rt_rq *rt_rq;
 
-       for_each_sched_rt_entity(rt_se) {
-               rt_rq = rt_rq_of_se(rt_se);
-               requeue_rt_entity(rt_rq, rt_se, head);
-       }
+       rt_rq = rt_rq_of_se(rt_se);
+       requeue_rt_entity(rt_rq, rt_se, head);
 }
 
 static void yield_task_rt(struct rq *rq)
@@ -743,6 +774,30 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
  */
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
 {
+       if (is_dl_group(rt_rq_of_se(&p->rt)) &&
+           is_dl_group(rt_rq_of_se(&rq->curr->rt))) {
+               struct sched_dl_entity *dl_se, *curr_dl_se;
+
+               dl_se = dl_group_of(rt_rq_of_se(&p->rt));
+               curr_dl_se = dl_group_of(rt_rq_of_se(&rq->curr->rt));
+
+               if (dl_entity_preempt(dl_se, curr_dl_se)) {
+                       resched_curr(rq);
+                       return;
+               } else if (!dl_entity_preempt(curr_dl_se, dl_se)) {
+                       if (p->prio < rq->curr->prio) {
+                               resched_curr(rq);
+                               return;
+                       }
+               }
+               return;
+       } else if (is_dl_group(rt_rq_of_se(&p->rt))) {
+               resched_curr(rq);
+               return;
+       } else if (is_dl_group(rt_rq_of_se(&rq->curr->rt))) {
+               return;
+       }
+
        if (p->prio < rq->curr->prio) {
                resched_curr(rq);
                return;
@@ -766,7 +821,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
 #endif
 }
 
-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
                                                   struct rt_rq *rt_rq)
 {
        struct rt_prio_array *array = &rt_rq->active;
@@ -831,7 +886,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        if (prev->sched_class == &rt_sched_class)
                update_curr_rt(rq);
 
-       if (!rt_rq->rt_queued)
+       if (!rt_rq->rt_nr_running)
                return NULL;
 
        put_prev_task(rq, prev);
@@ -839,7 +894,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        p = _pick_next_task_rt(rq);
 
        /* The running task is never eligible for pushing */
-       dequeue_pushable_task(rq, p);
+       dequeue_pushable_task(rt_rq, p);
 
        queue_push_tasks(rq);
 
@@ -848,6 +903,8 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
+       struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
+
        update_curr_rt(rq);
 
        /*
@@ -855,7 +912,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
         * if it is still active
         */
        if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
-               enqueue_pushable_task(rq, p);
+               enqueue_pushable_task(rt_rq, p);
+
 }
 
 #ifdef CONFIG_SMP
@@ -863,9 +921,9 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 /* Only try algorithms three times */
 #define RT_MAX_TRIES 3
 
-static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
+static int pick_rt_task(struct rt_rq *rt_rq, struct task_struct *p, int cpu)
 {
-       if (!task_running(rq, p) &&
+       if (!task_running(rq_of_rt_rq(rt_rq), p) &&
            cpumask_test_cpu(cpu, &p->cpus_allowed))
                return 1;
        return 0;
@@ -875,16 +933,17 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
  * Return the highest pushable rq's task, which is suitable to be executed
  * on the cpu, NULL otherwise
  */
-static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
+static
+struct task_struct *pick_highest_pushable_task(struct rt_rq *rt_rq, int cpu)
 {
-       struct plist_head *head = &rq->rt.pushable_tasks;
+       struct plist_head *head = &rt_rq->pushable_tasks;
        struct task_struct *p;
 
-       if (!has_pushable_tasks(rq))
+       if (!has_pushable_tasks(rt_rq))
                return NULL;
 
        plist_for_each_entry(p, head, pushable_tasks) {
-               if (pick_rt_task(rq, p, cpu))
+               if (pick_rt_task(rt_rq, p, cpu))
                        return p;
        }
 
@@ -1024,14 +1083,15 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
        return lowest_rq;
 }
 
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
+static struct task_struct *pick_next_pushable_task(struct rt_rq *rt_rq)
 {
+       struct rq *rq = rq_of_rt_rq(rt_rq);
        struct task_struct *p;
 
-       if (!has_pushable_tasks(rq))
+       if (!has_pushable_tasks(rt_rq))
                return NULL;
 
-       p = plist_first_entry(&rq->rt.pushable_tasks,
+       p = plist_first_entry(&rt_rq->pushable_tasks,
                              struct task_struct, pushable_tasks);
 
        BUG_ON(rq->cpu != task_cpu(p));
@@ -1058,7 +1118,7 @@ static int push_rt_task(struct rq *rq)
        if (!rq->rt.overloaded)
                return 0;
 
-       next_task = pick_next_pushable_task(rq);
+       next_task = pick_next_pushable_task(&rq->rt);
        if (!next_task)
                return 0;
 
@@ -1093,7 +1153,7 @@ static int push_rt_task(struct rq *rq)
                 * run-queue and is also still the next task eligible for
                 * pushing.
                 */
-               task = pick_next_pushable_task(rq);
+               task = pick_next_pushable_task(&rq->rt);
                if (task_cpu(next_task) == rq->cpu && task == next_task) {
                        /*
                         * The task hasn't migrated, and is still the next
@@ -1331,7 +1391,7 @@ static void try_to_push_tasks(void *arg)
        src_rq = rq_of_rt_rq(rt_rq);
 
 again:
-       if (has_pushable_tasks(rq)) {
+       if (has_pushable_tasks(&rq->rt)) {
                raw_spin_lock(&rq->lock);
                push_rt_task(rq);
                raw_spin_unlock(&rq->lock);
@@ -1382,6 +1442,7 @@ static void pull_rt_task(struct rq *this_rq)
        int this_cpu = this_rq->cpu, cpu;
        bool resched = false;
        struct task_struct *p;
+       struct rt_rq *src_rt_rq;
        struct rq *src_rq;
 
        if (likely(!rt_overloaded(this_rq)))
@@ -1405,6 +1466,7 @@ static void pull_rt_task(struct rq *this_rq)
                        continue;
 
                src_rq = cpu_rq(cpu);
+               src_rt_rq = &src_rq->rt;
 
                /*
                 * Don't bother taking the src_rq->lock if the next highest
@@ -1413,7 +1475,7 @@ static void pull_rt_task(struct rq *this_rq)
                 * logically higher, the src_rq will push this task away.
                 * And if its going logically lower, we do not care
                 */
-               if (src_rq->rt.highest_prio.next >=
+               if (src_rt_rq->highest_prio.next >=
                    this_rq->rt.highest_prio.curr)
                        continue;
 
@@ -1428,7 +1490,7 @@ static void pull_rt_task(struct rq *this_rq)
                 * We can pull only a task, which is pushable
                 * on its rq, and no others.
                 */
-               p = pick_highest_pushable_task(src_rq, this_cpu);
+               p = pick_highest_pushable_task(src_rt_rq, this_cpu);
 
                /*
                 * Do we have an RT task that preempts
@@ -1469,19 +1531,44 @@ static void pull_rt_task(struct rq *this_rq)
                resched_curr(this_rq);
 }
 
+#ifdef CONFIG_RT_GROUP_SCHED
+int group_push_rt_task(struct rt_rq *rt_rq)
+{
+       struct rq *rq = rq_of_rt_rq(rt_rq);
+
+       if (is_dl_group(rt_rq))
+               return 0;
+
+       return push_rt_task(rq);
+}
+
+void group_push_rt_tasks(struct rt_rq *rt_rq)
+{
+       while (group_push_rt_task(rt_rq))
+               ;
+}
+#else
+void group_push_rt_tasks(struct rt_rq *rt_rq)
+{
+       push_rt_tasks(rq_of_rt_rq(rt_rq));
+}
+#endif
+
 /*
  * If we are not running and we are not going to reschedule soon, we should
  * try to push tasks away now
  */
 static void task_woken_rt(struct rq *rq, struct task_struct *p)
 {
+       struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
+
        if (!task_running(rq, p) &&
            !test_tsk_need_resched(rq->curr) &&
            p->nr_cpus_allowed > 1 &&
            (dl_task(rq->curr) || rt_task(rq->curr)) &&
            (rq->curr->nr_cpus_allowed < 2 ||
             rq->curr->prio <= p->prio))
-               push_rt_tasks(rq);
+               group_push_rt_tasks(rt_rq);
 }
 
 /* Assumes rq->lock is held */
@@ -1508,6 +1595,8 @@ static void rq_offline_rt(struct rq *rq)
  */
 static void switched_from_rt(struct rq *rq, struct task_struct *p)
 {
+       struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
+
        /*
         * If there are other RT tasks then we will reschedule
         * and the scheduling of the other RT tasks will handle
@@ -1515,10 +1604,12 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
         * we may need to handle the pulling of RT tasks
         * now.
         */
-       if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+       if (!task_on_rq_queued(p) || rt_rq->rt_nr_running)
                return;
 
+#ifndef CONFIG_RT_GROUP_SCHED
        queue_pull_task(rq);
+#endif
 }
 
 void __init init_sched_rt_class(void)
@@ -1548,8 +1639,16 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
         */
        if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
+#ifndef CONFIG_RT_GROUP_SCHED
                if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
                        queue_push_tasks(rq);
+#else
+               if (rt_rq_of_se(&p->rt)->overloaded) {
+               } else {
+                       if (p->prio < rq->curr->prio)
+                               resched_curr(rq);
+               }
+#endif
 #endif /* CONFIG_SMP */
                if (p->prio < rq->curr->prio)
                        resched_curr(rq);
@@ -1563,6 +1662,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 static void
 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 {
+#ifdef CONFIG_SMP
+       struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
+#endif
+
        if (!task_on_rq_queued(p))
                return;
 
@@ -1573,13 +1676,14 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
                 * may need to pull tasks to this runqueue.
                 */
                if (oldprio < p->prio)
+#ifndef CONFIG_RT_GROUP_SCHED
                        queue_pull_task(rq);
-
+#endif
                /*
                 * If there's a higher priority task waiting to run
                 * then reschedule.
                 */
-               if (p->prio > rq->rt.highest_prio.curr)
+               if (p->prio > rt_rq->highest_prio.curr)
                        resched_curr(rq);
 #else
                /* For UP simply resched on drop of prio */
@@ -1628,6 +1732,14 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
        struct sched_rt_entity *rt_se = &p->rt;
 
        update_curr_rt(rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+       if (is_dl_group(rt_rq_of_se(rt_se))) {
+               struct sched_dl_entity *dl_se = dl_group_of(rt_rq_of_se(rt_se));
+
+               if (hrtick_enabled(rq) && queued && dl_se->runtime > 0)
+                       start_hrtick_dl(rq, dl_se);
+       }
+#endif
 
        watchdog(rq, p);
 
@@ -1647,23 +1759,22 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
         * Requeue to the end of queue if we (and all of our ancestors) are not
         * the only element on the queue
         */
-       for_each_sched_rt_entity(rt_se) {
-               if (rt_se->run_list.prev != rt_se->run_list.next) {
-                       requeue_task_rt(rq, p, 0);
-                       resched_curr(rq);
-                       return;
-               }
+       if (rt_se->run_list.prev != rt_se->run_list.next) {
+               requeue_task_rt(rq, p, 0);
+               set_tsk_need_resched(p);
+               return;
        }
 }
 
 static void set_curr_task_rt(struct rq *rq)
 {
        struct task_struct *p = rq->curr;
+       struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
 
        p->se.exec_start = rq_clock_task(rq);
 
        /* The running task is never eligible for pushing */
-       dequeue_pushable_task(rq, p);
+       dequeue_pushable_task(rt_rq, p);
 }
 
 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a4c4a18..528b41c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,8 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
  */
 #define DL_SCALE (10)
 
+unsigned long to_ratio(u64 period, u64 runtime);
+
 /*
  * These are the 'tuning knobs' of the scheduler:
  */
@@ -228,12 +230,6 @@ void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
        dl_b->total_bw += tsk_bw;
 }
 
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
-       return dl_b->bw != -1 &&
-              dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
 
 extern void init_dl_bw(struct dl_bw *dl_b);
 
@@ -286,9 +282,14 @@ struct task_group {
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
-       struct sched_rt_entity **rt_se;
+       /*
+        * The scheduling entities for the task group are managed as a single
+        * sched_dl_entity, each of them sharing the same dl_bandwidth.
+        */
+       struct sched_dl_entity **dl_se;
        struct rt_rq **rt_rq;
 
+       struct dl_bandwidth dl_bandwidth;
 #endif
 
        struct rcu_head rcu;
@@ -354,8 +355,8 @@ extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 extern void free_rt_sched_group(struct task_group *tg);
 extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
 extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-               struct sched_rt_entity *rt_se, int cpu,
-               struct sched_rt_entity *parent);
+               struct sched_dl_entity *dl_se, int cpu,
+               struct sched_dl_entity *parent);
 
 extern struct task_group *sched_create_group(struct task_group *parent);
 extern void sched_online_group(struct task_group *tg,
@@ -383,6 +384,21 @@ struct cfs_bandwidth { };
 
 #endif /* CONFIG_CGROUP_SCHED */
 
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+       u64 dl_groups_root = 0;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       dl_groups_root = to_ratio(root_task_group.dl_bandwidth.dl_period,
+                                 root_task_group.dl_bandwidth.dl_runtime);
+#endif
+       return dl_b->bw != -1 &&
+              dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw
+                                       + dl_groups_root * cpus;
+}
+
+
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
        struct load_weight load;
@@ -484,7 +500,6 @@ struct rt_rq {
 #endif
 #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
-       unsigned long rt_nr_total;
        int overloaded;
        struct plist_head pushable_tasks;
 #ifdef HAVE_RT_PUSH_IPI
@@ -494,14 +509,11 @@ struct rt_rq {
        raw_spinlock_t push_lock;
 #endif
 #endif /* CONFIG_SMP */
-       int rt_queued;
 
 #ifdef CONFIG_RT_GROUP_SCHED
-       unsigned long rt_nr_boosted;
-
-       struct rq *rq;
        struct task_group *tg;
 #endif
+       struct rq *rq;
 };
 
 /* Deadline class' related fields in a runqueue */
@@ -512,6 +524,12 @@ struct dl_rq {
 
        unsigned long dl_nr_running;
 
+#ifdef CONFIG_RT_GROUP_SCHED
+       unsigned long dl_nr_total;
+       struct rt_rq *rq_to_push_from;
+       struct rt_rq *rq_to_pull_to;
+#endif
+
 #ifdef CONFIG_SMP
        /*
         * Deadline values of the currently executing and the
@@ -1106,7 +1124,8 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 
 #ifdef CONFIG_RT_GROUP_SCHED
        p->rt.rt_rq  = tg->rt_rq[cpu];
-       p->rt.parent = tg->rt_se[cpu];
+       p->rt.parent = tg->dl_se[cpu];
+       p->dl.dl_rq  = &cpu_rq(cpu)->dl;
 #endif
 }
 
@@ -1461,8 +1480,6 @@ extern struct dl_bandwidth def_dl_bandwidth;
 extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
 extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 
-unsigned long to_ratio(u64 period, u64 runtime);
-
 extern void init_entity_runnable_average(struct sched_entity *se);
 extern void post_init_entity_util_avg(struct sched_entity *se);
 
@@ -1513,6 +1530,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 
 static inline void sub_nr_running(struct rq *rq, unsigned count)
 {
+       BUG_ON(rq->nr_running < count);
        rq->nr_running -= count;
        /* Check if we still need preemption */
        sched_update_tick_dependency(rq);
@@ -1864,6 +1882,71 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 
 #endif
 
+#ifdef CONFIG_RT_GROUP_SCHED
+static inline int is_dl_group(struct rt_rq *rt_rq)
+{
+       return rt_rq->tg != &root_task_group;
+}
+
+/*
+ * Return the scheduling entity of this group of tasks.
+ */
+static inline struct sched_dl_entity *dl_group_of(struct rt_rq *rt_rq)
+{
+       BUG_ON(!is_dl_group(rt_rq));
+
+       return rt_rq->tg->dl_se[cpu_of(rt_rq->rq)];
+}
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+       return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *rt_rq = rt_se->rt_rq;
+
+       return rt_rq->rq;
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+       return rt_se->rt_rq;
+}
+#else
+static inline int is_dl_group(struct rt_rq *rt_rq)
+{
+       return 0;
+}
+
+static inline struct sched_dl_entity *dl_group_of(struct rt_rq *rt_rq)
+{
+       return NULL;
+}
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+       return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+       struct task_struct *p = rt_task_of(rt_se);
+
+       return task_rq(p);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+       struct rq *rq = rq_of_rt_se(rt_se);
+
+       return &rq->rt;
+}
+#endif
+
+
+
 extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
 
@@ -1986,3 +2069,28 @@ static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
 static inline void arch_task_migrate(struct task_struct *p) { }
 #endif
 
+int group_pull_rt_task(struct rt_rq *rt_rq);
+int group_push_rt_task(struct rt_rq *rt_rq);
+
+struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq);
+#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_SMP)
+void dequeue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p);
+#else
+static inline
+void dequeue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
+{
+}
+#endif
+
+int is_dl_group(struct rt_rq *rt_rq);
+
+void dequeue_dl_entity(struct sched_dl_entity *dl_se);
+
+void init_dl_timer(struct sched_dl_entity *dl_se);
+
+void enqueue_dl_entity(struct sched_dl_entity *dl_se,
+                             struct sched_dl_entity *pi_se, int flags);
+int dl_runtime_exceeded(struct sched_dl_entity *dl_se);
+int start_dl_timer(struct sched_dl_entity *dl_se);
+void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se);
+
-- 
2.7.4

[RFC PATCH 2/3] sched/deadline: Hierarchical scheduling with DL on top of RT

Reply via email to