This patch introduces the core changes in CFS required to accomplish
group fairness at higher levels. It also modifies load balance interface
between classes a bit, so that move_tasks (which is centric to load
balance) can be reused to balance between runqueues of various types
(struct rq in case of SCHED_RT tasks, struct lrq in case of
SCHED_NORMAL/BATCH tasks).

Signed-off-by : Srivatsa Vaddagiri <[EMAIL PROTECTED]>

---
 include/linux/sched.h   |   17 ++
 kernel/sched.c          |  174 ++++++++++++++++------------
 kernel/sched_debug.c    |   31 +++--
 kernel/sched_fair.c     |  295 ++++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched_idletask.c |   11 +
 kernel/sched_rt.c       |   47 ++++++-
 6 files changed, 464 insertions(+), 111 deletions(-)

Index: current/include/linux/sched.h
===================================================================
--- current.orig/include/linux/sched.h
+++ current/include/linux/sched.h
@@ -134,8 +134,11 @@ extern unsigned long nr_iowait(void);
 extern unsigned long weighted_cpuload(const int cpu);
 
 struct seq_file;
+struct cfs_rq;
 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
 extern void proc_sched_set_task(struct task_struct *p);
+extern void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now);
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -865,8 +868,13 @@ struct sched_class {
        struct task_struct * (*pick_next_task) (struct rq *rq, u64 now);
        void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now);
 
-       struct task_struct * (*load_balance_start) (struct rq *rq);
-       struct task_struct * (*load_balance_next) (struct rq *rq);
+       int (*load_balance) (struct rq *this_rq, int this_cpu,
+                       struct rq *busiest,
+                       unsigned long max_nr_move, unsigned long max_load_move,
+                       struct sched_domain *sd, enum cpu_idle_type idle,
+                       int *all_pinned, unsigned long *total_load_moved);
+
+       void (*set_curr_task) (struct rq *rq);
        void (*task_tick) (struct rq *rq, struct task_struct *p);
        void (*task_new) (struct rq *rq, struct task_struct *p);
 };
@@ -899,6 +907,11 @@ struct sched_entity {
        s64 fair_key;
        s64 sum_wait_runtime, sum_sleep_runtime;
        unsigned long wait_runtime_overruns, wait_runtime_underruns;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       struct sched_entity *parent;
+       struct cfs_rq *cfs_rq,  /* rq on which this entity is (to be) queued */
+                     *my_q;    /* rq "owned" by this entity/group */
+#endif
 };
 
 struct task_struct {
Index: current/kernel/sched.c
===================================================================
--- current.orig/kernel/sched.c
+++ current/kernel/sched.c
@@ -133,6 +133,22 @@ struct cfs_rq {
        struct rb_root tasks_timeline;
        struct rb_node *rb_leftmost;
        struct rb_node *rb_load_balance_curr;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       /* 'curr' points to currently running entity on this cfs_rq.
+        * It is set to NULL otherwise (i.e when none are currently running).
+        */
+       struct sched_entity *curr;
+       struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
+
+       /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+        * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+        * (like users, containers etc.)
+        *
+        * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+        * list is used during load balance.
+        */
+       struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+#endif
 };
 
 /* Real-Time classes' related field in a runqueue: */
@@ -168,6 +184,9 @@ struct rq {
        u64 nr_switches;
 
        struct cfs_rq cfs;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
+#endif
        struct rt_rq  rt;
 
        /*
@@ -342,6 +361,16 @@ static inline unsigned long long rq_cloc
 #define task_rq(p)             cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* Change a task's ->cfs_rq if it moves across CPUs */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+       p->se.cfs_rq = &task_rq(p)->cfs;
+}
+#else
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+#endif
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)     do { } while (0)
 #endif
@@ -738,6 +767,21 @@ static inline void dec_nr_running(struct
 }
 
 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
+#ifdef CONFIG_SMP
+
+struct rq_iterator {
+       void *arg;
+       struct task_struct *(*start)(void *);
+       struct task_struct *(*next)(void *);
+};
+
+static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                     unsigned long max_nr_move, unsigned long max_load_move,
+                     struct sched_domain *sd, enum cpu_idle_type idle,
+                     int *all_pinned, unsigned long *load_moved,
+                     int this_best_prio, int best_prio, int best_prio_seen,
+                     struct rq_iterator *iterator);
+#endif
 
 #include "sched_stats.h"
 #include "sched_rt.c"
@@ -894,6 +938,7 @@ unsigned long weighted_cpuload(const int
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
        task_thread_info(p)->cpu = cpu;
+       set_task_cfs_rq(p);
 }
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -919,6 +964,7 @@ void set_task_cpu(struct task_struct *p,
 
        task_thread_info(p)->cpu = new_cpu;
 
+       set_task_cfs_rq(p);
 }
 
 struct migration_req {
@@ -2003,89 +2049,26 @@ int can_migrate_task(struct task_struct 
        return 1;
 }
 
-/*
- * Load-balancing iterator: iterate through the hieararchy of scheduling
- * classes, starting with the highest-prio one:
- */
-
-struct task_struct * load_balance_start(struct rq *rq)
-{
-       struct sched_class *class = sched_class_highest;
-       struct task_struct *p;
-
-       do {
-               p = class->load_balance_start(rq);
-               if (p) {
-                       rq->load_balance_class = class;
-                       return p;
-               }
-               class = class->next;
-       } while (class);
-
-       return NULL;
-}
-
-struct task_struct * load_balance_next(struct rq *rq)
-{
-       struct sched_class *class = rq->load_balance_class;
-       struct task_struct *p;
-
-       p = class->load_balance_next(rq);
-       if (p)
-               return p;
-       /*
-        * Pick up the next class (if any) and attempt to start
-        * the iterator there:
-        */
-       while ((class = class->next)) {
-               p = class->load_balance_start(rq);
-               if (p) {
-                       rq->load_balance_class = class;
-                       return p;
-               }
-       }
-       return NULL;
-}
-
-#define rq_best_prio(rq) (rq)->curr->prio
-
-/*
- * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
- * load from busiest to this_rq, as part of a balancing operation within
- * "domain". Returns the number of tasks moved.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                      unsigned long max_nr_move, unsigned long max_load_move,
                      struct sched_domain *sd, enum cpu_idle_type idle,
-                     int *all_pinned)
+                     int *all_pinned, unsigned long *load_moved,
+                     int this_best_prio, int best_prio, int best_prio_seen,
+                     struct rq_iterator *iterator)
 {
-       int pulled = 0, pinned = 0, this_best_prio, best_prio,
-           best_prio_seen, skip_for_load;
+       int pulled = 0, pinned = 0, skip_for_load;
        struct task_struct *p;
-       long rem_load_move;
+       long rem_load_move = max_load_move;
 
        if (max_nr_move == 0 || max_load_move == 0)
                goto out;
 
-       rem_load_move = max_load_move;
        pinned = 1;
-       this_best_prio = rq_best_prio(this_rq);
-       best_prio = rq_best_prio(busiest);
-       /*
-        * Enable handling of the case where there is more than one task
-        * with the best priority.   If the current running task is one
-        * of those with prio==best_prio we know it won't be moved
-        * and therefore it's safe to override the skip (based on load) of
-        * any task we find with that prio.
-        */
-       best_prio_seen = best_prio == busiest->curr->prio;
 
        /*
         * Start the load-balancing iterator:
         */
-       p = load_balance_start(busiest);
+       p = iterator->start(iterator->arg);
 next:
        if (!p)
                goto out;
@@ -2102,7 +2085,7 @@ next:
            !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
 
                best_prio_seen |= p->prio == best_prio;
-               p = load_balance_next(busiest);
+               p = iterator->next(iterator->arg);
                goto next;
        }
 
@@ -2117,7 +2100,7 @@ next:
        if (pulled < max_nr_move && rem_load_move > 0) {
                if (p->prio < this_best_prio)
                        this_best_prio = p->prio;
-               p = load_balance_next(busiest);
+               p = iterator->next(iterator->arg);
                goto next;
        }
 out:
@@ -2130,10 +2113,40 @@ out:
 
        if (all_pinned)
                *all_pinned = pinned;
+       *load_moved = max_load_move - rem_load_move;
        return pulled;
 }
 
 /*
+ * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                     unsigned long max_nr_move, unsigned long max_load_move,
+                     struct sched_domain *sd, enum cpu_idle_type idle,
+                     int *all_pinned)
+{
+       struct sched_class *class = sched_class_highest;
+       unsigned long load_moved, total_nr_moved = 0, nr_moved;
+       long rem_load_move = max_load_move;
+
+       do {
+               nr_moved = class->load_balance(this_rq, this_cpu, busiest,
+                               max_nr_move, (unsigned long)rem_load_move,
+                               sd, idle, all_pinned, &load_moved);
+               total_nr_moved += nr_moved;
+               max_nr_move -= nr_moved;
+               rem_load_move -= load_moved;
+               class = class->next;
+       } while (class && max_nr_move && rem_load_move > 0);
+
+       return total_nr_moved;
+}
+
+/*
  * find_busiest_group finds and returns the busiest CPU group within the
  * domain. It calculates and returns the amount of weighted load which
  * should be moved to restore balance via the imbalance parameter.
@@ -6184,6 +6197,15 @@ int in_sched_functions(unsigned long add
                && addr < (unsigned long)__sched_text_end);
 }
 
+static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+{
+       cfs_rq->tasks_timeline = RB_ROOT;
+       cfs_rq->fair_clock = 1;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       cfs_rq->rq = rq;
+#endif
+}
+
 void __init sched_init(void)
 {
        int highest_cpu = 0;
@@ -6204,10 +6226,14 @@ void __init sched_init(void)
                spin_lock_init(&rq->lock);
                lockdep_set_class(&rq->lock, &rq->rq_lock_key);
                rq->nr_running = 0;
-               rq->cfs.tasks_timeline = RB_ROOT;
-               rq->clock = rq->cfs.fair_clock = 1;
+               rq->clock = 1;
                rq->ls.load_update_last = sched_clock();
                rq->ls.load_update_start = sched_clock();
+               init_cfs_rq(&rq->cfs, rq);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+               INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+               list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+#endif
 
                for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
                        rq->cpu_load[j] = 0;
Index: current/kernel/sched_debug.c
===================================================================
--- current.orig/kernel/sched_debug.c
+++ current/kernel/sched_debug.c
@@ -80,15 +80,17 @@ static void print_rq(struct seq_file *m,
        read_unlock_irq(&tasklist_lock);
 }
 
-static void print_rq_runtime_sum(struct seq_file *m, struct rq *rq)
+static void
+print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
        s64 wait_runtime_rq_sum = 0;
        struct task_struct *p;
        struct rb_node *curr;
        unsigned long flags;
+       struct rq *rq = &per_cpu(runqueues, cpu);
 
        spin_lock_irqsave(&rq->lock, flags);
-       curr = first_fair(&rq->cfs);
+       curr = first_fair(cfs_rq);
        while (curr) {
                p = rb_entry(curr, struct task_struct, se.run_node);
                wait_runtime_rq_sum += p->se.wait_runtime;
@@ -101,6 +103,23 @@ static void print_rq_runtime_sum(struct 
                (long long)wait_runtime_rq_sum);
 }
 
+void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
+{
+       SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq);
+
+#define P(x) \
+       SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(cfs_rq->x))
+
+       P(fair_clock);
+       P(exec_clock);
+       P(wait_runtime);
+       P(wait_runtime_overruns);
+       P(wait_runtime_underruns);
+#undef P
+
+       print_cfs_rq_runtime_sum(m, cpu, cfs_rq);
+}
+
 static void print_cpu(struct seq_file *m, int cpu, u64 now)
 {
        struct rq *rq = &per_cpu(runqueues, cpu);
@@ -136,18 +155,14 @@ static void print_cpu(struct seq_file *m
        P(clock_overflows);
        P(clock_unstable_events);
        P(clock_max_delta);
-       P(cfs.fair_clock);
-       P(cfs.exec_clock);
-       P(cfs.wait_runtime);
-       P(cfs.wait_runtime_overruns);
-       P(cfs.wait_runtime_underruns);
        P(cpu_load[0]);
        P(cpu_load[1]);
        P(cpu_load[2]);
        P(cpu_load[3]);
        P(cpu_load[4]);
 #undef P
-       print_rq_runtime_sum(m, rq);
+
+       print_cfs_stats(m, cpu, now);
 
        print_rq(m, rq, cpu, now);
 }
Index: current/kernel/sched_fair.c
===================================================================
--- current.orig/kernel/sched_fair.c
+++ current/kernel/sched_fair.c
@@ -70,6 +70,31 @@ extern struct sched_class fair_sched_cla
  * CFS operations on generic schedulable entities:
  */
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* cpu runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+       return cfs_rq->rq;
+}
+
+/* currently running entity (if any) on this cfs_rq */
+static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
+{
+       return cfs_rq->curr;
+}
+
+/* An entity is a task if it doesn't "own" a runqueue */
+#define entity_is_task(se)     (!se->my_q)
+
+static inline void
+set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       cfs_rq->curr = se;
+}
+
+#else  /* CONFIG_FAIR_GROUP_SCHED */
+
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 {
        return container_of(cfs_rq, struct rq, cfs);
@@ -87,6 +112,11 @@ static inline struct sched_entity *cfs_r
 
 #define entity_is_task(se)     1
 
+static inline void
+set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 static inline struct task_struct *task_of(struct sched_entity *se)
 {
        return container_of(se, struct task_struct, se);
@@ -588,10 +618,9 @@ __check_preempt_curr_fair(struct cfs_rq 
                resched_task(rq_of(cfs_rq)->curr);
 }
 
-static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
+static inline void
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 {
-       struct sched_entity *se = __pick_next_entity(cfs_rq);
-
        /*
         * Any task has to be enqueued before it get to execute on
         * a CPU. So account for the time it spent waiting on the
@@ -601,6 +630,14 @@ static struct sched_entity * pick_next_e
         */
        update_stats_wait_end(cfs_rq, se, now);
        update_stats_curr_start(cfs_rq, se, now);
+       set_cfs_rq_curr(cfs_rq, se);
+}
+
+static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
+{
+       struct sched_entity *se = __pick_next_entity(cfs_rq);
+
+       set_next_entity(cfs_rq, se, now);
 
        return se;
 }
@@ -638,6 +675,7 @@ put_prev_entity(struct cfs_rq *cfs_rq, s
 
        if (prev->on_rq)
                update_stats_wait_start(cfs_rq, prev, now);
+       set_cfs_rq_curr(cfs_rq, NULL);
 }
 
 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
@@ -677,11 +715,90 @@ static void entity_tick(struct cfs_rq *c
  * CFS operations on tasks:
  */
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* Walk up scheduling entities hierarchy */
+#define for_each_sched_entity(se) \
+               for (; se; se = se->parent)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+       return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+       return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+       return grp->my_q;
+}
+
+/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
+ * another cpu ('this_cpu')
+ */
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+       /* A later patch will take group into account */
+       return &cpu_rq(this_cpu)->cfs;
+}
+
+/* Iterate thr' all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+       list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+/* Do the two (enqueued) tasks belong to the same group ? */
+static inline int is_same_group(struct task_struct *curr, struct task_struct 
*p)
+{
+       if (curr->se.cfs_rq == p->se.cfs_rq)
+               return 1;
+
+       return 0;
+}
+
+#else  /* CONFIG_FAIR_GROUP_SCHED */
+
+#define for_each_sched_entity(se) \
+               for (; se; se = NULL)
+
 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 {
        return &task_rq(p)->cfs;
 }
 
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+       struct task_struct *p = task_of(se);
+       struct rq *rq = task_rq(p);
+
+       return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+       return NULL;
+}
+
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+       return &cpu_rq(this_cpu)->cfs;
+}
+
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+               for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+static inline int is_same_group(struct task_struct *curr, struct task_struct 
*p)
+{
+       return 1;
+}
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -690,10 +807,15 @@ static inline struct cfs_rq *task_cfs_rq
 static void
 enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
 {
-       struct cfs_rq *cfs_rq = task_cfs_rq(p);
+       struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
 
-       enqueue_entity(cfs_rq, se, wakeup, now);
+       for_each_sched_entity(se) {
+               if (se->on_rq)
+                       break;
+               cfs_rq = cfs_rq_of(se);
+               enqueue_entity(cfs_rq, se, wakeup, now);
+       }
 }
 
 /*
@@ -704,10 +826,16 @@ enqueue_task_fair(struct rq *rq, struct 
 static void
 dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
 {
-       struct cfs_rq *cfs_rq = task_cfs_rq(p);
+       struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
 
-       dequeue_entity(cfs_rq, se, sleep, now);
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+               dequeue_entity(cfs_rq, se, sleep, now);
+               /* Don't dequeue parent if it has other entities besides us */
+               if (cfs_rq->load.weight)
+                       break;
+       }
 }
 
 /*
@@ -755,7 +883,8 @@ static void check_preempt_curr_fair(stru
        if (unlikely(p->policy == SCHED_BATCH))
                gran = sysctl_sched_batch_wakeup_granularity;
 
-       __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
+       if (is_same_group(curr, p))
+               __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
 }
 
 static struct task_struct * pick_next_task_fair(struct rq *rq, u64 now)
@@ -766,7 +895,10 @@ static struct task_struct * pick_next_ta
        if (unlikely(!cfs_rq->nr_running))
                return NULL;
 
-       se = pick_next_entity(cfs_rq, now);
+       do {
+               se = pick_next_entity(cfs_rq, now);
+               cfs_rq = group_cfs_rq(se);
+       } while (cfs_rq);
 
        return task_of(se);
 }
@@ -776,7 +908,13 @@ static struct task_struct * pick_next_ta
  */
 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 
now)
 {
-       put_prev_entity(task_cfs_rq(prev), &prev->se, now);
+       struct cfs_rq *cfs_rq;
+       struct sched_entity *se = &prev->se;
+
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+               put_prev_entity(cfs_rq, se, now);
+       }
 }
 
 /**************************************************
@@ -791,7 +929,7 @@ static void put_prev_task_fair(struct rq
  * the current task:
  */
 static inline struct task_struct *
-__load_balance_iterator(struct rq *rq, struct rb_node *curr)
+__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
 {
        struct task_struct *p;
 
@@ -799,19 +937,104 @@ __load_balance_iterator(struct rq *rq, s
                return NULL;
 
        p = rb_entry(curr, struct task_struct, se.run_node);
-       rq->cfs.rb_load_balance_curr = rb_next(curr);
+       cfs_rq->rb_load_balance_curr = rb_next(curr);
 
        return p;
 }
 
-static struct task_struct * load_balance_start_fair(struct rq *rq)
+static struct task_struct * load_balance_start_fair(void *arg)
 {
-       return __load_balance_iterator(rq, first_fair(&rq->cfs));
+       struct cfs_rq *cfs_rq = arg;
+
+       return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
 }
 
-static struct task_struct * load_balance_next_fair(struct rq *rq)
+static struct task_struct * load_balance_next_fair(void *arg)
 {
-       return __load_balance_iterator(rq, rq->cfs.rb_load_balance_curr);
+       struct cfs_rq *cfs_rq = arg;
+
+       return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
+}
+
+static inline int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+{
+       struct sched_entity *curr;
+       struct task_struct *p;
+
+       if (!cfs_rq->nr_running)
+               return MAX_PRIO;
+
+       curr = __pick_next_entity(cfs_rq);
+       p = task_of(curr);
+
+       return p->prio;
+}
+
+static int
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                       unsigned long max_nr_move, unsigned long max_load_move,
+                       struct sched_domain *sd, enum cpu_idle_type idle,
+                       int *all_pinned, unsigned long *total_load_moved)
+{
+       struct cfs_rq *busy_cfs_rq;
+       unsigned long load_moved, total_nr_moved = 0, nr_moved;
+       long rem_load_move = max_load_move;
+       struct rq_iterator cfs_rq_iterator;
+
+       cfs_rq_iterator.start = load_balance_start_fair;
+       cfs_rq_iterator.next = load_balance_next_fair;
+
+       for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+               struct cfs_rq *this_cfs_rq;
+               long imbalance;
+               unsigned long maxload;
+               int this_best_prio, best_prio, best_prio_seen = 0;
+
+               this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+
+               imbalance = busy_cfs_rq->load.weight -
+                                                this_cfs_rq->load.weight;
+               /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+               if (imbalance <= 0)
+                       continue;
+
+               /* Don't pull more than imbalance/2 */
+               imbalance /= 2;
+               maxload = min(rem_load_move, imbalance);
+
+               this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+               best_prio = cfs_rq_best_prio(busy_cfs_rq);
+
+               /*
+                * Enable handling of the case where there is more than one task
+                * with the best priority. If the current running task is one
+                * of those with prio==best_prio we know it won't be moved
+                * and therefore it's safe to override the skip (based on load)
+                * of any task we find with that prio.
+                */
+               if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
+                       best_prio_seen = 1;
+
+               /* pass busy_cfs_rq argument into
+                * load_balance_[start|next]_fair iterators
+                */
+               cfs_rq_iterator.arg = busy_cfs_rq;
+               nr_moved = balance_tasks(this_rq, this_cpu, busiest,
+                               max_nr_move, maxload, sd, idle, all_pinned,
+                               &load_moved, this_best_prio, best_prio,
+                               best_prio_seen, &cfs_rq_iterator);
+
+               total_nr_moved += nr_moved;
+               max_nr_move -= nr_moved;
+               rem_load_move -= load_moved;
+
+               if (max_nr_move <= 0 || rem_load_move <= 0)
+                       break;
+       }
+
+       *total_load_moved = max_load_move - rem_load_move;
+
+       return total_nr_moved;
 }
 
 /*
@@ -819,7 +1042,13 @@ static struct task_struct * load_balance
  */
 static void task_tick_fair(struct rq *rq, struct task_struct *curr)
 {
-       entity_tick(task_cfs_rq(curr), &curr->se);
+       struct cfs_rq *cfs_rq;
+       struct sched_entity *se = &curr->se;
+
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+               entity_tick(cfs_rq, se);
+       }
 }
 
 /*
@@ -863,6 +1092,24 @@ static void task_new_fair(struct rq *rq,
        inc_nr_running(p, rq, now);
 }
 
+/* Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_curr_task_fair(struct rq *rq)
+{
+       struct task_struct *curr = rq->curr;
+       struct sched_entity *se = &curr->se;
+       struct cfs_rq *cfs_rq;
+       u64 now = rq_clock(rq);
+
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+               set_next_entity(cfs_rq, se, now);
+       }
+}
+
 /*
  * All the scheduling class methods:
  */
@@ -876,8 +1123,18 @@ struct sched_class fair_sched_class __re
        .pick_next_task         = pick_next_task_fair,
        .put_prev_task          = put_prev_task_fair,
 
-       .load_balance_start     = load_balance_start_fair,
-       .load_balance_next      = load_balance_next_fair,
+       .load_balance           = load_balance_fair,
+
+       .set_curr_task          = set_curr_task_fair,
        .task_tick              = task_tick_fair,
        .task_new               = task_new_fair,
 };
+
+void print_cfs_stats(struct seq_file *m, int cpu, u64 now)
+{
+       struct rq *rq = cpu_rq(cpu);
+       struct cfs_rq *cfs_rq;
+
+       for_each_leaf_cfs_rq(rq, cfs_rq)
+               print_cfs_rq(m, cpu, cfs_rq, now);
+}
Index: current/kernel/sched_idletask.c
===================================================================
--- current.orig/kernel/sched_idletask.c
+++ current/kernel/sched_idletask.c
@@ -37,9 +37,13 @@ static void put_prev_task_idle(struct rq
 {
 }
 
-static struct task_struct *load_balance_start_idle(struct rq *rq)
+static int
+load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                       unsigned long max_nr_move, unsigned long max_load_move,
+                       struct sched_domain *sd, enum cpu_idle_type idle,
+                       int *all_pinned, unsigned long *total_load_moved)
 {
-       return NULL;
+       return 0;
 }
 
 static void task_tick_idle(struct rq *rq, struct task_struct *curr)
@@ -60,8 +64,7 @@ struct sched_class idle_sched_class __re
        .pick_next_task         = pick_next_task_idle,
        .put_prev_task          = put_prev_task_idle,
 
-       .load_balance_start     = load_balance_start_idle,
-       /* no .load_balance_next for idle tasks */
+       .load_balance           = load_balance_idle,
 
        .task_tick              = task_tick_idle,
        /* no .task_new for idle tasks */
Index: current/kernel/sched_rt.c
===================================================================
--- current.orig/kernel/sched_rt.c
+++ current/kernel/sched_rt.c
@@ -107,8 +107,9 @@ static void put_prev_task_rt(struct rq *
  * achieve that by always pre-iterating before returning
  * the current task:
  */
-static struct task_struct * load_balance_start_rt(struct rq *rq)
+static struct task_struct * load_balance_start_rt(void *arg)
 {
+       struct rq *rq = arg;
        struct prio_array *array = &rq->rt.active;
        struct list_head *head, *curr;
        struct task_struct *p;
@@ -132,8 +133,9 @@ static struct task_struct * load_balance
        return p;
 }
 
-static struct task_struct * load_balance_next_rt(struct rq *rq)
+static struct task_struct * load_balance_next_rt(void *arg)
 {
+       struct rq *rq = arg;
        struct prio_array *array = &rq->rt.active;
        struct list_head *head, *curr;
        struct task_struct *p;
@@ -170,6 +172,44 @@ static struct task_struct * load_balance
        return p;
 }
 
+static int
+load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                       unsigned long max_nr_move, unsigned long max_load_move,
+                       struct sched_domain *sd, enum cpu_idle_type idle,
+                       int *all_pinned, unsigned long *load_moved)
+{
+       int this_best_prio, best_prio, best_prio_seen = 0;
+       int nr_moved;
+       struct rq_iterator rt_rq_iterator;
+
+       best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
+       this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
+
+       /*
+        * Enable handling of the case where there is more than one task
+        * with the best priority.   If the current running task is one
+        * of those with prio==best_prio we know it won't be moved
+        * and therefore it's safe to override the skip (based on load)
+        * of any task we find with that prio.
+        */
+       if (busiest->curr->prio == best_prio)
+               best_prio_seen = 1;
+
+       rt_rq_iterator.start = load_balance_start_rt;
+       rt_rq_iterator.next = load_balance_next_rt;
+       /* pass 'busiest' rq argument into
+        * load_balance_[start|next]_rt iterators
+        */
+       rt_rq_iterator.arg = busiest;
+
+       nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
+                       max_load_move, sd, idle, all_pinned, load_moved,
+                       this_best_prio, best_prio, best_prio_seen,
+                       &rt_rq_iterator);
+
+       return nr_moved;
+}
+
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {
        /*
@@ -207,8 +247,7 @@ static struct sched_class rt_sched_class
        .pick_next_task         = pick_next_task_rt,
        .put_prev_task          = put_prev_task_rt,
 
-       .load_balance_start     = load_balance_start_rt,
-       .load_balance_next      = load_balance_next_rt,
+       .load_balance           = load_balance_rt,
 
        .task_tick              = task_tick_rt,
        .task_new               = task_new_rt,
-- 
Regards,
vatsa

-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
ckrm-tech mailing list
https://lists.sourceforge.net/lists/listinfo/ckrm-tech

Reply via email to