sched: add RT task pushing

Linux Kernel Mailing List Fri, 25 Jan 2008 15:04:19 -0800

Gitweb:     
http://git.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=e8fa136262e1121288bb93befe2295928ffd240d
Commit:     e8fa136262e1121288bb93befe2295928ffd240d
Parent:     764a9d6fe4b52995c8aba277e3634385699354f4
Author:     Steven Rostedt <[EMAIL PROTECTED]>
AuthorDate: Fri Jan 25 21:08:05 2008 +0100
Committer:  Ingo Molnar <[EMAIL PROTECTED]>
CommitDate: Fri Jan 25 21:08:05 2008 +0100


    sched: add RT task pushing
    
    This patch adds an algorithm to push extra RT tasks off a run queue to
    other CPU runqueues.
    
    When more than one RT task is added to a run queue, this algorithm takes
    an assertive approach to push the RT tasks that are not running onto other
    run queues that have lower priority.  The way this works is that the highest
    RT task that is not running is looked at and we examine the runqueues on
    the CPUS for that tasks affinity mask. We find the runqueue with the lowest
    prio in the CPU affinity of the picked task, and if it is lower in prio than
    the picked task, we push the task onto that CPU runqueue.
    
    We continue pushing RT tasks off the current runqueue until we don't push 
any
    more.  The algorithm stops when the next highest RT task can't preempt any
    other processes on other CPUS.
    
    TODO: The algorithm may stop when there are still RT tasks that can be
     migrated. Specifically, if the highest non running RT task CPU affinity
     is restricted to CPUs that are running higher priority tasks, there may
     be a lower priority task queued that has an affinity with a CPU that is
     running a lower priority task that it could be migrated to.  This
     patch set does not address this issue.
    
    Note: checkpatch reveals two over 80 character instances. I'm not sure
     that breaking them up will help visually, so I left them as is.
    
    Signed-off-by: Steven Rostedt <[EMAIL PROTECTED]>
    Signed-off-by: Ingo Molnar <[EMAIL PROTECTED]>
---
 kernel/sched.c    |    8 ++-
 kernel/sched_rt.c |  225 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 231 insertions(+), 2 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 6185fa0..97cab60 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1952,6 +1952,8 @@ static void finish_task_switch(struct rq *rq, struct 
task_struct *prev)
        prev_state = prev->state;
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
+       schedule_tail_balance_rt(rq);
+
        fire_sched_in_preempt_notifiers(current);
        if (mm)
                mmdrop(mm);
@@ -2185,11 +2187,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq 
*rq2)
 /*
  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
  */
-static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(this_rq->lock)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
 {
+       int ret = 0;
+
        if (unlikely(!irqs_disabled())) {
                /* printk() doesn't work good under rq->lock */
                spin_unlock(&this_rq->lock);
@@ -2200,9 +2204,11 @@ static void double_lock_balance(struct rq *this_rq, 
struct rq *busiest)
                        spin_unlock(&this_rq->lock);
                        spin_lock(&busiest->lock);
                        spin_lock(&this_rq->lock);
+                       ret = 1;
                } else
                        spin_lock(&busiest->lock);
        }
+       return ret;
 }
 
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 136c285..7815e90 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -137,6 +137,227 @@ static void put_prev_task_rt(struct rq *rq, struct 
task_struct *p)
 }
 
 #ifdef CONFIG_SMP
+/* Only try algorithms three times */
+#define RT_MAX_TRIES 3
+
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
+
+/* Return the second highest RT task, NULL otherwise */
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
+{
+       struct rt_prio_array *array = &rq->rt.active;
+       struct task_struct *next;
+       struct list_head *queue;
+       int idx;
+
+       assert_spin_locked(&rq->lock);
+
+       if (likely(rq->rt.rt_nr_running < 2))
+               return NULL;
+
+       idx = sched_find_first_bit(array->bitmap);
+       if (unlikely(idx >= MAX_RT_PRIO)) {
+               WARN_ON(1); /* rt_nr_running is bad */
+               return NULL;
+       }
+
+       queue = array->queue + idx;
+       next = list_entry(queue->next, struct task_struct, run_list);
+       if (unlikely(next != rq->curr))
+               return next;
+
+       if (queue->next->next != queue) {
+               /* same prio task */
+               next = list_entry(queue->next->next, struct task_struct, 
run_list);
+               return next;
+       }
+
+       /* slower, but more flexible */
+       idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
+       if (unlikely(idx >= MAX_RT_PRIO)) {
+               WARN_ON(1); /* rt_nr_running was 2 and above! */
+               return NULL;
+       }
+
+       queue = array->queue + idx;
+       next = list_entry(queue->next, struct task_struct, run_list);
+
+       return next;
+}
+
+static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(struct task_struct *task,
+                                     struct rq *this_rq)
+{
+       struct rq *lowest_rq = NULL;
+       int cpu;
+       int tries;
+       cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
+
+       cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
+
+       for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+               /*
+                * Scan each rq for the lowest prio.
+                */
+               for_each_cpu_mask(cpu, *cpu_mask) {
+                       struct rq *rq = &per_cpu(runqueues, cpu);
+
+                       if (cpu == this_rq->cpu)
+                               continue;
+
+                       /* We look for lowest RT prio or non-rt CPU */
+                       if (rq->rt.highest_prio >= MAX_RT_PRIO) {
+                               lowest_rq = rq;
+                               break;
+                       }
+
+                       /* no locking for now */
+                       if (rq->rt.highest_prio > task->prio &&
+                           (!lowest_rq || rq->rt.highest_prio > 
lowest_rq->rt.highest_prio)) {
+                               lowest_rq = rq;
+                       }
+               }
+
+               if (!lowest_rq)
+                       break;
+
+               /* if the prio of this runqueue changed, try again */
+               if (double_lock_balance(this_rq, lowest_rq)) {
+                       /*
+                        * We had to unlock the run queue. In
+                        * the mean time, task could have
+                        * migrated already or had its affinity changed.
+                        * Also make sure that it wasn't scheduled on its rq.
+                        */
+                       if (unlikely(task_rq(task) != this_rq ||
+                                    !cpu_isset(lowest_rq->cpu, 
task->cpus_allowed) ||
+                                    task_running(this_rq, task) ||
+                                    !task->se.on_rq)) {
+                               spin_unlock(&lowest_rq->lock);
+                               lowest_rq = NULL;
+                               break;
+                       }
+               }
+
+               /* If this rq is still suitable use it. */
+               if (lowest_rq->rt.highest_prio > task->prio)
+                       break;
+
+               /* try again */
+               spin_unlock(&lowest_rq->lock);
+               lowest_rq = NULL;
+       }
+
+       return lowest_rq;
+}
+
+/*
+ * If the current CPU has more than one RT task, see if the non
+ * running task can migrate over to a CPU that is running a task
+ * of lesser priority.
+ */
+static int push_rt_task(struct rq *this_rq)
+{
+       struct task_struct *next_task;
+       struct rq *lowest_rq;
+       int ret = 0;
+       int paranoid = RT_MAX_TRIES;
+
+       assert_spin_locked(&this_rq->lock);
+
+       next_task = pick_next_highest_task_rt(this_rq);
+       if (!next_task)
+               return 0;
+
+ retry:
+       if (unlikely(next_task == this_rq->curr))
+               return 0;
+
+       /*
+        * It's possible that the next_task slipped in of
+        * higher priority than current. If that's the case
+        * just reschedule current.
+        */
+       if (unlikely(next_task->prio < this_rq->curr->prio)) {
+               resched_task(this_rq->curr);
+               return 0;
+       }
+
+       /* We might release this_rq lock */
+       get_task_struct(next_task);
+
+       /* find_lock_lowest_rq locks the rq if found */
+       lowest_rq = find_lock_lowest_rq(next_task, this_rq);
+       if (!lowest_rq) {
+               struct task_struct *task;
+               /*
+                * find lock_lowest_rq releases this_rq->lock
+                * so it is possible that next_task has changed.
+                * If it has, then try again.
+                */
+               task = pick_next_highest_task_rt(this_rq);
+               if (unlikely(task != next_task) && task && paranoid--) {
+                       put_task_struct(next_task);
+                       next_task = task;
+                       goto retry;
+               }
+               goto out;
+       }
+
+       assert_spin_locked(&lowest_rq->lock);
+
+       deactivate_task(this_rq, next_task, 0);
+       set_task_cpu(next_task, lowest_rq->cpu);
+       activate_task(lowest_rq, next_task, 0);
+
+       resched_task(lowest_rq->curr);
+
+       spin_unlock(&lowest_rq->lock);
+
+       ret = 1;
+out:
+       put_task_struct(next_task);
+
+       return ret;
+}
+
+/*
+ * TODO: Currently we just use the second highest prio task on
+ *       the queue, and stop when it can't migrate (or there's
+ *       no more RT tasks).  There may be a case where a lower
+ *       priority RT task has a different affinity than the
+ *       higher RT task. In this case the lower RT task could
+ *       possibly be able to migrate where as the higher priority
+ *       RT task could not.  We currently ignore this issue.
+ *       Enhancements are welcome!
+ */
+static void push_rt_tasks(struct rq *rq)
+{
+       /* push_rt_task will return true if it moved an RT */
+       while (push_rt_task(rq))
+               ;
+}
+
+static void schedule_tail_balance_rt(struct rq *rq)
+{
+       /*
+        * If we have more than one rt_task queued, then
+        * see if we can push the other rt_tasks off to other CPUS.
+        * Note we may release the rq lock, and since
+        * the lock was owned by prev, we need to release it
+        * first via finish_lock_switch and then reaquire it here.
+        */
+       if (unlikely(rq->rt.rt_nr_running > 1)) {
+               spin_lock_irq(&rq->lock);
+               push_rt_tasks(rq);
+               spin_unlock_irq(&rq->lock);
+       }
+}
+
 /*
  * Load-balancing iterator. Note: while the runqueue stays locked
  * during the whole iteration, the current task might be
@@ -241,7 +462,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct 
rq *busiest,
        return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
                                  &rt_rq_iterator);
 }
-#endif
+#else /* CONFIG_SMP */
+# define schedule_tail_balance_rt(rq)  do { } while (0)
+#endif /* CONFIG_SMP */
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {
-
To unsubscribe from this list: send the line "unsubscribe git-commits-head" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

sched: add RT task pushing

Reply via email to