The patch titled
CFS scheduler, -v16
has been added to the -mm tree. Its filename is
cfs-scheduler-v16.patch
*** Remember to use Documentation/SubmitChecklist when testing your code ***
See http://www.zip.com.au/~akpm/linux/patches/stuff/added-to-mm.txt to find
out what to do about this
------------------------------------------------------
Subject: CFS scheduler, -v16
From: Ingo Molnar <[EMAIL PROTECTED]>
-v16 includes smaller fixes. Dmitry Adamushko and Balbir Singh continued
the work on precise /proc CPU accounting of both SCHED_OTHER and RT
tasks. Reniced tasks should now disturb nice-0 tasks even less. Also, I
have changed SCHED_BATCH back to its current mainline meaning and have
added SCHED_IDLEPRIO instead (first introduced by Con Kolivas in
staircase/RSDL/SD).
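
As a quick illustration (not part of the patch): a userspace task could
opt into the new policy roughly as sketched below. The SCHED_IDLEPRIO
value is taken from the sched.h hunk further down and defined locally,
since libc headers do not know about it yet, and the call will of course
only succeed on a kernel carrying this patch.

#include <sched.h>
#include <stdio.h>

/* Not in libc headers yet; value taken from this patch's <linux/sched.h>. */
#ifndef SCHED_IDLEPRIO
#define SCHED_IDLEPRIO	5
#endif

int main(void)
{
	struct sched_param param = { .sched_priority = 0 };

	/* pid 0 == the calling task; idleprio tasks get minimal load weight */
	if (sched_setscheduler(0, SCHED_IDLEPRIO, &param) == -1) {
		perror("sched_setscheduler(SCHED_IDLEPRIO)");
		return 1;
	}

	/* ... low-importance background work goes here ... */
	return 0;
}
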
Changes since -v15:
- more /proc CPU stats accounting improvements (Dmitry Adamushko,
Balbir Singh)
- fix SCHED_BATCH (reported by Con Kolivas)
- update_load_fair() - use 64-bit arithmetic (Dmitry Adamushko)
- fix the RT->NORMAL accounting issue raised by Srivatsa Vaddagiri:
ensure correct exec_start stamping (Dmitry Adamushko)
- check for negative deltas in task_sched_runtime() (Dmitry Adamushko)
- check for large forward-jumping sched_clock()
- cleanup: remove task_struct :: last_ran (Dmitry Adamushko)
- /proc/sched_debug printk fixes (Andrew Morton)
- add SCHED_IDLEPRIO
- consolidate the granularity settings and make them scale together
(see the worked example after this list)
- improve /proc/sched_debug output
- remove the yield workarounds - the default seems to be working now.
- introduce lower and upper limits for the granularity tunables.
Setting them to zero accidentally broke nice levels.
- various small fixes/cleanups
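
To make the consolidated granularity scaling concrete (a worked example,
not part of the patch, assuming HZ=1000 so the default granularity is
2 msec): on an 8-CPU box sched_init_granularity() computes
factor = 1 + ilog2(8) = 4, scales the granularity to 8 msec (still below
the 10 msec upper bound) and then sets sched_runtime_limit to twice the
granularity, i.e. 16 msec. A minimal userspace sketch of the same
arithmetic:

#include <stdio.h>

/* mirrors this patch's sched_init_granularity(); 2 msec default assumes HZ=1000 */
static unsigned int ilog2_u(unsigned int n)
{
	unsigned int r = 0;

	while (n >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int cpus = 8;				/* example CPU count */
	unsigned int gran = 2000000;			/* 2 msec, in nanoseconds */
	const unsigned int gran_limit = 10000000;	/* 10 msec upper bound */
	unsigned int factor = 1 + ilog2_u(cpus);

	gran *= factor;
	if (gran > gran_limit)
		gran = gran_limit;

	printf("granularity: %u ns, runtime limit: %u ns\n", gran, 2 * gran);
	return 0;
}
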
Signed-off-by: Ingo Molnar <[EMAIL PROTECTED]>
Signed-off-by: Dmitry Adamushko <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
---
drivers/char/drm/radeon_cp.c | 5 -
fs/proc/array.c | 32 ++++--
include/linux/sched.h | 10 +-
kernel/posix-cpu-timers.c | 2
kernel/sched.c | 127 +++++++++++++++-----------
kernel/sched_debug.c | 38 +++++--
kernel/sched_fair.c | 160 +++++++++++++--------------------
kernel/sched_rt.c | 36 ++++++-
kernel/sysctl.c | 34 +++----
9 files changed, 247 insertions(+), 197 deletions(-)
diff -puN drivers/char/drm/radeon_cp.c~cfs-scheduler-v16 drivers/char/drm/radeon_cp.c
--- a/drivers/char/drm/radeon_cp.c~cfs-scheduler-v16
+++ a/drivers/char/drm/radeon_cp.c
@@ -2267,11 +2267,6 @@ int radeon_driver_load(struct drm_device
DRM_DEBUG("%s card detected\n",
((dev_priv->flags & RADEON_IS_AGP) ? "AGP" :
(((dev_priv->flags & RADEON_IS_PCIE) ? "PCIE" : "PCI"))));
- if (sysctl_sched_yield_bug_workaround == -1) {
- sysctl_sched_yield_bug_workaround = 1;
- printk(KERN_WARNING "quirk installed: turning on "
- "sys_sched_yield() workaround for Radeon DRM.\n");
- }
return ret;
}
diff -puN fs/proc/array.c~cfs-scheduler-v16 fs/proc/array.c
--- a/fs/proc/array.c~cfs-scheduler-v16
+++ a/fs/proc/array.c
@@ -172,8 +172,8 @@ static inline char * task_state(struct t
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n",
get_task_state(p),
- p->tgid, p->pid,
- pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
+ p->tgid, p->pid,
+ pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
p->uid, p->euid, p->suid, p->fsuid,
p->gid, p->egid, p->sgid, p->fsgid);
@@ -322,24 +322,38 @@ int proc_pid_status(struct task_struct *
static clock_t task_utime(struct task_struct *p)
{
+ clock_t utime = cputime_to_clock_t(p->utime),
+ total = utime + cputime_to_clock_t(p->stime);
+
/*
* Use CFS's precise accounting, if available:
*/
- if (!has_rt_policy(p) && !(sysctl_sched_load_smoothing & 128))
- return nsec_to_clock_t(p->sum_exec_runtime);
+ if (!(sysctl_sched_features & 128)) {
+ u64 temp = (u64)nsec_to_clock_t(p->sum_exec_runtime);
+
+ if (total) {
+ temp *= utime;
+ do_div(temp, total);
+ }
+ utime = (clock_t)temp;
+ }
- return cputime_to_clock_t(p->utime);
+ return utime;
}
static clock_t task_stime(struct task_struct *p)
{
+ clock_t stime = cputime_to_clock_t(p->stime);
+
/*
- * Use CFS's precise accounting, if available:
+ * Use CFS's precise accounting, if available (we subtract
+ * utime from the total, to make sure the total observed
+ * by userspace grows monotonically - apps rely on that):
*/
- if (!has_rt_policy(p) && !(sysctl_sched_load_smoothing & 128))
- return 0;
+ if (!(sysctl_sched_features & 128))
+ stime = nsec_to_clock_t(p->sum_exec_runtime) - task_utime(p);
- return cputime_to_clock_t(p->stime);
+ return stime;
}
diff -puN include/linux/sched.h~cfs-scheduler-v16 include/linux/sched.h
--- a/include/linux/sched.h~cfs-scheduler-v16
+++ a/include/linux/sched.h
@@ -34,6 +34,8 @@
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
+#define SCHED_ISO 4
+#define SCHED_IDLEPRIO 5
#ifdef __KERNEL__
@@ -876,7 +878,6 @@ struct task_struct {
u64 block_max;
u64 exec_max;
u64 wait_max;
- u64 last_ran;
s64 wait_runtime;
u64 sum_exec_runtime;
@@ -1265,7 +1266,7 @@ static inline int set_cpus_allowed(struc
extern unsigned long long sched_clock(void);
extern void sched_clock_unstable_event(void);
extern unsigned long long
-current_sched_runtime(const struct task_struct *current_task);
+task_sched_runtime(struct task_struct *task);
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
@@ -1284,11 +1285,10 @@ extern void sched_idle_next(void);
extern char * sched_print_task_state(struct task_struct *p, char *buffer);
extern unsigned int sysctl_sched_granularity;
-extern unsigned int sysctl_sched_wakeup_granularity;
+extern unsigned int sysctl_sched_batch_wakeup_granularity;
extern unsigned int sysctl_sched_runtime_limit;
extern unsigned int sysctl_sched_child_runs_first;
-extern unsigned int sysctl_sched_load_smoothing;
-extern int sysctl_sched_yield_bug_workaround;
+extern unsigned int sysctl_sched_features;
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
diff -puN kernel/posix-cpu-timers.c~cfs-scheduler-v16 kernel/posix-cpu-timers.c
--- a/kernel/posix-cpu-timers.c~cfs-scheduler-v16
+++ a/kernel/posix-cpu-timers.c
@@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struc
}
static inline unsigned long long sched_ns(struct task_struct *p)
{
- return (p == current) ? current_sched_runtime(p) : p->sum_exec_runtime;
+ return task_sched_runtime(p);
}
int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
diff -puN kernel/sched.c~cfs-scheduler-v16 kernel/sched.c
--- a/kernel/sched.c~cfs-scheduler-v16
+++ a/kernel/sched.c
@@ -154,12 +154,12 @@ struct rq {
u64 clock, prev_clock_raw;
s64 clock_max_delta;
- u64 fair_clock, prev_fair_clock;
- u64 exec_clock, prev_exec_clock;
+ u64 fair_clock, delta_fair_clock;
+ u64 exec_clock, delta_exec_clock;
s64 wait_runtime;
unsigned long wait_runtime_overruns, wait_runtime_underruns;
- unsigned int clock_warps;
+ unsigned int clock_warps, clock_overflows;
unsigned int clock_unstable_events;
struct sched_class *load_balance_class;
@@ -271,9 +271,17 @@ static inline unsigned long long __rq_cl
clock++;
rq->clock_warps++;
} else {
- if (unlikely(delta > rq->clock_max_delta))
- rq->clock_max_delta = delta;
- clock += delta;
+ /*
+ * Catch too large forward jumps too:
+ */
+ if (delta > 2*TICK_NSEC) {
+ clock++;
+ rq->clock_overflows++;
+ } else {
+ if (unlikely(delta > rq->clock_max_delta))
+ rq->clock_max_delta = delta;
+ clock += delta;
+ }
}
rq->prev_clock_raw = now;
@@ -613,9 +621,9 @@ static void set_load_weight(struct task_
return;
}
/*
- * SCHED_BATCH tasks get minimal weight:
+ * SCHED_IDLEPRIO tasks get minimal weight:
*/
- if (p->policy == SCHED_BATCH) {
+ if (p->policy == SCHED_IDLEPRIO) {
p->load_weight = 1;
return;
}
@@ -1275,7 +1283,7 @@ static void task_running_tick(struct rq
*/
static void __sched_fork(struct task_struct *p)
{
- p->wait_start_fair = p->wait_start = p->exec_start = p->last_ran = 0;
+ p->wait_start_fair = p->wait_start = p->exec_start = 0;
p->sum_exec_runtime = 0;
p->wait_runtime = 0;
@@ -1579,37 +1587,34 @@ unsigned long nr_active(void)
static void update_load_fair(struct rq *this_rq)
{
unsigned long this_load, fair_delta, exec_delta, idle_delta;
+ u64 fair_delta64, exec_delta64, tmp64;
unsigned int i, scale;
- s64 fair_delta64, exec_delta64;
- unsigned long tmp;
- u64 tmp64;
this_rq->nr_load_updates++;
- if (!(sysctl_sched_load_smoothing & 64)) {
+ if (!(sysctl_sched_features & 64)) {
this_load = this_rq->raw_weighted_load;
goto do_avg;
}
- fair_delta64 = this_rq->fair_clock - this_rq->prev_fair_clock + 1;
- this_rq->prev_fair_clock = this_rq->fair_clock;
+ fair_delta64 = this_rq->delta_fair_clock + 1;
+ this_rq->delta_fair_clock = 0;
- exec_delta64 = this_rq->exec_clock - this_rq->prev_exec_clock + 1;
- this_rq->prev_exec_clock = this_rq->exec_clock;
+ exec_delta64 = this_rq->delta_exec_clock + 1;
+ this_rq->delta_exec_clock = 0;
- if (fair_delta64 > (s64)LONG_MAX)
- fair_delta64 = (s64)LONG_MAX;
+ if (fair_delta64 > (u64)LONG_MAX)
+ fair_delta64 = (u64)LONG_MAX;
fair_delta = (unsigned long)fair_delta64;
- if (exec_delta64 > (s64)LONG_MAX)
- exec_delta64 = (s64)LONG_MAX;
+ if (exec_delta64 > (u64)TICK_NSEC)
+ exec_delta64 = (u64)TICK_NSEC;
exec_delta = (unsigned long)exec_delta64;
- if (exec_delta > TICK_NSEC)
- exec_delta = TICK_NSEC;
idle_delta = TICK_NSEC - exec_delta;
- tmp = (SCHED_LOAD_SCALE * exec_delta) / fair_delta;
- tmp64 = (u64)tmp * (u64)exec_delta;
+ tmp64 = SCHED_LOAD_SCALE * exec_delta64;
+ do_div(tmp64, fair_delta);
+ tmp64 *= exec_delta64;
do_div(tmp64, TICK_NSEC);
this_load = (unsigned long)tmp64;
@@ -2821,17 +2826,23 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
EXPORT_PER_CPU_SYMBOL(kstat);
/*
- * Return current->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked.
+ * Return p->sum_exec_runtime plus any more ns on the sched_clock
+ * that have not yet been banked in case the task is currently running.
*/
-unsigned long long current_sched_runtime(const struct task_struct *p)
+unsigned long long task_sched_runtime(struct task_struct *p)
{
- unsigned long long ns;
unsigned long flags;
+ u64 ns, delta_exec;
+ struct rq *rq;
- local_irq_save(flags);
- ns = p->sum_exec_runtime + sched_clock() - p->last_ran;
- local_irq_restore(flags);
+ rq = task_rq_lock(p, &flags);
+ ns = p->sum_exec_runtime;
+ if (rq->curr == p) {
+ delta_exec = rq_clock(rq) - p->exec_start;
+ if ((s64)delta_exec > 0)
+ ns += delta_exec;
+ }
+ task_rq_unlock(rq, &flags);
return ns;
}
@@ -3565,7 +3576,7 @@ void set_user_nice(struct task_struct *p
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
* it wont have any effect on scheduling until the task is
- * not SCHED_NORMAL/SCHED_BATCH:
+ * SCHED_FIFO/SCHED_RR:
*/
if (has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
@@ -3714,6 +3725,7 @@ __setscheduler(struct rq *rq, struct tas
switch (p->policy) {
case SCHED_NORMAL:
case SCHED_BATCH:
+ case SCHED_IDLEPRIO:
p->sched_class = &fair_sched_class;
break;
case SCHED_FIFO:
@@ -3751,12 +3763,13 @@ recheck:
if (policy < 0)
policy = oldpolicy = p->policy;
else if (policy != SCHED_FIFO && policy != SCHED_RR &&
- policy != SCHED_NORMAL && policy != SCHED_BATCH)
+ policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+ policy != SCHED_IDLEPRIO)
return -EINVAL;
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
- * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
- * SCHED_BATCH is 0.
+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+ * SCHED_BATCH and SCHED_IDLEPRIO is 0.
*/
if (param->sched_priority < 0 ||
(p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
@@ -4310,6 +4323,7 @@ asmlinkage long sys_sched_get_priority_m
break;
case SCHED_NORMAL:
case SCHED_BATCH:
+ case SCHED_IDLEPRIO:
ret = 0;
break;
}
@@ -4334,6 +4348,7 @@ asmlinkage long sys_sched_get_priority_m
break;
case SCHED_NORMAL:
case SCHED_BATCH:
+ case SCHED_IDLEPRIO:
ret = 0;
}
return ret;
@@ -4496,6 +4511,29 @@ void __cpuinit init_idle(struct task_str
*/
cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+/*
+ * Increase the granularity value when there are more CPUs,
+ * because with more CPUs the 'effective latency' as visible
+ * to users decreases. But the relationship is not linear,
+ * so pick a second-best guess by going with the log2 of the
+ * number of CPUs.
+ *
+ * This idea comes from the SD scheduler of Con Kolivas:
+ */
+static inline void sched_init_granularity(void)
+{
+ unsigned int factor = 1 + ilog2(num_online_cpus());
+ const unsigned long gran_limit = 10000000;
+
+ sysctl_sched_granularity *= factor;
+ sysctl_sched_runtime_limit *= factor;
+
+ if (sysctl_sched_granularity > gran_limit)
+ sysctl_sched_granularity = gran_limit;
+
+ sysctl_sched_runtime_limit = sysctl_sched_granularity * 2;
+}
+
#ifdef CONFIG_SMP
/*
* This is how migration works:
@@ -5900,25 +5938,12 @@ void __init sched_init_smp(void)
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed(current, non_isolated_cpus) < 0)
BUG();
- /*
- * Increase the granularity value when there are more CPUs,
- * because with more CPUs the 'effective latency' as visible
- * to users decreases. But the relationship is not linear,
- * so pick a second-best guess by going with the log2 of the
- * number of CPUs.
- *
- * This idea comes from the SD scheduler of Con Kolivas:
- */
- {
- unsigned int factor = 1 + ilog2(num_online_cpus());
-
- sysctl_sched_granularity *= factor;
- sysctl_sched_runtime_limit *= factor;
- }
+ sched_init_granularity();
}
#else
void __init sched_init_smp(void)
{
+ sched_init_granularity();
}
#endif /* CONFIG_SMP */
diff -puN kernel/sched_debug.c~cfs-scheduler-v16 kernel/sched_debug.c
--- a/kernel/sched_debug.c~cfs-scheduler-v16
+++ a/kernel/sched_debug.c
@@ -54,8 +54,7 @@ print_task(struct seq_file *m, struct rq
static void print_rq(struct seq_file *m, struct rq *rq, u64 now)
{
- struct task_struct *p;
- struct rb_node *curr;
+ struct task_struct *g, *p;
SEQ_printf(m,
"\nrunnable tasks:\n"
@@ -68,13 +67,16 @@ static void print_rq(struct seq_file *m,
"------------------------------------------------"
"--------------------------------\n");
- curr = first_fair(rq);
- while (curr) {
- p = rb_entry(curr, struct task_struct, run_node);
+ read_lock_irq(&tasklist_lock);
+
+ do_each_thread(g, p) {
+ if (!p->on_rq)
+ continue;
+
print_task(m, rq, p, now);
+ } while_each_thread(g, p);
- curr = rb_next(curr);
- }
+ read_unlock_irq(&tasklist_lock);
}
static void print_rq_runtime_sum(struct seq_file *m, struct rq *rq)
@@ -117,13 +119,13 @@ static void print_cpu(struct seq_file *m
P(clock);
P(prev_clock_raw);
P(clock_warps);
+ P(clock_overflows);
P(clock_unstable_events);
P(clock_max_delta);
- rq->clock_max_delta = 0;
P(fair_clock);
- P(prev_fair_clock);
+ P(delta_fair_clock);
P(exec_clock);
- P(prev_exec_clock);
+ P(delta_exec_clock);
P(wait_runtime);
P(wait_runtime_overruns);
P(wait_runtime_underruns);
@@ -188,6 +190,18 @@ __initcall(init_sched_debug_procfs);
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
+ unsigned long flags;
+ int num_threads = 1;
+
+ rcu_read_lock();
+ if (lock_task_sighand(p, &flags)) {
+ num_threads = atomic_read(&p->signal->count);
+ unlock_task_sighand(p, &flags);
+ }
+ rcu_read_unlock();
+
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
+ SEQ_printf(m, "----------------------------------------------\n");
#define P(F) \
SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
@@ -201,11 +215,13 @@ void proc_sched_show_task(struct task_st
P(block_max);
P(exec_max);
P(wait_max);
- P(last_ran);
P(wait_runtime);
P(wait_runtime_overruns);
P(wait_runtime_underruns);
P(sum_exec_runtime);
+ P(load_weight);
+ P(policy);
+ P(prio);
#undef P
{
diff -puN kernel/sched_fair.c~cfs-scheduler-v16 kernel/sched_fair.c
--- a/kernel/sched_fair.c~cfs-scheduler-v16
+++ a/kernel/sched_fair.c
@@ -1,5 +1,10 @@
/*
* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <[EMAIL PROTECTED]>
+ *
+ * Cleanups and fixes by Dmitry Adamushko.
+ * (C) 2007 Dmitry Adamushko <[EMAIL PROTECTED]>
*/
/*
@@ -16,33 +21,24 @@
* number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
* systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
*/
-unsigned int sysctl_sched_granularity __read_mostly = 3000000000ULL/HZ;
+unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ;
/*
- * Wake-up granularity.
- * (default: 0, units: nanoseconds)
+ * SCHED_BATCH wake-up granularity.
+ * (default: 1 msec, units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*/
-unsigned int sysctl_sched_wakeup_granularity __read_mostly = 0;
-
-unsigned int sysctl_sched_runtime_limit __read_mostly = 6000000000ULL/HZ;
-
-unsigned int sysctl_sched_load_smoothing __read_mostly = 1 | 2 | 4 | 8 | 0;
-
+unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
+ 1000000000ULL/HZ;
/*
- * sys_sched_yield unfairness bug workaround switch.
- * (default: -1:auto-detect+disabled. Other values: 0:disabled, 1:enabled)
- *
- * This option switches the unfair yield implementation of the
- * old scheduler back on. Needed for good performance of certain
- * apps like 3D games on Radeon cards.
+ * Initialized in sched_init_granularity():
*/
-int sysctl_sched_yield_bug_workaround __read_mostly = 1;
+unsigned int sysctl_sched_runtime_limit __read_mostly;
-EXPORT_SYMBOL_GPL(sysctl_sched_yield_bug_workaround);
+unsigned int sysctl_sched_features __read_mostly = 1 | 2 | 4 | 8 | 0 | 0;
extern struct sched_class fair_sched_class;
@@ -193,14 +189,14 @@ static inline void update_curr(struct rq
u64 delta_exec, delta_fair, delta_mine;
struct task_struct *curr = rq->curr;
- if (curr->sched_class != &fair_sched_class || curr == rq->idle)
+ if (curr->sched_class != &fair_sched_class || curr == rq->idle || !load)
return;
/*
* Get the amount of time the current task was running
* since the last time we changed raw_weighted_load:
*/
delta_exec = now - curr->exec_start;
- if (unlikely(delta_exec < 0))
+ if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
if (unlikely(delta_exec > curr->exec_max))
curr->exec_max = delta_exec;
@@ -209,22 +205,24 @@ static inline void update_curr(struct rq
curr->exec_start = now;
rq->exec_clock += delta_exec;
- if (!load)
- return;
+ delta_fair = delta_exec * NICE_0_LOAD;
+ delta_fair += load >> 1; /* rounding */
+ do_div(delta_fair, load);
+
+ /* Load-balancing accounting. */
+ rq->delta_fair_clock += delta_fair;
+ rq->delta_exec_clock += delta_exec;
+
/*
* Task already marked for preemption, do not burden
* it with the cost of not having left the CPU yet:
*/
- if (unlikely(sysctl_sched_load_smoothing & 1))
+ if (unlikely(sysctl_sched_features & 1))
if (unlikely(test_tsk_thread_flag(curr, TIF_NEED_RESCHED)))
return;
- delta_fair = delta_exec * NICE_0_LOAD;
- delta_fair += load >> 1;
- do_div(delta_fair, load);
-
delta_mine = delta_exec * curr->load_weight;
- delta_mine += load >> 1;
+ delta_mine += load >> 1; /* rounding */
do_div(delta_mine, load);
rq->fair_clock += delta_fair;
@@ -352,7 +350,7 @@ static void distribute_fair_add(struct r
struct task_struct *curr = rq->curr;
s64 delta_fair = 0;
- if (!(sysctl_sched_load_smoothing & 2))
+ if (!(sysctl_sched_features & 2))
return;
if (rq->nr_running) {
@@ -361,7 +359,8 @@ static void distribute_fair_add(struct r
* The currently running task's next wait_runtime value does
* not depend on the fair_clock, so fix it up explicitly:
*/
- add_wait_runtime(rq, curr, -delta_fair);
+ if (curr->sched_class == &fair_sched_class)
+ add_wait_runtime(rq, curr, -delta_fair);
}
rq->fair_clock -= delta_fair;
}
@@ -375,7 +374,7 @@ static void enqueue_sleeper(struct rq *r
unsigned long load = rq->raw_weighted_load;
s64 delta_fair, prev_runtime;
- if (!(sysctl_sched_load_smoothing & 4))
+ if (p->policy == SCHED_BATCH || !(sysctl_sched_features & 4))
goto out;
delta_fair = rq->fair_clock - p->sleep_start_fair;
@@ -384,7 +383,9 @@ static void enqueue_sleeper(struct rq *r
* Fix up delta_fair with the effect of us running
* during the whole sleep period:
*/
- delta_fair = div64_s(delta_fair * load, load + p->load_weight);
+ if (!(sysctl_sched_features & 32))
+ delta_fair = div64_s(delta_fair * load, load + p->load_weight);
+ delta_fair = div64_s(delta_fair * p->load_weight, NICE_0_LOAD);
prev_runtime = p->wait_runtime;
__add_wait_runtime(rq, p, delta_fair);
@@ -476,85 +477,39 @@ dequeue_task_fair(struct rq *rq, struct
static void
yield_task_fair(struct rq *rq, struct task_struct *p, struct task_struct *p_to)
{
- struct rb_node *curr, *next, *first;
struct task_struct *p_next;
- s64 yield_key;
u64 now;
+ now = __rq_clock(rq);
/*
- * Bug workaround for 3D apps running on the radeon 3D driver:
+ * Dequeue and enqueue the task to update its
+ * position within the tree:
*/
- if (unlikely(sysctl_sched_yield_bug_workaround > 0)) {
- if (sysctl_sched_yield_bug_workaround == 2) {
- resched_task(p);
- return;
- }
- now = __rq_clock(rq);
- /*
- * Dequeue and enqueue the task to update its
- * position within the tree:
- */
- dequeue_task_fair(rq, p, 0, now);
- p->on_rq = 0;
- enqueue_task_fair(rq, p, 0, now);
- p->on_rq = 1;
-
- /*
- * Reschedule if another task tops the current one.
- */
- p_next = __pick_next_task_fair(rq);
- if (p_next != p)
- resched_task(p);
- return;
- }
+ dequeue_task_fair(rq, p, 0, now);
+ p->on_rq = 0;
+ enqueue_task_fair(rq, p, 0, now);
+ p->on_rq = 1;
/*
* yield-to support: if we are on the same runqueue then
* give half of our wait_runtime (if it's positive) to the other task:
*/
- if (p_to && rq == task_rq(p_to) && p->wait_runtime > 0) {
+ if (p_to && rq == task_rq(p_to) &&
+ p_to->sched_class == &fair_sched_class
+ && p->wait_runtime > 0) {
+
s64 delta = p->wait_runtime >> 1;
__add_wait_runtime(rq, p_to, delta);
__add_wait_runtime(rq, p, -delta);
}
- curr = &p->run_node;
- first = first_fair(rq);
- /*
- * Move this task to the second place in the tree:
- */
- if (unlikely(curr != first)) {
- next = first;
- } else {
- next = rb_next(curr);
- /*
- * We were the last one already - nothing to do, return
- * and reschedule:
- */
- if (unlikely(!next))
- return;
- }
-
- p_next = rb_entry(next, struct task_struct, run_node);
/*
- * Minimally necessary key value to be the second in the tree:
- */
- yield_key = p_next->fair_key + 1;
-
- now = __rq_clock(rq);
- dequeue_task_fair(rq, p, 0, now);
- p->on_rq = 0;
-
- /*
- * Only update the key if we need to move more backwards
- * than the minimally necessary position to be the second:
+ * Reschedule if another task tops the current one.
*/
- if (p->fair_key < yield_key)
- p->fair_key = yield_key;
-
- __enqueue_task_fair(rq, p);
- p->on_rq = 1;
+ p_next = __pick_next_task_fair(rq);
+ if (p_next != p)
+ resched_task(p);
}
/*
@@ -581,16 +536,23 @@ __check_preempt_curr_fair(struct rq *rq,
static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
{
struct task_struct *curr = rq->curr;
+ unsigned long granularity;
if ((curr == rq->idle) || rt_prio(p->prio)) {
- if (sysctl_sched_load_smoothing & 8) {
+ if (sysctl_sched_features & 8) {
if (rt_prio(p->prio))
update_curr(rq, rq_clock(rq));
}
resched_task(curr);
} else {
- __check_preempt_curr_fair(rq, p, curr,
- sysctl_sched_wakeup_granularity);
+ /*
+ * Batch tasks prefer throughput over latency:
+ */
+ granularity = 0;
+ if (unlikely(p->policy == SCHED_BATCH))
+ granularity = sysctl_sched_batch_wakeup_granularity;
+
+ __check_preempt_curr_fair(rq, p, curr, granularity);
}
}
@@ -624,7 +586,7 @@ static void put_prev_task_fair(struct rq
* preempted), update its position within the tree and
* start the wait period:
*/
- if (sysctl_sched_load_smoothing & 16) {
+ if (sysctl_sched_features & 16) {
if (prev->on_rq &&
test_tsk_thread_flag(prev, TIF_NEED_RESCHED)) {
@@ -735,6 +697,12 @@ static void task_new_fair(struct rq *rq,
*/
p->wait_start_fair = 0;
+ /*
+ * The statistical average of wait_runtime is about
+ * -granularity/2, so initialize the task with that:
+ */
+// p->wait_runtime = -(s64)(sysctl_sched_granularity / 2);
+
__enqueue_task_fair(rq, p);
p->on_rq = 1;
inc_nr_running(p, rq);
diff -puN kernel/sched_rt.c~cfs-scheduler-v16 kernel/sched_rt.c
--- a/kernel/sched_rt.c~cfs-scheduler-v16
+++ a/kernel/sched_rt.c
@@ -3,6 +3,28 @@
* policies)
*/
+/*
+ * Update the current task's runtime statistics. Skip current tasks that
+ * are not in our scheduling class.
+ */
+static inline void update_curr_rt(struct rq *rq, u64 now)
+{
+ struct task_struct *curr = rq->curr;
+ u64 delta_exec;
+
+ if (!has_rt_policy(curr))
+ return;
+
+ delta_exec = now - curr->exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+ if (unlikely(delta_exec > curr->exec_max))
+ curr->exec_max = delta_exec;
+
+ curr->sum_exec_runtime += delta_exec;
+ curr->exec_start = now;
+}
+
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
{
@@ -20,6 +42,8 @@ dequeue_task_rt(struct rq *rq, struct ta
{
struct prio_array *array = &rq->active;
+ update_curr_rt(rq, now);
+
list_del(&p->run_list);
if (list_empty(array->queue + p->prio))
__clear_bit(p->prio, array->bitmap);
@@ -54,6 +78,7 @@ static void check_preempt_curr_rt(struct
static struct task_struct * pick_next_task_rt(struct rq *rq, u64 now)
{
struct prio_array *array = &rq->active;
+ struct task_struct *next;
struct list_head *queue;
int idx;
@@ -62,14 +87,17 @@ static struct task_struct * pick_next_ta
return NULL;
queue = array->queue + idx;
- return list_entry(queue->next, struct task_struct, run_list);
+ next = list_entry(queue->next, struct task_struct, run_list);
+
+ next->exec_start = now;
+
+ return next;
}
-/*
- * No accounting done when RT tasks are descheduled:
- */
static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now)
{
+ update_curr_rt(rq, now);
+ p->exec_start = 0;
}
/*
diff -puN kernel/sysctl.c~cfs-scheduler-v16 kernel/sysctl.c
--- a/kernel/sysctl.c~cfs-scheduler-v16
+++ a/kernel/sysctl.c
@@ -207,6 +207,9 @@ static ctl_table root_table[] = {
{ .ctl_name = 0 }
};
+static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */
+static unsigned long max_sched_granularity_ns = 1000000000; /* 1 second */
+
static ctl_table kern_table[] = {
{
.ctl_name = CTL_UNNUMBERED,
@@ -214,15 +217,21 @@ static ctl_table kern_table[] = {
.data = &sysctl_sched_granularity,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_sched_granularity_ns,
+ .extra2 = &max_sched_granularity_ns,
},
{
.ctl_name = CTL_UNNUMBERED,
- .procname = "sched_wakeup_granularity_ns",
- .data = &sysctl_sched_wakeup_granularity,
+ .procname = "sched_batch_wakeup_granularity_ns",
+ .data = &sysctl_sched_batch_wakeup_granularity,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_sched_granularity_ns,
+ .extra2 = &max_sched_granularity_ns,
},
{
.ctl_name = CTL_UNNUMBERED,
@@ -230,7 +239,10 @@ static ctl_table kern_table[] = {
.data = &sysctl_sched_runtime_limit,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_sched_granularity_ns,
+ .extra2 = &max_sched_granularity_ns,
},
{
.ctl_name = CTL_UNNUMBERED,
@@ -242,16 +254,8 @@ static ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
- .procname = "sched_load_smoothing",
- .data = &sysctl_sched_load_smoothing,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = CTL_UNNUMBERED,
- .procname = "sched_yield_bug_workaround",
- .data = &sysctl_sched_yield_bug_workaround,
+ .procname = "sched_features",
+ .data = &sysctl_sched_features,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
_
Patches currently in -mm which might be from [EMAIL PROTECTED] are
rt-mutex-fix-stale-return-value.patch
rt-mutex-fix-chain-walk-early-wakeup-bug.patch
pi-futex-fix-exit-races-and-locking-problems.patch
git-acpi-add-exports.patch
git-kvm.patch
git-selinux.patch
x86_64-irq-check-remote-irr-bit-before-migrating-level-triggered-irq-v3.patch
only-allow-nonlinear-vmas-for-ram-backed-filesystems.patch
cpuset-remove-sched-domain-hooks-from-cpusets.patch
introduce-write_trylock_irqsave.patch
use-write_trylock_irqsave-in-ptrace_attach.patch
fix-stop_machine_run-problem-with-naughty-real-time-process.patch
cpu-hotplug-fix-ksoftirqd-termination-on-cpu-hotplug-with-naughty-realtime-process.patch
cpu-hotplug-fix-ksoftirqd-termination-on-cpu-hotplug-with-naughty-realtime-process-fix.patch
pie-randomization.patch
vdso-print-fatal-signals.patch
remove-clockevents_releaserequest_device.patch
add-a-flag-to-indicate-deferrable-timers-in-proc-timer_stats.patch
introduce-o_cloexec-take-2.patch
introduce-o_cloexec-parisc-fix.patch
o_cloexec-for-scm_rights.patch
o_cloexec-for-scm_rights-fix.patch
o_cloexec-for-scm_rights-fix-2.patch
futex-tidy-up-the-code.patch
improve-behaviour-of-spurious-irq-detect.patch
improve-behaviour-of-spurious-irq-detect-fix.patch
lock-debugging-loop-nicer-in-mark_rt_mutex_waiters.patch
cfs-scheduler.patch
cfs-scheduler-vs-detach-schedh-from-mmh.patch
cfs-scheduler-v14-rc2-mm1.patch
cfs-scheduler-warning-fixes.patch
cfs-scheduler-v15-rc3-mm1.patch
fs-proc-basec-make-a-struct-static.patch
cfs-warning-fixes.patch
schedstats-fix-printk-format.patch
cfs-scheduler-v16.patch
sched-add-above-background-load-function.patch
mm-implement-swap-prefetching.patch
fix-raw_spinlock_t-vs-lockdep.patch
lockdep-sanitise-config_prove_locking.patch
lockdep-reduce-the-ifdeffery.patch
lockstat-core-infrastructure.patch
lockstat-core-infrastructure-fix.patch
lockstat-core-infrastructure-fix-fix.patch
lockstat-core-infrastructure-fix-fix-fix.patch
lockstat-human-readability-tweaks.patch
lockstat-hook-into-spinlock_t-rwlock_t-rwsem-and-mutex.patch
detect-atomic-counter-underflows.patch
make-frame_pointer-default=y.patch
mutex-subsystem-synchro-test-module.patch
lockdep-show-held-locks-when-showing-a-stackdump.patch
kmap_atomic-debugging.patch
random-warning-squishes.patch
-
To unsubscribe from this list: send the line "unsubscribe mm-commits" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html