Re: [GIT PULL] scheduler fix

2020-12-27 Thread pr-tracker-bot
The pull request you sent on Sun, 27 Dec 2020 10:16:01 +0100:

> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
> sched-urgent-2020-12-27

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/3b80dee70eaa5f9a120db058c30cc8e63c443571

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html


[GIT PULL] scheduler fix

2020-12-27 Thread Ingo Molnar
Linus,

Please pull the latest sched/urgent git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-2020-12-27

   # HEAD: ae7927023243dcc7389b2d59b16c09cbbeaecc36 sched: Optimize 
finish_lock_switch()

Fix a context switch performance regression.

 Thanks,

Ingo

-->
Peter Zijlstra (1):
  sched: Optimize finish_lock_switch()


 kernel/sched/core.c  | 40 +++-
 kernel/sched/sched.h | 13 +
 2 files changed, 20 insertions(+), 33 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7af80c3fce12..0ca7d2dc16d5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3985,15 +3985,20 @@ static void do_balance_callbacks(struct rq *rq, struct 
callback_head *head)
}
 }
 
+static void balance_push(struct rq *rq);
+
+struct callback_head balance_push_callback = {
+   .next = NULL,
+   .func = (void (*)(struct callback_head *))balance_push,
+};
+
 static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
 {
struct callback_head *head = rq->balance_callback;
 
lockdep_assert_held(&rq->lock);
-   if (head) {
+   if (head)
rq->balance_callback = NULL;
-   rq->balance_flags &= ~BALANCE_WORK;
-   }
 
return head;
 }
@@ -4014,21 +4019,6 @@ static inline void balance_callbacks(struct rq *rq, 
struct callback_head *head)
}
 }
 
-static void balance_push(struct rq *rq);
-
-static inline void balance_switch(struct rq *rq)
-{
-   if (likely(!rq->balance_flags))
-   return;
-
-   if (rq->balance_flags & BALANCE_PUSH) {
-   balance_push(rq);
-   return;
-   }
-
-   __balance_callbacks(rq);
-}
-
 #else
 
 static inline void __balance_callbacks(struct rq *rq)
@@ -4044,10 +4034,6 @@ static inline void balance_callbacks(struct rq *rq, 
struct callback_head *head)
 {
 }
 
-static inline void balance_switch(struct rq *rq)
-{
-}
-
 #endif
 
 static inline void
@@ -4075,7 +4061,7 @@ static inline void finish_lock_switch(struct rq *rq)
 * prev into current:
 */
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-   balance_switch(rq);
+   __balance_callbacks(rq);
raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -7256,6 +7242,10 @@ static void balance_push(struct rq *rq)
 
lockdep_assert_held(&rq->lock);
SCHED_WARN_ON(rq->cpu != smp_processor_id());
+   /*
+* Ensure the thing is persistent until balance_push_set(.on = false);
+*/
+   rq->balance_callback = &balance_push_callback;
 
/*
 * Both the cpu-hotplug and stop task are in this case and are
@@ -7305,9 +7295,9 @@ static void balance_push_set(int cpu, bool on)
 
rq_lock_irqsave(rq, &rf);
if (on)
-   rq->balance_flags |= BALANCE_PUSH;
+   rq->balance_callback = &balance_push_callback;
else
-   rq->balance_flags &= ~BALANCE_PUSH;
+   rq->balance_callback = NULL;
rq_unlock_irqrestore(rq, &rf);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f5acb6c5ce49..12ada79d40f3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -975,7 +975,6 @@ struct rq {
unsigned long   cpu_capacity_orig;
 
struct callback_head*balance_callback;
-   unsigned char   balance_flags;
 
unsigned char   nohz_idle_balance;
unsigned char   idle_balance;
@@ -1226,6 +1225,8 @@ struct rq_flags {
 #endif
 };
 
+extern struct callback_head balance_push_callback;
+
 /*
  * Lockdep annotation that avoids accidental unlocks; it's like a
  * sticky/continuous lockdep_assert_held().
@@ -1243,9 +1244,9 @@ static inline void rq_pin_lock(struct rq *rq, struct 
rq_flags *rf)
 #ifdef CONFIG_SCHED_DEBUG
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
-#endif
 #ifdef CONFIG_SMP
-   SCHED_WARN_ON(rq->balance_callback);
+   SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback);
+#endif
 #endif
 }
 
@@ -1408,9 +1409,6 @@ init_numa_balancing(unsigned long clone_flags, struct 
task_struct *p)
 
 #ifdef CONFIG_SMP
 
-#define BALANCE_WORK   0x01
-#define BALANCE_PUSH   0x02
-
 static inline void
 queue_balance_callback(struct rq *rq,
   struct callback_head *head,
@@ -1418,13 +1416,12 @@ queue_balance_callback(struct rq *rq,
 {
lockdep_assert_held(&rq->lock);
 
-   if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
+   if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
return;
 
head->func = (void (*)(struct callback_head *))func;
head->next = rq->balance_callback;
rq->balance_callback = head;
-   rq->balance_flags |= BALANCE_WORK;
 }
 
 #define rcu_dereference_check_sched_domain(p) \
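
The optimization above drops the BALANCE_WORK/BALANCE_PUSH flag bits and instead encodes "push mode" in rq->balance_callback itself, by installing a statically allocated sentinel callback; finish_lock_switch() then only has to test a single cache-hot pointer. A minimal userspace sketch of that sentinel-pointer idea follows; the struct and function names are illustrative stand-ins, not the kernel API.

#include <stdio.h>

struct callback_head {
        struct callback_head *next;
        void (*func)(struct callback_head *);
};

static void push_work(struct callback_head *h)
{
        (void)h;
        puts("push pending tasks away from this CPU");
}

/* Static sentinel: its address doubles as the "push mode" flag. */
static struct callback_head push_sentinel = {
        .next = NULL,
        .func = push_work,
};

struct runqueue {
        struct callback_head *balance_callback;
};

static void run_balance_callbacks(struct runqueue *rq)
{
        struct callback_head *head = rq->balance_callback;

        if (!head)                      /* common case: one NULL test */
                return;

        if (head == &push_sentinel) {   /* sentinel stays installed */
                head->func(head);
                return;
        }

        rq->balance_callback = NULL;    /* splice and run queued callbacks */
        while (head) {
                struct callback_head *next = head->next;

                head->func(head);
                head = next;
        }
}

int main(void)
{
        struct runqueue rq = { .balance_callback = &push_sentinel };

        run_balance_callbacks(&rq);     /* push mode */
        rq.balance_callback = NULL;
        run_balance_callbacks(&rq);     /* fast path */
        return 0;
}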


Re: [GIT PULL] scheduler fix

2019-07-14 Thread pr-tracker-bot
The pull request you sent on Sun, 14 Jul 2019 12:19:10 +0200:

> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
> sched-urgent-for-linus

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/50ec18819cade37914ffc71a8b0a2783c345

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker


[GIT PULL] scheduler fix

2019-07-14 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: e3d85487fba42206024bc3ed32e4b581c7cb46db sched/core: Fix preempt 
warning in ttwu

Fix a sched statistics related bug that would trigger a kernel warning on 
certain configs.

 Thanks,

Ingo

-->
Peter Zijlstra (1):
  sched/core: Fix preempt warning in ttwu


 kernel/sched/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa43ce3962e7..2b037f195473 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2399,6 +2399,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, 
int wake_flags)
unsigned long flags;
int cpu, success = 0;
 
+   preempt_disable();
if (p == current) {
/*
 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -2412,7 +2413,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, 
int wake_flags)
 *it disabling IRQs (this allows not taking ->pi_lock).
 */
if (!(p->state & state))
-   return false;
+   goto out;
 
success = 1;
cpu = task_cpu(p);
@@ -2526,6 +2527,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, 
int wake_flags)
 out:
if (success)
ttwu_stat(p, cpu, wake_flags);
+   preempt_enable();
 
return success;
 }
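
The fix wraps the whole of try_to_wake_up(), including the early 'p == current' exit, in a preempt_disable()/preempt_enable() pair so ttwu_stat() always runs with preemption off; note how the early 'return false' becomes 'goto out' so the matching enable is never skipped. A toy userspace sketch of that structure, with a plain counter standing in for the real preemption machinery (illustrative only):

#include <stdio.h>

static int preempt_count;               /* toy stand-in for the preempt counter */

static void preempt_disable(void) { preempt_count++; }
static void preempt_enable(void)  { preempt_count--; }

/* Every exit path funnels through "out" so disable/enable stay balanced. */
static int wake_up_task(int task_ready)
{
        int success = 0;

        preempt_disable();
        if (!task_ready)
                goto out;       /* a bare "return 0" here would leak the disable */

        success = 1;
        /* ... the actual wakeup work would go here ... */
out:
        if (success)
                printf("account wakeup stats, preempt_count=%d\n", preempt_count);
        preempt_enable();
        return success;
}

int main(void)
{
        wake_up_task(0);
        wake_up_task(1);
        printf("balanced afterwards: preempt_count=%d\n", preempt_count);
        return 0;
}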


Re: [GIT PULL] scheduler fix

2019-05-05 Thread pr-tracker-bot
The pull request you sent on Sun, 5 May 2019 13:02:37 +0200:

> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
> sched-urgent-for-linus

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/70c9fb570b7c1c3edb03cbe745cf81ceeef5d484

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker


[GIT PULL] scheduler fix

2019-05-05 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: 9a4f26cc98d81b67ecc23b890c28e2df324e29f3 sched/cpufreq: Fix kobject 
memleak

Fix a kobject memory leak in the cpufreq code.

 Thanks,

Ingo

-->
Tobin C. Harding (1):
  sched/cpufreq: Fix kobject memleak


 kernel/sched/cpufreq_schedutil.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 5c41ea367422..3638d2377e3c 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -771,6 +771,7 @@ static int sugov_init(struct cpufreq_policy *policy)
return 0;
 
 fail:
+   kobject_put(&tunables->attr_set.kobj);
policy->governor_data = NULL;
sugov_tunables_free(tunables);
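
The one-line fix drops the reference taken when the tunables kobject was created, which was leaked whenever sugov_init() hit its error path. A self-contained sketch of the same "put the creation reference on failure" pattern, with a toy refcount instead of a real kobject (names are illustrative, not the cpufreq code):

#include <stdio.h>
#include <stdlib.h>

/* Toy refcounted object standing in for the kobject-backed tunables. */
struct tunables {
        int refcount;
};

static struct tunables *tunables_alloc(void)
{
        struct tunables *t = calloc(1, sizeof(*t));

        if (t)
                t->refcount = 1;        /* creation takes the first reference */
        return t;
}

static void tunables_put(struct tunables *t)
{
        if (t && --t->refcount == 0) {
                puts("last reference dropped, freeing");
                free(t);
        }
}

static int register_tunables(struct tunables *t)
{
        (void)t;
        return -1;                      /* pretend sysfs registration failed */
}

static int governor_init(void)
{
        struct tunables *t = tunables_alloc();

        if (!t)
                return -1;

        if (register_tunables(t) < 0)
                goto fail;

        return 0;                       /* success keeps the reference */

fail:
        tunables_put(t);                /* the missing put was the leak being fixed */
        return -1;
}

int main(void)
{
        printf("governor_init() = %d\n", governor_init());
        return 0;
}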
 


Re: [GIT PULL] scheduler fix

2019-04-27 Thread pr-tracker-bot
The pull request you sent on Sat, 27 Apr 2019 16:39:26 +0200:

> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
> sched-urgent-for-linus

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/15d4e26b816a39f2d1ba40bacb8e8ecf8884477c

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker


[GIT PULL] scheduler fix

2019-04-27 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: a860fa7b96e1a1c974556327aa1aee852d434c21 sched/numa: Fix a possible 
divide-by-zero

Fix a division by zero bug that can trigger in the NUMA placement code.

 Thanks,

Ingo

-->
Xie XiuQi (1):
  sched/numa: Fix a possible divide-by-zero


 kernel/sched/fair.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a4d9e14bf138..35f3ea375084 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, 
u64 *period)
if (p->last_task_numa_placement) {
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
+
+   /* Avoid time going backwards, prevent potential divide error: 
*/
+   if (unlikely((s64)*period < 0))
+   *period = 0;
} else {
delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;
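
The guard added above clamps a period that appears negative (the placement timestamp can be ahead of "now" after clock or CPU changes) so it never reaches a division as a bogus value. A small standalone sketch of the clamp, with made-up numbers:

#include <stdio.h>
#include <stdint.h>

/*
 * Toy version of the guard: if the timestamp delta appears negative
 * (time "went backwards" from this task's point of view), clamp the
 * period instead of letting it reach a division later on.
 */
static uint64_t avg_runtime(uint64_t runtime_delta, uint64_t now, uint64_t last_placement)
{
        uint64_t period = now - last_placement; /* wraps huge if now < last_placement */

        if ((int64_t)period < 0)
                period = 0;

        if (!period)                            /* caller still must not divide by 0 */
                return runtime_delta;

        return runtime_delta / period;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)avg_runtime(1000, 200, 100)); /* 10 */
        printf("%llu\n", (unsigned long long)avg_runtime(1000, 100, 200)); /* clamped */
        return 0;
}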


Re: [GIT PULL] scheduler fix

2019-04-12 Thread pr-tracker-bot
The pull request you sent on Fri, 12 Apr 2019 15:08:11 +0200:

> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
> sched-urgent-for-linus

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/5e6f1fee60a3d80582146835ac01d9808748434f

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker


[GIT PULL] scheduler fix

2019-04-12 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: 0e9f02450da07fc7b1346c8c32c771555173e397 sched/fair: Do not re-read 
->h_load_next during hierarchical load calculation

Fix a NULL pointer dereference crash in certain environments.

 Thanks,

Ingo

-->
Mel Gorman (1):
  sched/fair: Do not re-read ->h_load_next during hierarchical load 
calculation


 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fdab7eb6f351..40bd1e27b1b7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7784,10 +7784,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
if (cfs_rq->last_h_load_update == now)
return;
 
-   cfs_rq->h_load_next = NULL;
+   WRITE_ONCE(cfs_rq->h_load_next, NULL);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
-   cfs_rq->h_load_next = se;
+   WRITE_ONCE(cfs_rq->h_load_next, se);
if (cfs_rq->last_h_load_update == now)
break;
}
@@ -7797,7 +7797,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
cfs_rq->last_h_load_update = now;
}
 
-   while ((se = cfs_rq->h_load_next) != NULL) {
+   while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
load = cfs_rq->h_load;
load = div64_ul(load * se->avg.load_avg,
cfs_rq_load_avg(cfs_rq) + 1);
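
The fix marks every access to cfs_rq->h_load_next with READ_ONCE()/WRITE_ONCE() so the compiler cannot legally re-read the pointer mid-walk and observe a chain that another CPU is rewriting. A userspace sketch of what those macros boil down to, assuming the GCC/Clang __typeof__ extension (illustrative, not the kernel headers):

#include <stdio.h>

/*
 * Minimal stand-ins: a volatile access forces exactly one load/store,
 * so the compiler cannot split or repeat the access.
 */
#define READ_ONCE(x)      (*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v)  (*(volatile __typeof__(x) *)&(x) = (v))

struct node {
        struct node *parent;
        int load;
};

static struct node root  = { .parent = NULL,  .load = 7 };
static struct node child = { .parent = &root, .load = 3 };
static struct node *chain_head;

int main(void)
{
        struct node *n;

        WRITE_ONCE(chain_head, &child);

        /* Each pointer in the chain is loaded exactly once per iteration. */
        for (n = READ_ONCE(chain_head); n; n = READ_ONCE(n->parent))
                printf("load=%d\n", n->load);

        return 0;
}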


Re: [GIT PULL] scheduler fix

2018-12-31 Thread pr-tracker-bot
The pull request you sent on Mon, 31 Dec 2018 15:58:27 +0100:

> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
> sched-urgent-for-linus

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/e3ed513bcf0097c0b8a1f1b4d791a8d0d8933b3b

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker


[GIT PULL] scheduler fix

2018-12-31 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: c40f7d74c741a907cfaeb73a7697081881c497d0 sched/fair: Fix infinite 
loop in update_blocked_averages() by reverting a9e7f6544b9c

This is a revert for a lockup in cgroups-intense workloads - the real 
fixes will come later.

Happy new year,

Ingo

-->
Linus Torvalds (1):
  sched/fair: Fix infinite loop in update_blocked_averages() by reverting 
a9e7f6544b9c


 kernel/sched/fair.c | 43 +--
 1 file changed, 9 insertions(+), 34 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d1907506318a..6483834f1278 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -352,10 +352,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq 
*cfs_rq)
}
 }
 
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
-   list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,\
-leaf_cfs_rq_list)
+/* Iterate through all leaf cfs_rq's on a runqueue: */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+   list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -447,8 +446,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq 
*cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
-   for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
+#define for_each_leaf_cfs_rq(rq, cfs_rq)   \
+   for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -7647,27 +7646,10 @@ static inline bool others_have_blocked(struct rq *rq)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
-   if (cfs_rq->load.weight)
-   return false;
-
-   if (cfs_rq->avg.load_sum)
-   return false;
-
-   if (cfs_rq->avg.util_sum)
-   return false;
-
-   if (cfs_rq->avg.runnable_load_sum)
-   return false;
-
-   return true;
-}
-
 static void update_blocked_averages(int cpu)
 {
struct rq *rq = cpu_rq(cpu);
-   struct cfs_rq *cfs_rq, *pos;
+   struct cfs_rq *cfs_rq;
const struct sched_class *curr_class;
struct rq_flags rf;
bool done = true;
@@ -7679,7 +7661,7 @@ static void update_blocked_averages(int cpu)
 * Iterates the task_group tree in a bottom up fashion, see
 * list_add_leaf_cfs_rq() for details.
 */
-   for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
+   for_each_leaf_cfs_rq(rq, cfs_rq) {
struct sched_entity *se;
 
/* throttled entities do not contribute to load */
@@ -7694,13 +7676,6 @@ static void update_blocked_averages(int cpu)
if (se && !skip_blocked_update(se))
update_load_avg(cfs_rq_of(se), se, 0);
 
-   /*
-* There can be a lot of idle CPU cgroups.  Don't let fully
-* decayed cfs_rqs linger on the list.
-*/
-   if (cfs_rq_is_decayed(cfs_rq))
-   list_del_leaf_cfs_rq(cfs_rq);
-
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
done = false;
@@ -10570,10 +10545,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-   struct cfs_rq *cfs_rq, *pos;
+   struct cfs_rq *cfs_rq;
 
rcu_read_lock();
-   for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
+   for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
 }
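
The revert goes back to the plain RCU iterator because the deleted-while-iterating scheme needed the "_safe" variant, which caches the next element before the loop body runs. A generic standalone sketch of that safe-iteration idea over a toy singly linked list (not the kernel's list.h):

#include <stdio.h>
#include <stdlib.h>

struct item {
        struct item *next;
        int weight;
};

/*
 * Safe iteration: remember the next pointer before the body runs, so the
 * body may unlink and free the current item, as for_each_leaf_cfs_rq_safe()
 * allowed in the reverted code.
 */
#define for_each_item_safe(head, it, tmp) \
        for ((it) = (head); (it) && ((tmp) = (it)->next, 1); (it) = (tmp))

int main(void)
{
        struct item *head = NULL, *it, *tmp, **link;
        int i;

        for (i = 3; i >= 1; i--) {
                it = malloc(sizeof(*it));
                it->weight = i;
                it->next = head;
                head = it;
        }

        /* Drop fully "decayed" items (weight 2 here) while walking the list. */
        link = &head;
        for_each_item_safe(head, it, tmp) {
                if (it->weight == 2) {
                        *link = it->next;
                        free(it);
                        continue;
                }
                printf("visited weight=%d\n", it->weight);
                link = &it->next;
        }

        for (it = head; it; it = it->next)
                printf("kept weight=%d\n", it->weight);
        while (head) {
                tmp = head->next;
                free(head);
                head = tmp;
        }
        return 0;
}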


Re: [GIT PULL] scheduler fix

2018-11-18 Thread pr-tracker-bot
The pull request you sent on Sat, 17 Nov 2018 11:57:57 +0100:

> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
> sched-urgent-for-linus

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/03582f338e39ed8f8e8451ef1ef04f060d785a87

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker


[GIT PULL] scheduler fix

2018-11-17 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: c469933e772132aad040bd6a2adc8edf9ad6f825 sched/fair: Fix 
cpu_util_wake() for 'execl' type workloads

Fix an exec() related scalability/performance regression, which was 
caused by incorrectly calculating load and migrating tasks on exec() when 
they shouldn't be.

 Thanks,

Ingo

-->
Patrick Bellasi (1):
  sched/fair: Fix cpu_util_wake() for 'execl' type workloads


 kernel/sched/fair.c | 62 +
 1 file changed, 48 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3648d0300fdf..ac855b2f4774 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5674,11 +5674,11 @@ static int wake_affine(struct sched_domain *sd, struct 
task_struct *p,
return target;
 }
 
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
 
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
 {
-   return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+   return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
 }
 
 /*
@@ -5738,7 +5738,7 @@ find_idlest_group(struct sched_domain *sd, struct 
task_struct *p,
 
avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 
-   spare_cap = capacity_spare_wake(i, p);
+   spare_cap = capacity_spare_without(i, p);
 
if (spare_cap > max_spare_cap)
max_spare_cap = spare_cap;
@@ -5889,8 +5889,8 @@ static inline int find_idlest_cpu(struct sched_domain 
*sd, struct task_struct *p
return prev_cpu;
 
/*
-* We need task's util for capacity_spare_wake, sync it up to prev_cpu's
-* last_update_time.
+* We need task's util for capacity_spare_without, sync it up to
+* prev_cpu's last_update_time.
 */
if (!(sd_flag & SD_BALANCE_FORK))
sync_entity_load_avg(&p->se);
@@ -6216,10 +6216,19 @@ static inline unsigned long cpu_util(int cpu)
 }
 
 /*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
  */
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 {
struct cfs_rq *cfs_rq;
unsigned int util;
@@ -6231,7 +6240,7 @@ static unsigned long cpu_util_wake(int cpu, struct 
task_struct *p)
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
 
-   /* Discount task's blocked util from CPU's util */
+   /* Discount task's util from CPU's util */
util -= min_t(unsigned int, util, task_util(p));
 
/*
@@ -6240,14 +6249,14 @@ static unsigned long cpu_util_wake(int cpu, struct 
task_struct *p)
 * a) if *p is the only task sleeping on this CPU, then:
 *  cpu_util (== task_util) > util_est (== 0)
 *and thus we return:
-*  cpu_util_wake = (cpu_util - task_util) = 0
+*  cpu_util_without = (cpu_util - task_util) = 0
 *
 * b) if other tasks are SLEEPING on this CPU, which is now exiting
 *IDLE, then:
 *  cpu_util >= task_util
 *  cpu_util > util_est (== 0)
 *and thus we discount *p's blocked utilization to return:
-*  cpu_util_wake = (cpu_util - task_util) >= 0
+*  cpu_util_without = (cpu_util - task_util) >= 0
 *
 * c) if other tasks are RUNNABLE on that CPU and
 *  util_est > cpu_util
@@ -6260,8 +6269,33 @@ static unsigned long cpu_util_wake(int cpu, struct 
task_struct *p)
 * covered by the following code when estimated utilization is
 * enabled.
 */
-   if (sched_feat(UTIL_EST))
-   util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+   if (sched_feat(UTIL_EST)) {
+   unsigned int estimated =
+   READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+   /*
+* Despite the following checks we still have a 
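
cpu_util_without() subtracts the waking task's own contribution from the CPU's utilization and, with UTIL_EST, bounds the result by the runqueue's estimated utilization; the diff above only shows the start of that UTIL_EST branch. A deliberately simplified numeric sketch of just the discount-then-max step (values are made up, and the real code adds further checks):

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }
static unsigned int max_u(unsigned int a, unsigned int b) { return a > b ? a : b; }

/* Simplified model: discount the task's utilization, then apply the
 * estimated-utilization floor of whatever is still enqueued. */
static unsigned int cpu_util_without(unsigned int cpu_util, unsigned int task_util,
                                     unsigned int util_est_enqueued)
{
        unsigned int util = cpu_util - min_u(cpu_util, task_util);

        return max_u(util, util_est_enqueued);
}

int main(void)
{
        /* p was the only sleeper: nothing else enqueued -> CPU looks free */
        printf("%u\n", cpu_util_without(300, 300, 0));    /* 0 */
        /* other tasks are runnable with an estimate of 250 -> keep 250 */
        printf("%u\n", cpu_util_without(300, 300, 250));  /* 250 */
        return 0;
}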

Re: [GIT PULL] scheduler fix

2018-10-11 Thread Greg Kroah-Hartman
On Thu, Oct 11, 2018 at 11:12:37AM +0200, Ingo Molnar wrote:
> Greg,
> 
> Please pull the latest sched-urgent-for-linus git tree from:
> 
>git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
> sched-urgent-for-linus

Now merged, thanks.

greg k-h



[GIT PULL] scheduler fix

2018-10-11 Thread Ingo Molnar
Greg,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: e054637597ba36d3729ba6a3a3dd7aad8e2a3003 mm, sched/numa: Remove 
remaining traces of NUMA rate-limiting

Cleanup of dead code left over from the recent sched/numa fixes.

 Thanks,

Ingo

-->
Srikar Dronamraju (1):
  mm, sched/numa: Remove remaining traces of NUMA rate-limiting


 include/linux/mmzone.h |  4 
 mm/page_alloc.c| 10 --
 2 files changed, 14 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3f4c0b167333..d4b0c79d2924 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -667,10 +667,6 @@ typedef struct pglist_data {
enum zone_type kcompactd_classzone_idx;
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
-#endif
-#ifdef CONFIG_NUMA_BALANCING
-   /* Lock serializing the migrate rate limiting window */
-   spinlock_t numabalancing_migrate_lock;
 #endif
/*
 * This is a per-node reserve of pages that are not available
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 706a738c0aee..e2ef1c17942f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6193,15 +6193,6 @@ static unsigned long __init calc_memmap_size(unsigned 
long spanned_pages,
return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-static void pgdat_init_numabalancing(struct pglist_data *pgdat)
-{
-   spin_lock_init(&pgdat->numabalancing_migrate_lock);
-}
-#else
-static void pgdat_init_numabalancing(struct pglist_data *pgdat) {}
-#endif
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void pgdat_init_split_queue(struct pglist_data *pgdat)
 {
@@ -6226,7 +6217,6 @@ static void __meminit pgdat_init_internals(struct 
pglist_data *pgdat)
 {
pgdat_resize_init(pgdat);
 
-   pgdat_init_numabalancing(pgdat);
pgdat_init_split_queue(pgdat);
pgdat_init_kcompactd(pgdat);
 

- End forwarded message -


[GIT pull] scheduler fix for 4.17

2018-05-13 Thread Thomas Gleixner
Linus,

please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

Revert the new NUMA aware placement approach which turned out to create
more problems than it solved.

Thanks,

tglx

-->
Mel Gorman (1):
  Revert "sched/numa: Delay retrying placement for automatic NUMA balance 
after wake_affine()"


 kernel/sched/fair.c | 57 +
 1 file changed, 1 insertion(+), 56 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 54dc31e7ab9b..f43627c6bb3d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1854,7 +1854,6 @@ static int task_numa_migrate(struct task_struct *p)
 static void numa_migrate_preferred(struct task_struct *p)
 {
unsigned long interval = HZ;
-   unsigned long numa_migrate_retry;
 
/* This task has no NUMA fault statistics yet */
if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
@@ -1862,18 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p)
 
/* Periodically retry migrating the task to the preferred node */
interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
-   numa_migrate_retry = jiffies + interval;
-
-   /*
-* Check that the new retry threshold is after the current one. If
-* the retry is in the future, it implies that wake_affine has
-* temporarily asked NUMA balancing to backoff from placement.
-*/
-   if (numa_migrate_retry > p->numa_migrate_retry)
-   return;
-
-   /* Safe to try placing the task on the preferred node */
-   p->numa_migrate_retry = numa_migrate_retry;
+   p->numa_migrate_retry = jiffies + interval;
 
/* Success if task is already running on preferred CPU */
if (task_node(p) == p->numa_preferred_nid)
@@ -5922,48 +5910,6 @@ wake_affine_weight(struct sched_domain *sd, struct 
task_struct *p,
return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-static void
-update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
-{
-   unsigned long interval;
-
-   if (!static_branch_likely(&sched_numa_balancing))
-   return;
-
-   /* If balancing has no preference then continue gathering data */
-   if (p->numa_preferred_nid == -1)
-   return;
-
-   /*
-* If the wakeup is not affecting locality then it is neutral from
-* the perspective of NUMA balacing so continue gathering data.
-*/
-   if (cpu_to_node(prev_cpu) == cpu_to_node(target))
-   return;
-
-   /*
-* Temporarily prevent NUMA balancing trying to place waker/wakee after
-* wakee has been moved by wake_affine. This will potentially allow
-* related tasks to converge and update their data placement. The
-* 4 * numa_scan_period is to allow the two-pass filter to migrate
-* hot data to the wakers node.
-*/
-   interval = max(sysctl_numa_balancing_scan_delay,
-p->numa_scan_period << 2);
-   p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
-
-   interval = max(sysctl_numa_balancing_scan_delay,
-current->numa_scan_period << 2);
-   current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
-}
-#else
-static void
-update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
-{
-}
-#endif
-
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
   int this_cpu, int prev_cpu, int sync)
 {
@@ -5979,7 +5925,6 @@ static int wake_affine(struct sched_domain *sd, struct 
task_struct *p,
if (target == nr_cpumask_bits)
return prev_cpu;
 
-   update_wa_numa_placement(p, prev_cpu, target);
schedstat_inc(sd->ttwu_move_affine);
schedstat_inc(p->se.statistics.nr_wakeups_affine);
return target;


[GIT pull] scheduler fix for 4.15

2018-01-28 Thread Thomas Gleixner
Linus,

please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

A single bug fix to prevent a subtle deadlock in the scheduler core code
vs. cpu hotplug.

Thanks,

tglx

-->
Peter Zijlstra (1):
  sched/core: Fix cpu.max vs. cpuhotplug deadlock


 include/linux/jump_label.h |  7 +++
 kernel/jump_label.c| 12 +---
 kernel/sched/fair.c|  4 ++--
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index c7b368c734af..e0340ca08d98 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -160,6 +160,8 @@ extern void arch_jump_label_transform_static(struct 
jump_entry *entry,
 extern int jump_label_text_reserved(void *start, void *end);
 extern void static_key_slow_inc(struct static_key *key);
 extern void static_key_slow_dec(struct static_key *key);
+extern void static_key_slow_inc_cpuslocked(struct static_key *key);
+extern void static_key_slow_dec_cpuslocked(struct static_key *key);
 extern void jump_label_apply_nops(struct module *mod);
 extern int static_key_count(struct static_key *key);
 extern void static_key_enable(struct static_key *key);
@@ -222,6 +224,9 @@ static inline void static_key_slow_dec(struct static_key 
*key)
atomic_dec(&key->enabled);
 }
 
+#define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key)
+#define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key)
+
 static inline int jump_label_text_reserved(void *start, void *end)
 {
return 0;
@@ -416,6 +421,8 @@ extern bool wrong_branch_error(void);
 
 #define static_branch_inc(x)   static_key_slow_inc(&(x)->key)
 #define static_branch_dec(x)   static_key_slow_dec(&(x)->key)
+#define static_branch_inc_cpuslocked(x)
static_key_slow_inc_cpuslocked(&(x)->key)
+#define static_branch_dec_cpuslocked(x)
static_key_slow_dec_cpuslocked(&(x)->key)
 
 /*
  * Normal usage; boolean enable/disable.
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 8594d24e4adc..b4517095db6a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -79,7 +79,7 @@ int static_key_count(struct static_key *key)
 }
 EXPORT_SYMBOL_GPL(static_key_count);
 
-static void static_key_slow_inc_cpuslocked(struct static_key *key)
+void static_key_slow_inc_cpuslocked(struct static_key *key)
 {
int v, v1;
 
@@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key)
 }
 EXPORT_SYMBOL_GPL(static_key_disable);
 
-static void static_key_slow_dec_cpuslocked(struct static_key *key,
+static void __static_key_slow_dec_cpuslocked(struct static_key *key,
   unsigned long rate_limit,
   struct delayed_work *work)
 {
@@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key,
  struct delayed_work *work)
 {
cpus_read_lock();
-   static_key_slow_dec_cpuslocked(key, rate_limit, work);
+   __static_key_slow_dec_cpuslocked(key, rate_limit, work);
cpus_read_unlock();
 }
 
@@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key)
 }
 EXPORT_SYMBOL_GPL(static_key_slow_dec);
 
+void static_key_slow_dec_cpuslocked(struct static_key *key)
+{
+   STATIC_KEY_CHECK_USE(key);
+   __static_key_slow_dec_cpuslocked(key, 0, NULL);
+}
+
 void static_key_slow_dec_deferred(struct static_key_deferred *key)
 {
STATIC_KEY_CHECK_USE(key);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2fe3aa853e4d..26a71ebcd3c2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4365,12 +4365,12 @@ static inline bool cfs_bandwidth_used(void)
 
 void cfs_bandwidth_usage_inc(void)
 {
-   static_key_slow_inc(&__cfs_bandwidth_used);
+   static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
 }
 
 void cfs_bandwidth_usage_dec(void)
 {
-   static_key_slow_dec(&__cfs_bandwidth_used);
+   static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
 }
 #else /* HAVE_JUMP_LABEL */
 static bool cfs_bandwidth_used(void)
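
The deadlock here was that the CFS bandwidth code calls static_key_slow_inc() while the caller already holds the CPU-hotplug lock, and static_key_slow_inc() takes that lock again; the fix exports *_cpuslocked variants that assume the caller holds it. A toy userspace sketch of the locked/unlocked API split, using a pthread mutex as a stand-in for the hotplug lock (illustrative only, not the jump-label implementation):

#include <stdio.h>
#include <pthread.h>

/*
 * Toy model of the pattern: the outer API takes the lock itself, the
 * "_locked" variant assumes the caller already holds it.  Calling the
 * outer API from a context that holds this non-recursive lock would
 * simply deadlock.
 */
static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;
static int key_enabled;

static void key_inc_locked(void)        /* caller holds hotplug_lock */
{
        key_enabled++;
}

static void key_inc(void)               /* takes the lock itself */
{
        pthread_mutex_lock(&hotplug_lock);
        key_inc_locked();
        pthread_mutex_unlock(&hotplug_lock);
}

static void bandwidth_usage_inc(void)
{
        /* Called with hotplug_lock already held: must use the _locked
         * variant; key_inc() here would block on itself. */
        key_inc_locked();
}

int main(void)
{
        key_inc();                      /* normal path */

        pthread_mutex_lock(&hotplug_lock);
        bandwidth_usage_inc();          /* lock already held by this caller */
        pthread_mutex_unlock(&hotplug_lock);

        printf("key_enabled=%d\n", key_enabled);
        return 0;
}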


[GIT PULL] scheduler fix

2018-01-17 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: c96f5471ce7d2aefd0dda560cc23f08ab00bc65d delayacct: Account blkio 
completion on the correct task

A delayacct statistics correctness fix.

 Thanks,

Ingo

-->
Josh Snyder (1):
  delayacct: Account blkio completion on the correct task


 include/linux/delayacct.h |  8 
 kernel/delayacct.c| 42 ++
 kernel/sched/core.c   |  6 +++---
 3 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 4178d2493547..5e335b6203f4 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -71,7 +71,7 @@ extern void delayacct_init(void);
 extern void __delayacct_tsk_init(struct task_struct *);
 extern void __delayacct_tsk_exit(struct task_struct *);
 extern void __delayacct_blkio_start(void);
-extern void __delayacct_blkio_end(void);
+extern void __delayacct_blkio_end(struct task_struct *);
 extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
 extern __u64 __delayacct_blkio_ticks(struct task_struct *);
 extern void __delayacct_freepages_start(void);
@@ -122,10 +122,10 @@ static inline void delayacct_blkio_start(void)
__delayacct_blkio_start();
 }
 
-static inline void delayacct_blkio_end(void)
+static inline void delayacct_blkio_end(struct task_struct *p)
 {
if (current->delays)
-   __delayacct_blkio_end();
+   __delayacct_blkio_end(p);
delayacct_clear_flag(DELAYACCT_PF_BLKIO);
 }
 
@@ -169,7 +169,7 @@ static inline void delayacct_tsk_free(struct task_struct 
*tsk)
 {}
 static inline void delayacct_blkio_start(void)
 {}
-static inline void delayacct_blkio_end(void)
+static inline void delayacct_blkio_end(struct task_struct *p)
 {}
 static inline int delayacct_add_tsk(struct taskstats *d,
struct task_struct *tsk)
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 4a1c33416b6a..e2764d767f18 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk)
  * Finish delay accounting for a statistic using its timestamps (@start),
  * accumalator (@total) and @count
  */
-static void delayacct_end(u64 *start, u64 *total, u32 *count)
+static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
 {
s64 ns = ktime_get_ns() - *start;
unsigned long flags;
 
if (ns > 0) {
-   spin_lock_irqsave(&current->delays->lock, flags);
+   spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
-   spin_unlock_irqrestore(&current->delays->lock, flags);
+   spin_unlock_irqrestore(lock, flags);
}
 }
 
@@ -69,17 +69,25 @@ void __delayacct_blkio_start(void)
current->delays->blkio_start = ktime_get_ns();
 }
 
-void __delayacct_blkio_end(void)
+/*
+ * We cannot rely on the `current` macro, as we haven't yet switched back to
+ * the process being woken.
+ */
+void __delayacct_blkio_end(struct task_struct *p)
 {
-   if (current->delays->flags & DELAYACCT_PF_SWAPIN)
-   /* Swapin block I/O */
-   delayacct_end(&current->delays->blkio_start,
-   &current->delays->swapin_delay,
-   &current->delays->swapin_count);
-   else    /* Other block I/O */
-   delayacct_end(&current->delays->blkio_start,
-   &current->delays->blkio_delay,
-   &current->delays->blkio_count);
+   struct task_delay_info *delays = p->delays;
+   u64 *total;
+   u32 *count;
+
+   if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
+   total = &delays->swapin_delay;
+   count = &delays->swapin_count;
+   } else {
+   total = &delays->blkio_delay;
+   count = &delays->blkio_count;
+   }
+
+   delayacct_end(&delays->lock, &delays->blkio_start, total, count);
 }
 
 int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -153,8 +161,10 @@ void __delayacct_freepages_start(void)
 
 void __delayacct_freepages_end(void)
 {
-   delayacct_end(&current->delays->freepages_start,
-   &current->delays->freepages_delay,
-   &current->delays->freepages_count);
+   delayacct_end(
+   &current->delays->lock,
+   &current->delays->freepages_start,
+   &current->delays->freepages_delay,
+   &current->delays->freepages_count);
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 644fa2e3d993..a7bf32aabfda 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, 
int wake_flags)
p->state = TASK_WAKING;
 
if (p->in_iowait) {
-   delayacct_blkio_end();
+   delayacct_blkio_end(p);
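
The correctness issue was that __delayacct_blkio_end() charged the block-I/O delay to 'current', but at wakeup time 'current' is the waker, not the task whose I/O just finished; the fix threads the woken task through explicitly. A standalone sketch of the difference, with toy types and made-up numbers:

#include <stdio.h>
#include <stdint.h>

struct task {
        const char *name;
        uint64_t blkio_start;
        uint64_t blkio_delay;
};

static struct task waker   = { "waker",   0,   0 };
static struct task sleeper = { "sleeper", 100, 0 };
static struct task *current_task = &waker;      /* toy stand-in for 'current' */

/* Buggy variant: charges the delay to whoever happens to be running. */
static void blkio_end_current(uint64_t now)
{
        current_task->blkio_delay += now - current_task->blkio_start;
}

/* Fixed variant: the woken task is passed in explicitly. */
static void blkio_end(struct task *p, uint64_t now)
{
        p->blkio_delay += now - p->blkio_start;
}

int main(void)
{
        /* The waker runs the wakeup for the sleeper at time 150. */
        blkio_end(&sleeper, 150);
        printf("%s blkio_delay=%llu\n", sleeper.name,
               (unsigned long long)sleeper.blkio_delay);        /* 50, as intended */

        blkio_end_current(150);                                  /* misaccounts */
        printf("%s blkio_delay=%llu\n", waker.name,
               (unsigned long long)waker.blkio_delay);
        return 0;
}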

[GIT PULL] scheduler fix

2018-01-17 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: c96f5471ce7d2aefd0dda560cc23f08ab00bc65d delayacct: Account blkio 
completion on the correct task

A delayacct statistics correctness fix.

 Thanks,

Ingo

-->
Josh Snyder (1):
  delayacct: Account blkio completion on the correct task


 include/linux/delayacct.h |  8 
 kernel/delayacct.c| 42 ++
 kernel/sched/core.c   |  6 +++---
 3 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 4178d2493547..5e335b6203f4 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -71,7 +71,7 @@ extern void delayacct_init(void);
 extern void __delayacct_tsk_init(struct task_struct *);
 extern void __delayacct_tsk_exit(struct task_struct *);
 extern void __delayacct_blkio_start(void);
-extern void __delayacct_blkio_end(void);
+extern void __delayacct_blkio_end(struct task_struct *);
 extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
 extern __u64 __delayacct_blkio_ticks(struct task_struct *);
 extern void __delayacct_freepages_start(void);
@@ -122,10 +122,10 @@ static inline void delayacct_blkio_start(void)
__delayacct_blkio_start();
 }
 
-static inline void delayacct_blkio_end(void)
+static inline void delayacct_blkio_end(struct task_struct *p)
 {
if (current->delays)
-   __delayacct_blkio_end();
+   __delayacct_blkio_end(p);
delayacct_clear_flag(DELAYACCT_PF_BLKIO);
 }
 
@@ -169,7 +169,7 @@ static inline void delayacct_tsk_free(struct task_struct 
*tsk)
 {}
 static inline void delayacct_blkio_start(void)
 {}
-static inline void delayacct_blkio_end(void)
+static inline void delayacct_blkio_end(struct task_struct *p)
 {}
 static inline int delayacct_add_tsk(struct taskstats *d,
struct task_struct *tsk)
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 4a1c33416b6a..e2764d767f18 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk)
  * Finish delay accounting for a statistic using its timestamps (@start),
  * accumalator (@total) and @count
  */
-static void delayacct_end(u64 *start, u64 *total, u32 *count)
+static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
 {
s64 ns = ktime_get_ns() - *start;
unsigned long flags;
 
if (ns > 0) {
-   spin_lock_irqsave(>delays->lock, flags);
+   spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
-   spin_unlock_irqrestore(>delays->lock, flags);
+   spin_unlock_irqrestore(lock, flags);
}
 }
 
@@ -69,17 +69,25 @@ void __delayacct_blkio_start(void)
current->delays->blkio_start = ktime_get_ns();
 }
 
-void __delayacct_blkio_end(void)
+/*
+ * We cannot rely on the `current` macro, as we haven't yet switched back to
+ * the process being woken.
+ */
+void __delayacct_blkio_end(struct task_struct *p)
 {
-   if (current->delays->flags & DELAYACCT_PF_SWAPIN)
-   /* Swapin block I/O */
-   delayacct_end(>delays->blkio_start,
-   >delays->swapin_delay,
-   >delays->swapin_count);
-   else/* Other block I/O */
-   delayacct_end(>delays->blkio_start,
-   >delays->blkio_delay,
-   >delays->blkio_count);
+   struct task_delay_info *delays = p->delays;
+   u64 *total;
+   u32 *count;
+
+   if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
+   total = >swapin_delay;
+   count = >swapin_count;
+   } else {
+   total = >blkio_delay;
+   count = >blkio_count;
+   }
+
+   delayacct_end(>lock, >blkio_start, total, count);
 }
 
 int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -153,8 +161,10 @@ void __delayacct_freepages_start(void)
 
 void __delayacct_freepages_end(void)
 {
-   delayacct_end(&current->delays->freepages_start,
-   &current->delays->freepages_delay,
-   &current->delays->freepages_count);
+   delayacct_end(
+   &current->delays->lock,
+   &current->delays->freepages_start,
+   &current->delays->freepages_delay,
+   &current->delays->freepages_count);
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 644fa2e3d993..a7bf32aabfda 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, 
int wake_flags)
p->state = TASK_WAKING;
 
if (p->in_iowait) {
-   delayacct_blkio_end();
+   delayacct_blkio_end(p);
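
The key point of the change above is that at wakeup time the accounting code runs
in the waker's context, so anything that implicitly charges "whoever is running
now" attributes the block I/O delay to the wrong task. Below is a minimal
userspace sketch of that distinction; struct task, end_blkio_delay() and the
current_task pointer are invented names for illustration, not kernel code.

#include <stdint.h>
#include <stdio.h>

struct task {
	const char *name;
	uint64_t blkio_start;	/* when the task started waiting for I/O */
	uint64_t blkio_delay;	/* accumulated wait time */
};

/* the task running on this CPU; at wakeup time this is the waker */
static struct task *current_task;

/* buggy shape: charges whoever happens to be running */
static void end_blkio_delay_buggy(uint64_t now)
{
	current_task->blkio_delay += now - current_task->blkio_start;
}

/* fixed shape: charges the task that actually waited */
static void end_blkio_delay(struct task *p, uint64_t now)
{
	p->blkio_delay += now - p->blkio_start;
}

int main(void)
{
	struct task waker = { "waker", 0, 0 };
	struct task sleeper = { "sleeper", 100, 0 };

	current_task = &waker;		/* the wakeup runs in the waker's context */

	end_blkio_delay_buggy(250);	/* wrongly charges the waker */
	end_blkio_delay(&sleeper, 250);	/* charges the sleeper, as the fix does */

	printf("waker   blkio_delay = %llu (should have stayed 0)\n",
	       (unsigned long long)waker.blkio_delay);
	printf("sleeper blkio_delay = %llu\n",
	       (unsigned long long)sleeper.blkio_delay);
	return 0;
}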

[GIT PULL] scheduler fix

2017-10-27 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: 88796e7e5c457cae72833196cb98e6895dd107e2 sched/swait: Document it 
clearly that the swait facilities are special and shouldn't be used

Update the swait documentation to discourage their use.

 Thanks,

Ingo

-->
Davidlohr Bueso (1):
  sched/swait: Document it clearly that the swait facilities are special 
and shouldn't be used


 include/linux/swait.h | 27 ---
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/include/linux/swait.h b/include/linux/swait.h
index 73e97a08d3d0..cf30f5022472 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -9,13 +9,16 @@
 /*
  * Simple wait queues
  *
- * While these are very similar to the other/complex wait queues (wait.h) the
- * most important difference is that the simple waitqueue allows for
- * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
- * times.
+ * While these are very similar to regular wait queues (wait.h) the most
+ * important difference is that the simple waitqueue allows for deterministic
+ * behaviour -- IOW it has strictly bounded IRQ and lock hold times.
  *
- * In order to make this so, we had to drop a fair number of features of the
- * other waitqueue code; notably:
+ * Mainly, this is accomplished by two things. Firstly not allowing 
swake_up_all
+ * from IRQ disabled, and dropping the lock upon every wakeup, giving a higher
+ * priority task a chance to run.
+ *
+ * Secondly, we had to drop a fair number of features of the other waitqueue
+ * code; notably:
  *
  *  - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
  *all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
@@ -24,12 +27,14 @@
  *  - the exclusive mode; because this requires preserving the list order
  *and this is hard.
  *
- *  - custom wake functions; because you cannot give any guarantees about
- *random code.
- *
- * As a side effect of this; the data structures are slimmer.
+ *  - custom wake callback functions; because you cannot give any guarantees
+ *about random code. This also allows swait to be used in RT, such that
+ *raw spinlock can be used for the swait queue head.
  *
- * One would recommend using this wait queue where possible.
+ * As a side effect of these; the data structures are slimmer albeit more 
ad-hoc.
+ * For all the above, note that simple wait queues should _only_ be used under
+ * very specific realtime constraints -- it is best to stick with the regular
+ * wait queues in most cases.
  */
 
 struct task_struct;


[GIT pull] scheduler fix for 4.12

2017-05-21 Thread Thomas Gleixner
Linus,

please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

A single scheduler fix:

  Prevent the idle task from ever being preempted. That makes sure that
  synchronize_rcu_tasks(), which ignores the idle task, does not wrongly
  conclude that no task is stuck in preempted state. If that happens while
  idle is preempted on an ftrace trampoline, the machine crashes due to
  inconsistent state.

Thanks,

tglx

-->
Steven Rostedt (VMware) (1):
  sched/core: Call __schedule() from do_idle() without enabling preemption


 kernel/sched/core.c  | 25 +
 kernel/sched/idle.c  |  2 +-
 kernel/sched/sched.h |  2 ++
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 759f4bd52cd6..803c3bc274c4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3502,6 +3502,31 @@ asmlinkage __visible void __sched schedule(void)
 }
 EXPORT_SYMBOL(schedule);
 
+/*
+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
+ * state (have scheduled out non-voluntarily) by making sure that all
+ * tasks have either left the run queue or have gone into user space.
+ * As idle tasks do not do either, they must not ever be preempted
+ * (schedule out non-voluntarily).
+ *
+ * schedule_idle() is similar to schedule_preempt_disable() except that it
+ * never enables preemption because it does not call sched_submit_work().
+ */
+void __sched schedule_idle(void)
+{
+   /*
+* As this skips calling sched_submit_work(), which the idle task does
+* regardless because that function is a nop when the task is in a
+* TASK_RUNNING state, make sure this isn't used someplace that the
+* current task can be in any other state. Note, idle is always in the
+* TASK_RUNNING state.
+*/
+   WARN_ON_ONCE(current->state);
+   do {
+   __schedule(false);
+   } while (need_resched());
+}
+
 #ifdef CONFIG_CONTEXT_TRACKING
 asmlinkage __visible void __sched schedule_user(void)
 {
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2a25a9ec2c6e..ef63adce0c9c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -265,7 +265,7 @@ static void do_idle(void)
smp_mb__after_atomic();
 
sched_ttwu_pending();
-   schedule_preempt_disabled();
+   schedule_idle();
 
if (unlikely(klp_patch_pending(current)))
klp_update_patch_state(current);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7808ab050599..6dda2aab731e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1467,6 +1467,8 @@ static inline struct cpuidle_state *idle_get_state(struct 
rq *rq)
 }
 #endif
 
+extern void schedule_idle(void);
+
 extern void sysrq_sched_debug_show(void);
 extern void sched_init_granularity(void);
 extern void update_max_interval(void);


[GIT pull] scheduler fix for 4.11

2017-04-30 Thread Thomas Gleixner
Linus,

please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

A single fix for a cputime accounting regression which got introduced in
the 4.11 cycle.

Thanks,

tglx

-->
Frederic Weisbecker (1):
  sched/cputime: Fix ksoftirqd cputime accounting regression


 kernel/sched/cputime.c | 27 ---
 kernel/sched/sched.h   |  9 +++--
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b46c8..aea3135c5d90 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
 }
 
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+ enum cpu_usage_stat idx)
+{
+   u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+   u64_stats_update_begin(&irqtime->sync);
+   cpustat[idx] += delta;
+   irqtime->total += delta;
+   irqtime->tick_delta += delta;
+   u64_stats_update_end(&irqtime->sync);
+}
+
 /*
  * Called before incrementing preempt_count on {soft,}irq_enter
  * and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
 void irqtime_account_irq(struct task_struct *curr)
 {
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
-   u64 *cpustat = kcpustat_this_cpu->cpustat;
s64 delta;
int cpu;
 
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
 
-   u64_stats_update_begin(&irqtime->sync);
/*
 * We do not account for softirq time from ksoftirqd here.
 * We want to continue accounting softirq time to ksoftirqd thread
 * in that case, so as not to confuse scheduler with a special task
 * that do not consume any time, but still wants to run.
 */
-   if (hardirq_count()) {
-   cpustat[CPUTIME_IRQ] += delta;
-   irqtime->tick_delta += delta;
-   } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
-   cpustat[CPUTIME_SOFTIRQ] += delta;
-   irqtime->tick_delta += delta;
-   }
-
-   u64_stats_update_end(&irqtime->sync);
+   if (hardirq_count())
+   irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+   else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+   irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf92214ad8..767aab3505a8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1869,6 +1869,7 @@ static inline void nohz_balance_exit_idle(unsigned int 
cpu) { }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 struct irqtime {
+   u64 total;
u64 tick_delta;
u64 irq_start_time;
struct u64_stats_sync   sync;
@@ -1876,16 +1877,20 @@ struct irqtime {
 
 DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
 
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * and never move forward.
+ */
 static inline u64 irq_time_read(int cpu)
 {
struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
-   u64 *cpustat = kcpustat_cpu(cpu).cpustat;
unsigned int seq;
u64 total;
 
do {
seq = __u64_stats_fetch_begin(&irqtime->sync);
-   total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+   total = irqtime->total;
} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
 
return total;
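
As a rough illustration of the accounting rule the patch above preserves, the
sketch below classifies a time delta the way irqtime_account_irq() does
(hardirq time is always counted, softirq time only when it is not run by
ksoftirqd) and accumulates it into a single running total, like the new
irqtime->total that irq_time_read() now reports. All names here are invented
and this is plain userspace C, not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

enum bucket { BUCKET_IRQ, BUCKET_SOFTIRQ, BUCKET_NONE };

struct irqtime_sketch { unsigned long long total; };

/* hardirq time is always accounted, softirq time only when it is not
 * executed by ksoftirqd */
static enum bucket classify(bool in_hardirq, bool in_softirq, bool is_ksoftirqd)
{
	if (in_hardirq)
		return BUCKET_IRQ;
	if (in_softirq && !is_ksoftirqd)
		return BUCKET_SOFTIRQ;
	return BUCKET_NONE;	/* ksoftirqd keeps its time as ordinary runtime */
}

static void account(struct irqtime_sketch *it, unsigned long long cpustat[2],
		    unsigned long long delta, enum bucket b)
{
	if (b == BUCKET_NONE)
		return;
	cpustat[b] += delta;
	it->total += delta;	/* the single sum the read side reports */
}

int main(void)
{
	struct irqtime_sketch it = { 0 };
	unsigned long long cpustat[2] = { 0, 0 };

	account(&it, cpustat, 100, classify(true, false, false));  /* hardirq */
	account(&it, cpustat, 40, classify(false, true, true));    /* ksoftirqd */

	printf("irq=%llu softirq=%llu total=%llu\n",
	       cpustat[BUCKET_IRQ], cpustat[BUCKET_SOFTIRQ], it.total);
	return 0;
}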


[GIT PULL] scheduler fix

2016-12-07 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: 83929cce95251cc77e5659bf493bd424ae0e7a67 sched/autogroup: Fix 64-bit 
kernel nice level adjustment

An autogroup nice level adjustment bug fix.

 Thanks,

Ingo

-->
Mike Galbraith (1):
  sched/autogroup: Fix 64-bit kernel nice level adjustment


 kernel/sched/auto_group.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index f1c8fd566246..da39489d2d80 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -212,6 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, 
int nice)
 {
static unsigned long next = INITIAL_JIFFIES;
struct autogroup *ag;
+   unsigned long shares;
int err;
 
if (nice < MIN_NICE || nice > MAX_NICE)
@@ -230,9 +231,10 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, 
int nice)
 
next = HZ / 10 + jiffies;
ag = autogroup_task_get(p);
+   shares = scale_load(sched_prio_to_weight[nice + 20]);
 
down_write(&ag->lock);
-   err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]);
+   err = sched_group_set_shares(ag->tg, shares);
if (!err)
ag->nice = nice;
up_write(&ag->lock);
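
The bug fixed above is a fixed-point scaling issue: on 64-bit kernels load
weights carry extra resolution, so the nice-level weight must go through
scale_load() before being handed to sched_group_set_shares(). A small
arithmetic sketch follows, assuming the usual 10-bit fixed-point shift; the
program is illustrative only, not kernel code.

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT	10	/* assumed 64-bit fixed-point shift */
#define scale_load(w)		((unsigned long)(w) << SCHED_FIXEDPOINT_SHIFT)

int main(void)
{
	/* weight for nice 0, i.e. sched_prio_to_weight[0 + 20] */
	unsigned long nice0_weight = 1024;

	printf("raw nice-0 weight   = %lu\n", nice0_weight);
	printf("scaled group shares = %lu\n", scale_load(nice0_weight));
	return 0;
}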


[GIT PULL] scheduler fix

2016-10-28 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: f5d6d2da0d9098a4aa0ebcc187aa0fc167045d6b sched/fair: Remove unused 
but set variable 'rq'

An unused variable warning fix.

 Thanks,

Ingo

-->
Tobias Klauser (1):
  sched/fair: Remove unused but set variable 'rq'


 kernel/sched/fair.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d941c97dfbc3..c242944f5cbd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8839,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct 
task_group *parent)
 {
struct sched_entity *se;
struct cfs_rq *cfs_rq;
-   struct rq *rq;
int i;
 
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8854,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct 
task_group *parent)
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
for_each_possible_cpu(i) {
-   rq = cpu_rq(i);
-
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
  GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)


[GIT PULL] scheduler fix

2016-10-19 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: b5a9b340789b2b24c6896bcf7a065c31a4db671c sched/fair: Fix incorrect 
task group ->load_avg

This fixes a group scheduling related performance/interactivity regression 
introduced in v4.8, which affects certain hardware environments where 
cpu_possible_mask != cpu_present_mask.

 Thanks,

Ingo

-->
Vincent Guittot (1):
  sched/fair: Fix incorrect task group ->load_avg


 kernel/sched/fair.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 76ee7de1859d..d941c97dfbc3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se)
 * will definitely be update (after enqueue).
 */
sa->period_contrib = 1023;
-   sa->load_avg = scale_load_down(se->load.weight);
+   /*
+* Tasks are intialized with full load to be seen as heavy tasks until
+* they get a chance to stabilize to their real load level.
+* Group entities are intialized with zero load to reflect the fact that
+* nothing has been attached to the task group yet.
+*/
+   if (entity_is_task(se))
+   sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
/*
 * At this point, util_avg won't be used in select_task_rq_fair anyway


[GIT PULL] scheduler fix

2016-10-18 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: 9cfb38a7ba5a9c27c1af8093fb1af4b699c0a441 sched/fair: Fix sched 
domains NULL dereference in select_idle_sibling()

Fix a crash that can trigger when racing with CPU hotplug: we didn't use 
sched-domains data structures carefully enough in select_idle_cpu().

 Thanks,

Ingo

-->
Wanpeng Li (1):
  sched/fair: Fix sched domains NULL dereference in select_idle_sibling()


 kernel/sched/fair.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 502e95a6e927..8b03fb5d1b9e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5471,13 +5471,18 @@ static inline int select_idle_smt(struct task_struct 
*p, struct sched_domain *sd
  */
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int 
target)
 {
-   struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
-   u64 avg_idle = this_rq()->avg_idle;
-   u64 avg_cost = this_sd->avg_scan_cost;
+   struct sched_domain *this_sd;
+   u64 avg_cost, avg_idle = this_rq()->avg_idle;
u64 time, cost;
s64 delta;
int cpu, wrap;
 
+   this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+   if (!this_sd)
+   return -1;
+
+   avg_cost = this_sd->avg_scan_cost;
+
/*
 * Due to large variance we need a large fuzz factor; hackbench in
 * particularly is sensitive here.
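
The shape of the fix above is simply "check the RCU-protected pointer before
using it in an initializer", because the LLC sched domain can legitimately be
NULL while CPU hotplug rebuilds the domains. A hedged userspace sketch of that
pattern, with stand-in names (sd_llc_sketch and friends), not the kernel API:

#include <stdio.h>

struct sched_domain_sketch { unsigned long avg_scan_cost; };

/* stand-in for the per-CPU LLC domain pointer; NULL while hotplug is
 * rebuilding the sched domains */
static struct sched_domain_sketch *sd_llc_sketch;

static long select_idle_cpu_sketch(void)
{
	struct sched_domain_sketch *this_sd = sd_llc_sketch;

	if (!this_sd)
		return -1;	/* bail out instead of dereferencing NULL */

	return (long)this_sd->avg_scan_cost;
}

int main(void)
{
	struct sched_domain_sketch sd = { .avg_scan_cost = 512 };

	printf("no LLC domain yet: %ld\n", select_idle_cpu_sketch());
	sd_llc_sketch = &sd;
	printf("domain present:    %ld\n", select_idle_cpu_sketch());
	return 0;
}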


[GIT PULL] scheduler fix

2016-09-13 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: 135e8c9250dd5c8c9aae5984fde6f230d0cbfeaf sched/core: Fix a race 
between try_to_wake_up() and a woken up task

A try_to_wake_up() memory ordering race fix causing a busy-loop in ttwu().

 Thanks,

Ingo

-->
Balbir Singh (1):
  sched/core: Fix a race between try_to_wake_up() and a woken up task


 kernel/sched/core.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2a906f20fba7..44817c640e99 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2016,6 +2016,28 @@ try_to_wake_up(struct task_struct *p, unsigned int 
state, int wake_flags)
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
 
+   /*
+* Ensure we load p->on_rq _after_ p->state, otherwise it would
+* be possible to, falsely, observe p->on_rq == 0 and get stuck
+* in smp_cond_load_acquire() below.
+*
+* sched_ttwu_pending() try_to_wake_up()
+*   [S] p->on_rq = 1;  [L] P->state
+*   UNLOCK rq->lock  -.
+*  \
+*   +---   RMB
+* schedule()   /
+*   LOCK rq->lock-'
+*   UNLOCK rq->lock
+*
+* [task p]
+*   [S] p->state = UNINTERRUPTIBLE [L] p->on_rq
+*
+* Pairs with the UNLOCK+LOCK on rq->lock from the
+* last wakeup of our task and the schedule that got our task
+* current.
+*/
+   smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;
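
The ordering requirement described in the comment above can be sketched with
C11 atomics: the load of p->on_rq must not be satisfied before the load of
p->state, otherwise the waker can observe a stale on_rq == 0 and spin. The
snippet below is an illustrative sketch only; the field names mirror the
commit, but the fence placement is a simplification, not the kernel's code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct task_sketch {
	atomic_int state;	/* 0 == TASK_RUNNING, 1 == blocked */
	atomic_bool on_rq;	/* still queued on its old runqueue? */
};

static bool wakeup_sees_on_rq(struct task_sketch *p)
{
	if (atomic_load_explicit(&p->state, memory_order_relaxed) == 0)
		return false;	/* nothing to wake */

	/* order the on_rq load after the state load; this stands in for the
	 * smp_rmb() added by the fix, pairing with the UNLOCK+LOCK on the
	 * runqueue lock on the scheduler side */
	atomic_thread_fence(memory_order_acquire);

	return atomic_load_explicit(&p->on_rq, memory_order_relaxed);
}

int main(void)
{
	struct task_sketch t;

	atomic_init(&t.state, 1);	/* blocked, needs a wakeup */
	atomic_init(&t.on_rq, true);	/* not yet dequeued */

	printf("remote wakeup path taken: %d\n", wakeup_sees_on_rq(&t));
	return 0;
}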
 


[GIT PULL] scheduler fix

2016-07-14 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: d60585c5766e9620d5d83e2b25dc042c7bdada2c sched/core: Correct off by 
one bug in load migration calculation

Fix a CPU hotplug related corruption of the load average that got introduced in 
this merge window.

 Thanks,

Ingo

-->
Thomas Gleixner (1):
  sched/core: Correct off by one bug in load migration calculation


 kernel/sched/core.c| 6 --
 kernel/sched/loadavg.c | 8 
 kernel/sched/sched.h   | 2 +-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 51d7105f529a..97ee9ac7e97c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5394,13 +5394,15 @@ void idle_task_exit(void)
 /*
  * Since this CPU is going 'away' for a while, fold any nr_active delta
  * we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable.
+ * nr_active count is stable. We need to take the teardown thread which
+ * is calling this into account, so we hand in adjust = 1 to the load
+ * calculation.
  *
  * Also see the comment "Global load-average calculations".
  */
 static void calc_load_migrate(struct rq *rq)
 {
-   long delta = calc_load_fold_active(rq);
+   long delta = calc_load_fold_active(rq, 1);
if (delta)
atomic_long_add(delta, &calc_load_tasks);
 }
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index b0b93fd33af9..a2d6eb71f06b 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -78,11 +78,11 @@ void get_avenrun(unsigned long *loads, unsigned long 
offset, int shift)
loads[2] = (avenrun[2] + offset) << shift;
 }
 
-long calc_load_fold_active(struct rq *this_rq)
+long calc_load_fold_active(struct rq *this_rq, long adjust)
 {
long nr_active, delta = 0;
 
-   nr_active = this_rq->nr_running;
+   nr_active = this_rq->nr_running - adjust;
nr_active += (long)this_rq->nr_uninterruptible;
 
if (nr_active != this_rq->calc_load_active) {
@@ -188,7 +188,7 @@ void calc_load_enter_idle(void)
 * We're going into NOHZ mode, if there's any pending delta, fold it
 * into the pending idle delta.
 */
-   delta = calc_load_fold_active(this_rq);
+   delta = calc_load_fold_active(this_rq, 0);
if (delta) {
int idx = calc_load_write_idx();
 
@@ -389,7 +389,7 @@ void calc_global_load_tick(struct rq *this_rq)
if (time_before(jiffies, this_rq->calc_load_update))
return;
 
-   delta  = calc_load_fold_active(this_rq);
+   delta  = calc_load_fold_active(this_rq, 0);
if (delta)
atomic_long_add(delta, &calc_load_tasks);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7cbeb92a1cb9..898c0d2f18fe 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -28,7 +28,7 @@ extern unsigned long calc_load_update;
 extern atomic_long_t calc_load_tasks;
 
 extern void calc_global_load_tick(struct rq *this_rq);
-extern long calc_load_fold_active(struct rq *this_rq);
+extern long calc_load_fold_active(struct rq *this_rq, long adjust);
 
 #ifdef CONFIG_SMP
 extern void cpu_load_update_active(struct rq *this_rq);
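
The off-by-one above comes from the hotplug teardown thread still being counted
in nr_running when the outgoing CPU folds its load. A small self-contained
sketch of the folding arithmetic follows, with invented names, showing how the
adjust argument keeps that thread out of the global load average.

#include <stdio.h>

struct rq_sketch {
	long nr_running;
	long nr_uninterruptible;
	long calc_load_active;
};

/* fold this runqueue's active count into a delta for the global loadavg;
 * "adjust" subtracts threads that should not be counted, such as the
 * hotplug teardown thread calling this on the dying CPU */
static long fold_active(struct rq_sketch *rq, long adjust)
{
	long nr_active = rq->nr_running - adjust;
	long delta = 0;

	nr_active += rq->nr_uninterruptible;
	if (nr_active != rq->calc_load_active) {
		delta = nr_active - rq->calc_load_active;
		rq->calc_load_active = nr_active;
	}
	return delta;
}

int main(void)
{
	/* CPU going down: only the teardown thread is still runnable */
	struct rq_sketch rq = { .nr_running = 1, .nr_uninterruptible = 0,
				.calc_load_active = 0 };

	printf("delta without adjust = %ld (teardown thread leaks in)\n",
	       fold_active(&rq, 0));

	rq.calc_load_active = 0;	/* reset for the second run */
	printf("delta with adjust    = %ld\n", fold_active(&rq, 1));
	return 0;
}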


[GIT PULL] scheduler fix

2016-05-13 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: 53d3bc773eaa7ab1cf63585e76af7ee869d5e709 Revert "sched/fair: Fix 
fairness issue on migration"

This is a revert to fix an interactivity problem. The proper fixes for the 
problems that the reverted commit exposed are now in sched/core (consisting of 
3 
patches), but were too risky for v4.6 and will arrive in the v4.7 merge window.

 Thanks,

Ingo

-->
Ingo Molnar (1):
  Revert "sched/fair: Fix fairness issue on migration"


 kernel/sched/fair.c | 20 ++--
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40748dc8ea3e..e7dd0ec169be 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3188,25 +3188,17 @@ static inline void check_schedstat_required(void)
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-   bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
-   bool curr = cfs_rq->curr == se;
-
/*
-* If we're the current task, we must renormalise before calling
-* update_curr().
+* Update the normalized vruntime before updating min_vruntime
+* through calling update_curr().
 */
-   if (renorm && curr)
+   if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
 
-   update_curr(cfs_rq);
-
/*
-* Otherwise, renormalise after, such that we're placed at the current
-* moment in time, instead of some random moment in the past.
+* Update run-time statistics of the 'current'.
 */
-   if (renorm && !curr)
-   se->vruntime += cfs_rq->min_vruntime;
-
+   update_curr(cfs_rq);
enqueue_entity_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -3222,7 +3214,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int flags)
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
}
-   if (!curr)
+   if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
 


[GIT PULL] scheduler fix

2016-05-06 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: 2548d546d40c0014efdde88a53bf7896e917dcce nohz/full, sched/rt: Fix 
missed tick-reenabling bug in sched_can_stop_tick()

This tree contains a single fix for a nohz tick stopping bug when
mixed-policy SCHED_FIFO and SCHED_RR tasks are present on a runqueue.

 Thanks,

Ingo

-->
Peter Zijlstra (1):
  nohz/full, sched/rt: Fix missed tick-reenabling bug in 
sched_can_stop_tick()


 kernel/sched/core.c | 29 -
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b489fcac37b..d1f7149f8704 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -596,17 +596,8 @@ bool sched_can_stop_tick(struct rq *rq)
return false;
 
/*
-* FIFO realtime policy runs the highest priority task (after DEADLINE).
-* Other runnable tasks are of a lower priority. The scheduler tick
-* isn't needed.
-*/
-   fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
-   if (fifo_nr_running)
-   return true;
-
-   /*
-* Round-robin realtime tasks time slice with other tasks at the same
-* realtime priority.
+* If there are more than one RR tasks, we need the tick to effect the
+* actual RR behaviour.
 */
if (rq->rt.rr_nr_running) {
if (rq->rt.rr_nr_running == 1)
@@ -615,8 +606,20 @@ bool sched_can_stop_tick(struct rq *rq)
return false;
}
 
-   /* Normal multitasking need periodic preemption checks */
-   if (rq->cfs.nr_running > 1)
+   /*
+* If there's no RR tasks, but FIFO tasks, we can skip the tick, no
+* forced preemption between FIFO tasks.
+*/
+   fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+   if (fifo_nr_running)
+   return true;
+
+   /*
+* If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
+* if there's more than one we need the tick for involuntary
+* preemption.
+*/
+   if (rq->nr_running > 1)
return false;
 
return true;
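
The rewritten sched_can_stop_tick() logic above boils down to: more than one RR
task needs the tick, FIFO-only runqueues do not, and the final check must look
at every runnable task rather than just CFS. A compact userspace sketch of that
decision, with invented struct and function names (not the kernel code):

#include <stdbool.h>
#include <stdio.h>

struct rq_counts {
	unsigned int rt_nr_running;	/* RR + FIFO tasks */
	unsigned int rr_nr_running;	/* RR tasks only */
	unsigned int nr_running;	/* every runnable task */
};

static bool can_stop_tick(const struct rq_counts *rq)
{
	unsigned int fifo_nr_running;

	/* several RR tasks: the tick drives their round-robin slices */
	if (rq->rr_nr_running)
		return rq->rr_nr_running == 1;

	/* only FIFO realtime tasks: no forced preemption between them */
	fifo_nr_running = rq->rt_nr_running - rq->rr_nr_running;
	if (fifo_nr_running)
		return true;

	/* otherwise keep the tick only if several tasks must share the CPU */
	return rq->nr_running <= 1;
}

int main(void)
{
	struct rq_counts two_rr = { .rt_nr_running = 2, .rr_nr_running = 2,
				    .nr_running = 2 };
	struct rq_counts one_fifo = { .rt_nr_running = 1, .rr_nr_running = 0,
				      .nr_running = 1 };

	printf("two RR tasks  -> stop tick? %d\n", can_stop_tick(&two_rr));
	printf("one FIFO task -> stop tick? %d\n", can_stop_tick(&one_fifo));
	return 0;
}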


[GIT pull] scheduler fix for 4.3

2015-09-27 Thread Thomas Gleixner
Linus,

please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

A single bug fix for the scheduler:

  - Prevent dequeueing of the idle task when setting the cpus allowed
mask.

Thanks,

tglx

-->
Peter Zijlstra (1):
  sched: Fix crash trying to dequeue/enqueue the idle thread


 kernel/sched/core.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 97d276ff1edb..f0d043ec0182 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4927,7 +4927,15 @@ void init_idle(struct task_struct *idle, int cpu)
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
 
-   do_set_cpus_allowed(idle, cpumask_of(cpu));
+#ifdef CONFIG_SMP
+   /*
+* Its possible that init_idle() gets called multiple times on a task,
+* in that case do_set_cpus_allowed() will not do the right thing.
+*
+* And since this is boot we can forgo the serialization.
+*/
+   set_cpus_allowed_common(idle, cpumask_of(cpu));
+#endif
/*
 * We're having a chicken and egg problem, even though we are
 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -4944,7 +4952,7 @@ void init_idle(struct task_struct *idle, int cpu)
 
rq->curr = rq->idle = idle;
idle->on_rq = TASK_ON_RQ_QUEUED;
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
idle->on_cpu = 1;
 #endif
raw_spin_unlock(&rq->lock);
@@ -4959,7 +4967,7 @@ void init_idle(struct task_struct *idle, int cpu)
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
vtime_init_idle(idle, cpu);
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
 }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[GIT PULL] scheduler fix

2015-07-17 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: d49db342f0e276b354383b3281c5668b6b80f5c2 sched/fair: Test list head 
instead of list entry in throttle_cfs_rq()

A rq throttling fix.

 Thanks,

Ingo

-->
Cong Wang (1):
  sched/fair: Test list head instead of list entry in throttle_cfs_rq()


 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 65c8f3ebdc3c..d113c3ba8bc4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3683,7 +3683,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
-   empty = list_empty(&cfs_rq->throttled_list);
+   empty = list_empty(&cfs_b->throttled_cfs_rq);
 
/*
 * Add to the _head_ of the list, so that an already-started
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
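
The one-liner above is a classic head-versus-entry confusion: list_empty()
answers whether a list head has entries, so passing a queued element's own node
always reports "empty". Below is a minimal doubly-linked-list sketch (names
invented, not the kernel's list implementation) showing the difference:

#include <stdbool.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name)	{ &(name), &(name) }

static void list_add(struct list_head *entry, struct list_head *head)
{
	entry->next = head->next;
	entry->prev = head;
	head->next->prev = entry;
	head->next = entry;
}

static bool list_empty(const struct list_head *head)
{
	return head->next == head;
}

struct cfs_rq_sketch { struct list_head throttled_list; };

int main(void)
{
	struct list_head throttled_cfs_rq = LIST_HEAD_INIT(throttled_cfs_rq);
	struct cfs_rq_sketch a = { LIST_HEAD_INIT(a.throttled_list) };
	struct cfs_rq_sketch b = { LIST_HEAD_INIT(b.throttled_list) };

	list_add(&a.throttled_list, &throttled_cfs_rq);	/* a already throttled */

	/* b is about to be throttled; was anything queued before it? */
	printf("entry check (wrong): %d\n", list_empty(&b.throttled_list));
	printf("head check  (right): %d\n", list_empty(&throttled_cfs_rq));
	return 0;
}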


[GIT PULL] scheduler fix

2015-03-28 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: 746db9443ea57fd9c059f62c4bfbf41cf224fe13 sched: Fix RLIMIT_RTTIME 
when PI-boosting to RT

A single sched/rt corner case fix for RLIMIT_RTTIME correctness.

 Thanks,

Ingo

-->
Brian Silverman (1):
  sched: Fix RLIMIT_RTTIME when PI-boosting to RT


 kernel/sched/core.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f0f831e8a345..62671f53202a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3034,6 +3034,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
} else {
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
+   if (rt_prio(oldprio))
+   p->rt.timeout = 0;
p->sched_class = &fair_sched_class;
}
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[GIT PULL] scheduler fix

2014-01-15 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   # HEAD: 9722c2dac708e9468cc0dc30218ef76946ffbc9d sched: Calculate effective 
load even if local weight is 0

Contains a fix for a bug that manifested itself as a 3D performance 
regression.

 Thanks,

Ingo

-->
Rik van Riel (1):
  sched: Calculate effective load even if local weight is 0


 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c7395d9..e64b079 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3923,7 +3923,7 @@ static long effective_load(struct task_group *tg, int 
cpu, long wl, long wg)
 {
struct sched_entity *se = tg->se[cpu];
 
-   if (!tg->parent || !wl) /* the trivial, non-cgroup case */
+   if (!tg->parent)/* the trivial, non-cgroup case */
return wl;
 
for_each_sched_entity(se) {
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[GIT PULL] scheduler fix

2013-09-28 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   HEAD: 62d08aec6a9f4b45cc9cba1e3b2855995df133e6 Merge branch 
'context_tracking/fixes' of 
git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks into 
sched/urgent

A context tracking ARM build and functional fix.

 Thanks,

Ingo

-->
Frederic Weisbecker (1):
  arm: Fix build error with context tracking calls


 arch/arm/kernel/entry-header.S |  8 
 kernel/context_tracking.c  | 12 
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S
index de23a9b..39f89fb 100644
--- a/arch/arm/kernel/entry-header.S
+++ b/arch/arm/kernel/entry-header.S
@@ -329,10 +329,10 @@
 #ifdef CONFIG_CONTEXT_TRACKING
.if \save
stmdb   sp!, {r0-r3, ip, lr}
-   bl  user_exit
+   bl  context_tracking_user_exit
ldmia   sp!, {r0-r3, ip, lr}
.else
-   bl  user_exit
+   bl  context_tracking_user_exit
.endif
 #endif
.endm
@@ -341,10 +341,10 @@
 #ifdef CONFIG_CONTEXT_TRACKING
.if \save
stmdb   sp!, {r0-r3, ip, lr}
-   bl  user_enter
+   bl  context_tracking_user_enter
ldmia   sp!, {r0-r3, ip, lr}
.else
-   bl  user_enter
+   bl  context_tracking_user_enter
.endif
 #endif
.endm
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 247091b..859c8df 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -51,6 +51,15 @@ void context_tracking_user_enter(void)
unsigned long flags;
 
/*
+* Repeat the user_enter() check here because some archs may be calling
+* this from asm and if no CPU needs context tracking, they shouldn't
+* go further. Repeat the check here until they support the static key
+* check.
+*/
+   if (!static_key_false(&context_tracking_enabled))
+   return;
+
+   /*
 * Some contexts may involve an exception occuring in an irq,
 * leading to that nesting:
 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
@@ -151,6 +160,9 @@ void context_tracking_user_exit(void)
 {
unsigned long flags;
 
+   if (!static_key_false(&context_tracking_enabled))
+   return;
+
if (in_interrupt())
return;
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
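
The shape of the fix is a common one: a cheap enabled-check lives in the
user_enter()/user_exit() wrappers, but the ARM assembly now calls the
underlying context_tracking_user_*() functions directly, so the check has to
be repeated inside them (via the static key, as the hunks above show). A
user-space analogue (hypothetical names, a plain bool standing in for the
static key):

#include <stdio.h>
#include <stdbool.h>

static bool tracking_enabled;            /* stands in for the static key */

static void tracking_user_enter(void)    /* the exported worker */
{
	if (!tracking_enabled)           /* the guard the patch adds */
		return;
	puts("entering user-mode context tracking");
}

static inline void user_enter_wrapper(void)   /* the inline wrapper */
{
	if (tracking_enabled)
		tracking_user_enter();
}

int main(void)
{
	tracking_enabled = false;
	user_enter_wrapper();      /* wrapper path: guard applied before the call */
	tracking_user_enter();     /* direct path (the asm callers): relies on the new guard */
	return 0;
}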


[GIT PULL] scheduler fix

2013-09-12 Thread Ingo Molnar
Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   HEAD: b0cff9d88ce2f3030f73138078c5b1019f17e1cc sched: Fix load balancing 
performance regression in should_we_balance()

Performance regression fix.

 Thanks,

Ingo

-->
Joonsoo Kim (1):
  sched: Fix load balancing performance regression in should_we_balance()


 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7f0a5e6..9b3fe1c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5151,7 +5151,7 @@ static int should_we_balance(struct lb_env *env)
 * First idle cpu or the first cpu(busiest) in this sched group
 * is eligible for doing load balancing at this and above domains.
 */
-   return balance_cpu != env->dst_cpu;
+   return balance_cpu == env->dst_cpu;
 }
 
 /*
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
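
The inverted comparison meant the group's designated CPU was the only one
that refused to balance while every other CPU piled in. A sketch of the
intended election (simplified, hypothetical helpers; the real code walks the
sched_group's cpumask):

#include <stdio.h>

static int elect_balance_cpu(const int *group_cpus, const int *cpu_is_idle, int n)
{
	for (int i = 0; i < n; i++)
		if (cpu_is_idle[group_cpus[i]])
			return group_cpus[i];   /* first idle CPU of the group wins */
	return group_cpus[0];                   /* otherwise the group's first CPU */
}

static int should_we_balance_sketch(const int *group_cpus, const int *cpu_is_idle,
				    int n, int this_cpu)
{
	/* The regression: with "!=" here, every CPU except the elected one balanced. */
	return elect_balance_cpu(group_cpus, cpu_is_idle, n) == this_cpu;
}

int main(void)
{
	int group[] = { 4, 5, 6, 7 };
	int idle[8] = { 0 };

	idle[6] = 1;    /* CPU 6 is the first idle CPU of the group */
	for (int cpu = 4; cpu <= 7; cpu++)
		printf("cpu %d balances: %d\n", cpu,
		       should_we_balance_sketch(group, idle, 4, cpu));
	return 0;
}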


Re: [patch] Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-02 Thread Ingo Molnar

* Mike Galbraith  wrote:

> > Willing to write a changelog with the pointer to the actual 
> > oops that happens due to this issue?
> 
> I don't have a link, so reproduced/captured it.  With 
> systemd-sysvinit (bleh) installed, it's trivial to reproduce:
> 
> Add echo 0 > /proc/sys/kernel/sched_autogroup_enabled to /root/.bashrc
> (or wherever), boot box, type reboot, box explodes.
> 
> revert 800d4d30 sched, autogroup: Stop going ahead if autogroup is disabled
> 
> Between 8323f26ce and 800d4d30, autogroup is a wreck.  With both

Slightly decoded, for our human readers:

 8323f26ce342 ("sched: Fix race in task_group()")

:-)

> applied, all you have to do to crash a box is disable autogroup
> during boot up, then reboot.. boom, NULL pointer dereference due
> to 800d4d30 not allowing autogroup to move things, and 8323f26ce
> making that the only way to switch runqueues.
> 
> [  202.187747] BUG: unable to handle kernel NULL pointer dereference at   
> (null)
> [  202.191644] IP: [] effective_load.isra.43+0x50/0x90
> [  202.191644] PGD 220a74067 PUD 220402067 PMD 0 
> [  202.191644] Oops:  [#1] SMP 
> [  202.191644] Modules linked in: nfs nfsd fscache lockd nfs_acl auth_rpcgss 
> sunrpc exportfs bridge stp cpufreq_conservative cpufreq_ondemand 
> cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf ext3 jbd fuse 
> nls_iso8859_1 snd_hda_codec_realtek nls_cp437 snd_hda_intel vfat fat 
> snd_hda_codec e1000e sr_mod snd_hwdep cdrom snd_pcm sg snd_timer usb_storage 
> snd firewire_ohci usb_libusual firewire_core soundcore uas snd_page_alloc 
> i2c_i801 coretemp edd microcode hid_generic button crc_itu_t ipv6 autofs4 
> ext4 mbcache jbd2 crc16 usbhid hid sd_mod uhci_hcd ahci libahci libata 
> rtc_cmos ehci_hcd scsi_mod thermal fan usbcore processor usb_common
> [  202.191644] CPU 0 
> [  202.191644] Pid: 7047, comm: systemd-user-se Not tainted 3.6.8-smp #7 
> MEDIONPC MS-7502/MS-7502
> [  202.191644] RIP: 0010:[]  [] 
> effective_load.isra.43+0x50/0x90
> [  202.191644] RSP: 0018:880221ddfbd8  EFLAGS: 00010086
> [  202.191644] RAX: 0400 RBX: 88022621d880 RCX: 
> 
> [  202.191644] RDX:  RSI: 0002 RDI: 
> 880220a363a0
> [  202.191644] RBP: 880221ddfbd8 R08: 0400 R09: 
> 000115c0
> [  202.191644] R10:  R11: 0400 R12: 
> 8802214ed180
> [  202.191644] R13: 03fd R14:  R15: 
> 0003
> [  202.191644] FS:  7f174a81c7a0() GS:88022fc0() 
> knlGS:
> [  202.191644] CS:  0010 DS:  ES:  CR0: 80050033
> [  202.191644] CR2:  CR3: 000221fad000 CR4: 
> 07f0
> [  202.191644] DR0:  DR1:  DR2: 
> 
> [  202.191644] DR3:  DR6: 0ff0 DR7: 
> 0400
> [  202.191644] Process systemd-user-se (pid: 7047, threadinfo 
> 880221dde000, task 88022618b3a0)
> [  202.191644] Stack:
> [  202.191644]  880221ddfc88 81063d55 0400 
> 000115c0
> [  202.191644]  88022235c218 814ef9e8 ea00 
> 88022621d880
> [  202.191644]  880227007200 0003 0010 
> 00018f38
> [  202.191644] Call Trace:
> [  202.191644]  [] select_task_rq_fair+0x255/0x780
> [  202.191644]  [] try_to_wake_up+0x156/0x2c0
> [  202.191644]  [] wake_up_state+0xb/0x10
> [  202.191644]  [] signal_wake_up+0x28/0x40
> [  202.191644]  [] complete_signal+0x1d6/0x250
> [  202.191644]  [] __send_signal+0x170/0x310
> [  202.191644]  [] send_signal+0x40/0x80
> [  202.191644]  [] do_send_sig_info+0x47/0x90
> [  202.191644]  [] group_send_sig_info+0x4a/0x70
> [  202.191644]  [] kill_pid_info+0x3a/0x60
> [  202.191644]  [] sys_kill+0x97/0x1a0
> [  202.191644]  [] ? vfs_read+0x120/0x160
> [  202.191644]  [] ? sys_read+0x45/0x90
> [  202.191644]  [] system_call_fastpath+0x16/0x1b
> [  202.191644] Code: 49 0f af 41 50 31 d2 49 f7 f0 48 83 f8 01 48 0f 46 c6 48 
> 2b 07 48 8b bf 40 01 00 00 48 85 ff 74 3a 45 31 c0 48 8b 8f 50 01 00 00 <48> 
> 8b 11 4c 8b 89 80 00 00 00 49 89 d2 48 01 d0 45 8b 59 58 4c 
> [  202.191644] RIP  [] effective_load.isra.43+0x50/0x90
> [  202.191644]  RSP 
> [  202.191644] CR2: 
> 
> Signed-off-by: Mike Galbraith 
> Cc: Yong Zhang 
> Cc: sta...@vger.kernel.org

Thanks Mike!

Acked-by: Ingo Molnar 

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[patch] Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-02 Thread Mike Galbraith
On Sun, 2012-12-02 at 11:36 -0800, Linus Torvalds wrote: 
> On Sun, Dec 2, 2012 at 11:27 AM, Ingo Molnar  wrote:
> >
> > * Mike Galbraith  wrote:
> >
> >> On Sat, 2012-12-01 at 22:44 +0100, Ingo Molnar wrote:
> >>
> >> > Should we use some other file for that - or no file at all and
> >> > just emit a bootup printk for kernel hackers with a short
> >> > attention span?
> >>
> >> Or, whack the file and don't bother with a printk either.  If
> >> it's in your config, and your command line doesn't contain
> >> noautogroup, it's on, so the info is already present (until
> >> buffer gets full).  That makes for even fewer lines dedicated
> >> to dinky sideline feature.
> >>
> >> Or (as previously mentioned) just deprecate (or rip out) the
> >> whole thing since systemd is propagating everywhere anyway,
> >> and offers the same functionality.
> >>
> >> For 3.7, a revert of 800d4d30c8f2 would prevent the explosion
> >> when folks play with the now non-functional on/off switch
> >> (task groups are required to _always_ exist, that commit
> >> busted the autogroup assumption), so is perhaps a viable
> >> quickfix until autogroups fate is decided?
> >
> > Linus, which one would be your preference? I'm fine with the
> > first and third options - #2 that rips it all out looks like
> > a sad removal of an otherwise useful feature.
> 
> I suspect #3 is the best option right now - just revert 800d4d30c8f2.
> 
> Willing to write a changelog with the pointer to the actual oops that
> happens due to this issue?

I don't have a link, so reproduced/captured it.  With systemd-sysvinit
(bleh) installed, it's trivial to reproduce:

Add echo 0 > /proc/sys/kernel/sched_autogroup_enabled to /root/.bashrc
(or wherever), boot box, type reboot, box explodes.

revert 800d4d30 sched, autogroup: Stop going ahead if autogroup is disabled

Between 8323f26ce and 800d4d30, autogroup is a wreck.  With both
applied, all you have to do to crash a box is disable autogroup
during boot up, then reboot.. boom, NULL pointer dereference due
to 800d4d30 not allowing autogroup to move things, and 8323f26ce
making that the only way to switch runqueues.

[  202.187747] BUG: unable to handle kernel NULL pointer dereference at 
  (null)
[  202.191644] IP: [] effective_load.isra.43+0x50/0x90
[  202.191644] PGD 220a74067 PUD 220402067 PMD 0 
[  202.191644] Oops:  [#1] SMP 
[  202.191644] Modules linked in: nfs nfsd fscache lockd nfs_acl auth_rpcgss 
sunrpc exportfs bridge stp cpufreq_conservative cpufreq_ondemand 
cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf ext3 jbd fuse 
nls_iso8859_1 snd_hda_codec_realtek nls_cp437 snd_hda_intel vfat fat 
snd_hda_codec e1000e sr_mod snd_hwdep cdrom snd_pcm sg snd_timer usb_storage 
snd firewire_ohci usb_libusual firewire_core soundcore uas snd_page_alloc 
i2c_i801 coretemp edd microcode hid_generic button crc_itu_t ipv6 autofs4 ext4 
mbcache jbd2 crc16 usbhid hid sd_mod uhci_hcd ahci libahci libata rtc_cmos 
ehci_hcd scsi_mod thermal fan usbcore processor usb_common
[  202.191644] CPU 0 
[  202.191644] Pid: 7047, comm: systemd-user-se Not tainted 3.6.8-smp #7 
MEDIONPC MS-7502/MS-7502
[  202.191644] RIP: 0010:[]  [] 
effective_load.isra.43+0x50/0x90
[  202.191644] RSP: 0018:880221ddfbd8  EFLAGS: 00010086
[  202.191644] RAX: 0400 RBX: 88022621d880 RCX: 
[  202.191644] RDX:  RSI: 0002 RDI: 880220a363a0
[  202.191644] RBP: 880221ddfbd8 R08: 0400 R09: 000115c0
[  202.191644] R10:  R11: 0400 R12: 8802214ed180
[  202.191644] R13: 03fd R14:  R15: 0003
[  202.191644] FS:  7f174a81c7a0() GS:88022fc0() 
knlGS:
[  202.191644] CS:  0010 DS:  ES:  CR0: 80050033
[  202.191644] CR2:  CR3: 000221fad000 CR4: 07f0
[  202.191644] DR0:  DR1:  DR2: 
[  202.191644] DR3:  DR6: 0ff0 DR7: 0400
[  202.191644] Process systemd-user-se (pid: 7047, threadinfo 880221dde000, 
task 88022618b3a0)
[  202.191644] Stack:
[  202.191644]  880221ddfc88 81063d55 0400 
000115c0
[  202.191644]  88022235c218 814ef9e8 ea00 
88022621d880
[  202.191644]  880227007200 0003 0010 
00018f38
[  202.191644] Call Trace:
[  202.191644]  [] select_task_rq_fair+0x255/0x780
[  202.191644]  [] try_to_wake_up+0x156/0x2c0
[  202.191644]  [] wake_up_state+0xb/0x10
[  202.191644]  [] signal_wake_up+0x28/0x40
[  202.191644]  [] complete_signal+0x1d6/0x250
[  202.191644]  [] __send_signal+0x170/0x310
[  202.191644]  [] send_signal+0x40/0x80
[  202.191644]  [] do_send_sig_info+0x47/0x90
[  202.191644]  [] group_send_sig_info+0x4a/0x70
[  202.191644]  [] kill_pid_info+0x3a/0x60
[  202.191644]  [] sys_kill+0x97/0x1a0
[  

Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-02 Thread Linus Torvalds
On Sun, Dec 2, 2012 at 11:27 AM, Ingo Molnar  wrote:
>
> * Mike Galbraith  wrote:
>
>> On Sat, 2012-12-01 at 22:44 +0100, Ingo Molnar wrote:
>>
>> > Should we use some other file for that - or no file at all and
>> > just emit a bootup printk for kernel hackers with a short
>> > attention span?
>>
>> Or, whack the file and don't bother with a printk either.  If
>> it's in your config, and your command line doesn't contain
>> noautogroup, it's on, so the info is already present (until
>> buffer gets full).  That makes for even fewer lines dedicated
>> to dinky sideline feature.
>>
>> Or (as previously mentioned) just deprecate (or rip out) the
>> whole thing since systemd is propagating everywhere anyway,
>> and offers the same functionality.
>>
>> For 3.7, a revert of 800d4d30c8f2 would prevent the explosion
>> when folks play with the now non-functional on/off switch
>> (task groups are required to _always_ exist, that commit
>> busted the autogroup assumption), so is perhaps a viable
>> quickfix until autogroups fate is decided?
>
> Linus, which one would be your preference? I'm fine with the
> first and third options - #2 that rips it all out looks like
> a sad removal of an otherwise useful feature.

I suspect #3 is the best option right now - just revert 800d4d30c8f2.

Willing to write a changelog with the pointer to the actual oops that
happens due to this issue?

   Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-02 Thread Ingo Molnar

* Mike Galbraith  wrote:

> On Sat, 2012-12-01 at 22:44 +0100, Ingo Molnar wrote:
> 
> > Should we use some other file for that - or no file at all and 
> > just emit a bootup printk for kernel hackers with a short 
> > attention span?
> 
> Or, whack the file and don't bother with a printk either.  If 
> it's in your config, and your command line doesn't contain 
> noautogroup, it's on, so the info is already present (until 
> buffer gets full).  That makes for even fewer lines dedicated 
> to dinky sideline feature.
> 
> Or (as previously mentioned) just deprecate (or rip out) the 
> whole thing since systemd is propagating everywhere anyway, 
> and offers the same functionality.
> 
> For 3.7, a revert of 800d4d30c8f2 would prevent the explosion 
> when folks play with the now non-functional on/off switch 
> (task groups are required to _always_ exist, that commit 
> busted the autogroup assumption), so is perhaps a viable 
> quickfix until autogroups fate is decided?

Linus, which one would be your preference? I'm fine with the 
first and third options - #2 that rips it all out looks like
a sad removal of an otherwise useful feature.

( The fourth option would be to fix the dynamic knobs - there's 
  no patch for that yet. )

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-02 Thread Mike Galbraith
On Sat, 2012-12-01 at 22:44 +0100, Ingo Molnar wrote:

> Should we use some other file for that - or no file at all and 
> just emit a bootup printk for kernel hackers with a short 
> attention span?

Or, whack the file and don't bother with a printk either.  If it's in
your config, and your command line doesn't contain noautogroup, it's on,
so the info is already present (until buffer gets full).  That makes for
even fewer lines dedicated to dinky sideline feature.

Or (as previously mentioned) just deprecate (or rip out) the whole
thing since systemd is propagating everywhere anyway, and offers the
same functionality.

For 3.7, a revert of 800d4d30c8f2 would prevent the explosion when folks
play with the now non-functional on/off switch (task groups are required
to _always_ exist, that commit busted the autogroup assumption), so is
perhaps a viable quickfix until autogroups fate is decided?

-Mike

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
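
The constraint Mike keeps pointing at (a task must always end up attached to
some task group) is what the fix in the [RFC] pull further down preserves:
the enabled/disabled check moves into the "which group does this task want"
test, while the move itself always runs. A small user-space sketch of that
shape (hypothetical names, not the kernel code):

#include <stdio.h>
#include <stdbool.h>

struct group { const char *name; };

static struct group root_group     = { "root task group" };
static struct group nice_autogroup = { "autogroup" };

static bool autogroup_enabled;    /* the sysctl in the real code */

/* Decision point: with the fix, this is where the toggle is consulted. */
static struct group *wanted_group(struct group *autogrp)
{
	if (!autogroup_enabled)
		return &root_group;
	return autogrp;
}

/* Move point: always re-attach; never skipped, so no task is left behind. */
static void move_task(struct group **task_group, struct group *autogrp)
{
	*task_group = wanted_group(autogrp);
}

int main(void)
{
	struct group *tg = NULL;               /* not attached yet */

	autogroup_enabled = false;
	move_task(&tg, &nice_autogroup);
	printf("task ends up in: %s\n", tg->name);   /* "root task group" */
	return 0;
}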


Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-01 Thread Mike Galbraith
On Sat, 2012-12-01 at 14:03 -0800, Linus Torvalds wrote: 
> On Sat, Dec 1, 2012 at 1:44 PM, Ingo Molnar  wrote:
> >
> > You are not missing anything. That flag is my fault not Mike's:
> > I booted the initial version of that patch but was unsure
> > whether autogroups was enabled - it's a pretty transparent
> > feature. So I figured that having that flag (but readonly) would
> > give us this information definitely.
> 
> So what's the advantage of it being read-only at all?
> 
> Since the flag is clearly *used*, make it read-write, and then all my
> objections go away (except for a slight worry that the dropping of
> /proc//autogroup_nice or whatever it is could break some odd
> system app, but I don't worry *too* much about that).
> 
> Disabling autogroup is clearly something people might want, since the
> code tests for it. So removing the flag entirely seems wrong too. But
> if it exists, it should be writable. No?

No, because turning autogroup off at runtime is what now makes boom.

With Peter's race fix in place, lazy movement (was noop on UP) is gone,
mandating that you either walk the box, moving all tasks when you flick
the switch, or you remove the ability to flick the switch other than
'off' at boot time.  You didn't like the original instant on/off switch,
and I didn't like the thought of making autogroup bigger to fix the
explosion either, so I went for the option of ripping out the switch and
the problematic /proc stuff that really should never have existed in the
first place.

-Mike

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-01 Thread Linus Torvalds
On Sat, Dec 1, 2012 at 1:44 PM, Ingo Molnar  wrote:
>
> You are not missing anything. That flag is my fault not Mike's:
> I booted the initial version of that patch but was unsure
> whether autogroups was enabled - it's a pretty transparent
> feature. So I figured that having that flag (but readonly) would
> give us this information definitely.

So what's the advantage of it being read-only at all?

Since the flag is clearly *used*, make it read-write, and then all my
objections go away (except for a slight worry that the dropping of
/proc/<pid>/autogroup_nice or whatever it is could break some odd
system app, but I don't worry *too* much about that).

Disabling autogroup is clearly something people might want, since the
code tests for it. So removing the flag entirely seems wrong too. But
if it exists, it should be writable. No?

 Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-01 Thread Ingo Molnar

* Ingo Molnar  wrote:

> You are not missing anything. That flag is my fault not 
> Mike's: I booted the initial version of that patch but was 
> unsure whether autogroups was enabled - it's a pretty 
> transparent feature. So I figured that having that flag (but 
> readonly) would give us this information definitely.

The other reason was that the original version of the patch also 
added a boot parameter - to enable/disable autogroups from the 
boot command line. With *that* configuration twist it made sense 
to present this information somewhere in /proc as well.

But then we got rid of the boot parameter to simplify the patch 
- which further reduced the sense of the 
/proc/sys/kernel/sched_autogroup_enabled flag - which now can 
only ever be 1.

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-01 Thread Ingo Molnar

* Linus Torvalds  wrote:

> On Sat, Dec 1, 2012 at 3:16 AM, Ingo Molnar  wrote:
> >
> > Please [RFC] pull the latest sched-urgent-for-linus git tree
> > from:
> 
> No. That patch is braindead. I wouldn't pull it even if it 
> wasn't this late.
> 
> Why the hell leave a read-only 'sched_autogroup_enabled' proc 
> file?
>
> What the f*ck is the point? It looks like the flag still 
> exists (we test it), but now there's no point to it, since you 
> can't change it.
> 
> What am I missing?

You are not missing anything. That flag is my fault not Mike's: 
I booted the initial version of that patch but was unsure 
whether autogroups was enabled - it's a pretty transparent 
feature. So I figured that having that flag (but readonly) would 
give us this information definitely.

So I suggested to Mike to keep that flag so that user-space is 
informed that autogroups is enabled. It seemed like a cute 
usability twist at that time, and there's existing precedent for 
it in /proc, but now I'm not so sure anymore...

Should we use some other file for that - or no file at all and 
just emit a bootup printk for kernel hackers with a short 
attention span?

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-01 Thread Linus Torvalds
On Sat, Dec 1, 2012 at 3:16 AM, Ingo Molnar  wrote:
>
> Please [RFC] pull the latest sched-urgent-for-linus git tree
> from:

No. That patch is braindead. I wouldn't pull it even if it wasn't this late.

Why the hell leave a read-only 'sched_autogroup_enabled' proc file?
What the f*ck is the point? It looks like the flag still exists (we
test it), but now there's no point to it, since you can't change it.

What am I missing?

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC GIT PULL] scheduler fix for autogroups

2012-12-01 Thread Ingo Molnar
Linus,

Please [RFC] pull the latest sched-urgent-for-linus git tree 
from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   HEAD: 5258f386ea4e8454bc801fb443e8a4217da1947c sched/autogroup: Fix crash on 
reboot when autogroup is disabled

Larger and scarier than what I'd like to have so I marked it 
[RFC], but the autogroup fix looks important enough - it's 
enabled in a number of distros.

[ The other reason for the RFC: unfortunately the new feature 
  flag change "sched: Add WAKEUP_PREEMPTION feature flag, on by 
  default" got added weeks ago and got stuck below the autogroup 
  fix - I can rebase and untangle it if it's a problem. (It is 
  not expected to cause any change in behavior or problems.) ]

Thanks,

Ingo

-->
Ingo Molnar (1):
  sched: Add WAKEUP_PREEMPTION feature flag, on by default

Mike Galbraith (1):
  sched/autogroup: Fix crash on reboot when autogroup is disabled


 fs/proc/base.c| 78 ---
 kernel/sched/auto_group.c | 68 +++--
 kernel/sched/auto_group.h |  9 +-
 kernel/sched/fair.c   |  2 +-
 kernel/sched/features.h   |  5 +++
 kernel/sysctl.c   |  6 ++--
 6 files changed, 20 insertions(+), 148 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1b6c84c..bb1d962 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1271,81 +1271,6 @@ static const struct file_operations 
proc_pid_sched_operations = {
 
 #endif
 
-#ifdef CONFIG_SCHED_AUTOGROUP
-/*
- * Print out autogroup related information:
- */
-static int sched_autogroup_show(struct seq_file *m, void *v)
-{
-   struct inode *inode = m->private;
-   struct task_struct *p;
-
-   p = get_proc_task(inode);
-   if (!p)
-   return -ESRCH;
-   proc_sched_autogroup_show_task(p, m);
-
-   put_task_struct(p);
-
-   return 0;
-}
-
-static ssize_t
-sched_autogroup_write(struct file *file, const char __user *buf,
-   size_t count, loff_t *offset)
-{
-   struct inode *inode = file->f_path.dentry->d_inode;
-   struct task_struct *p;
-   char buffer[PROC_NUMBUF];
-   int nice;
-   int err;
-
-   memset(buffer, 0, sizeof(buffer));
-   if (count > sizeof(buffer) - 1)
-   count = sizeof(buffer) - 1;
-   if (copy_from_user(buffer, buf, count))
-   return -EFAULT;
-
-   err = kstrtoint(strstrip(buffer), 0, &nice);
-   if (err < 0)
-   return err;
-
-   p = get_proc_task(inode);
-   if (!p)
-   return -ESRCH;
-
-   err = proc_sched_autogroup_set_nice(p, nice);
-   if (err)
-   count = err;
-
-   put_task_struct(p);
-
-   return count;
-}
-
-static int sched_autogroup_open(struct inode *inode, struct file *filp)
-{
-   int ret;
-
-   ret = single_open(filp, sched_autogroup_show, NULL);
-   if (!ret) {
-   struct seq_file *m = filp->private_data;
-
-   m->private = inode;
-   }
-   return ret;
-}
-
-static const struct file_operations proc_pid_sched_autogroup_operations = {
-   .open   = sched_autogroup_open,
-   .read   = seq_read,
-   .write  = sched_autogroup_write,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
-
 static ssize_t comm_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
 {
@@ -3036,9 +2961,6 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_SCHED_DEBUG
REG("sched",  S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
-#ifdef CONFIG_SCHED_AUTOGROUP
-   REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
-#endif
REG("comm",  S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
INF("syscall",S_IRUGO, proc_pid_syscall),
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21..0f1bacb 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -110,6 +110,9 @@ out_fail:
 
 bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
 {
+   if (!sysctl_sched_autogroup_enabled)
+   return false;
+
	if (tg != &root_task_group)
return false;
 
@@ -143,15 +146,11 @@ autogroup_move_group(struct task_struct *p, struct 
autogroup *ag)
 
p->signal->autogroup = autogroup_kref_get(ag);
 
-   if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
-   goto out;
-
t = p;
do {
sched_move_task(t);
} while_each_thread(p, t);
 
-out:
	unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
 }
@@ -159,8 +158,11 @@ out:
 /* Allocates GFP_KERNEL, cannot be called under any spinlock */
 void sched_autogroup_create_attach(struct task_struct 

[RFC GIT PULL] scheduler fix for autogroups

2012-12-01 Thread Ingo Molnar
Linus,

Please [RFC] pull the latest sched-urgent-for-linus git tree 
from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-urgent-for-linus

   HEAD: 5258f386ea4e8454bc801fb443e8a4217da1947c sched/autogroup: Fix crash on 
reboot when autogroup is disabled

Larger and scarier than what I'd like to have so I marked it 
[RFC], but the autogroup fix looks important enough - it's 
enabled in a number of distros.

[ The other reason for the RFC: unfortunately the new feature 
  flag change sched: Add WAKEUP_PREEMPTION feature flag, on by 
  default got added weeks ago and got stuck below the autogroup 
  fix - I can rebase and untangle it if it's a problem. (It is 
  not expected to cause any change in behavior or problems.) ]

Thanks,

Ingo

--
Ingo Molnar (1):
  sched: Add WAKEUP_PREEMPTION feature flag, on by default

Mike Galbraith (1):
  sched/autogroup: Fix crash on reboot when autogroup is disabled


 fs/proc/base.c| 78 ---
 kernel/sched/auto_group.c | 68 +++--
 kernel/sched/auto_group.h |  9 +-
 kernel/sched/fair.c   |  2 +-
 kernel/sched/features.h   |  5 +++
 kernel/sysctl.c   |  6 ++--
 6 files changed, 20 insertions(+), 148 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1b6c84c..bb1d962 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1271,81 +1271,6 @@ static const struct file_operations 
proc_pid_sched_operations = {
 
 #endif
 
-#ifdef CONFIG_SCHED_AUTOGROUP
-/*
- * Print out autogroup related information:
- */
-static int sched_autogroup_show(struct seq_file *m, void *v)
-{
-   struct inode *inode = m-private;
-   struct task_struct *p;
-
-   p = get_proc_task(inode);
-   if (!p)
-   return -ESRCH;
-   proc_sched_autogroup_show_task(p, m);
-
-   put_task_struct(p);
-
-   return 0;
-}
-
-static ssize_t
-sched_autogroup_write(struct file *file, const char __user *buf,
-   size_t count, loff_t *offset)
-{
-   struct inode *inode = file-f_path.dentry-d_inode;
-   struct task_struct *p;
-   char buffer[PROC_NUMBUF];
-   int nice;
-   int err;
-
-   memset(buffer, 0, sizeof(buffer));
-   if (count  sizeof(buffer) - 1)
-   count = sizeof(buffer) - 1;
-   if (copy_from_user(buffer, buf, count))
-   return -EFAULT;
-
-   err = kstrtoint(strstrip(buffer), 0, nice);
-   if (err  0)
-   return err;
-
-   p = get_proc_task(inode);
-   if (!p)
-   return -ESRCH;
-
-   err = proc_sched_autogroup_set_nice(p, nice);
-   if (err)
-   count = err;
-
-   put_task_struct(p);
-
-   return count;
-}
-
-static int sched_autogroup_open(struct inode *inode, struct file *filp)
-{
-   int ret;
-
-   ret = single_open(filp, sched_autogroup_show, NULL);
-   if (!ret) {
-   struct seq_file *m = filp-private_data;
-
-   m-private = inode;
-   }
-   return ret;
-}
-
-static const struct file_operations proc_pid_sched_autogroup_operations = {
-   .open   = sched_autogroup_open,
-   .read   = seq_read,
-   .write  = sched_autogroup_write,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
-
 static ssize_t comm_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
 {
@@ -3036,9 +2961,6 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_SCHED_DEBUG
REG(sched,  S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
-#ifdef CONFIG_SCHED_AUTOGROUP
-   REG(autogroup,  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
-#endif
REG(comm,  S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
INF(syscall,S_IRUGO, proc_pid_syscall),
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21..0f1bacb 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -110,6 +110,9 @@ out_fail:
 
 bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
 {
+   if (!sysctl_sched_autogroup_enabled)
+   return false;
+
if (tg != root_task_group)
return false;
 
@@ -143,15 +146,11 @@ autogroup_move_group(struct task_struct *p, struct 
autogroup *ag)
 
p-signal-autogroup = autogroup_kref_get(ag);
 
-   if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
-   goto out;
-
t = p;
do {
sched_move_task(t);
} while_each_thread(p, t);
 
-out:
unlock_task_sighand(p, flags);
autogroup_kref_put(prev);
 }
@@ -159,8 +158,11 @@ out:
 /* Allocates GFP_KERNEL, cannot be called under any spinlock */
 void sched_autogroup_create_attach(struct task_struct *p)
 {
-  

Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-01 Thread Linus Torvalds
On Sat, Dec 1, 2012 at 3:16 AM, Ingo Molnar mi...@kernel.org wrote:

 Please [RFC] pull the latest sched-urgent-for-linus git tree
 from:

No. That patch is braindead. I wouldn't pull it even if it wasn't this late.

Why the hell leave a read-only 'sched_autogroup_enabled' proc file?
What the f*ck is the point? It looks like the flag still exists (we
test it), but now there's no point to it, since you can't change it.

What am I missing?

Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-01 Thread Ingo Molnar

* Linus Torvalds torva...@linux-foundation.org wrote:

 On Sat, Dec 1, 2012 at 3:16 AM, Ingo Molnar mi...@kernel.org wrote:
 
  Please [RFC] pull the latest sched-urgent-for-linus git tree
  from:
 
 No. That patch is braindead. I wouldn't pull it even if it 
 wasn't this late.
 
 Why the hell leave a read-only 'sched_autogroup_enabled' proc 
 file?

 What the f*ck is the point? It looks like the flag still 
 exists (we test it), but now there's no point to it, since you 
 can't change it.
 
 What am I missing?

You are not missing anything. That flag is my fault not Mike's: 
I booted the initial version of that patch but was unsure 
whether autogroups was enabled - it's a pretty transparent 
feature. So I figured that having that flag (but readonly) would 
give us this information definitely.

So I suggested to Mike to keep that flag so that user-space is 
informed that autogroups is enabled. It seemed like a cute 
usability twist at that time, and there's existing precedent for 
it in /proc, but now I'm not so sure anymore...

Should we use some other file for that - or no file at all and 
just emit a bootup printk for kernel hackers with a short 
attention span?

Thanks,

Ingo
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-01 Thread Ingo Molnar

* Ingo Molnar mi...@kernel.org wrote:

 You are not missing anything. That flag is my fault not 
 Mike's: I booted the initial version of that patch but was 
 unsure whether autogroups was enabled - it's a pretty 
 transparent feature. So I figured that having that flag (but 
 readonly) would give us this information definitely.

The other reason was that the original version of the patch also 
added a boot parameter - to enable/disable autogroups from the 
boot command line. With *that* configuration twist it made sense 
to present this information somewhere in /proc as well.

But then we got rid of the boot parameter to simplify the patch 
- which further reduced the sense of the 
/proc/sys/kernel/sched_autogroup_enabled flag - which now can 
only ever be 1.

Thanks,

Ingo
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-01 Thread Linus Torvalds
On Sat, Dec 1, 2012 at 1:44 PM, Ingo Molnar mi...@kernel.org wrote:

 You are not missing anything. That flag is my fault not Mike's:
 I booted the initial version of that patch but was unsure
 whether autogroups was enabled - it's a pretty transparent
 feature. So I figured that having that flag (but readonly) would
 give us this information definitely.

So what's the advantage of it being read-only at all?

Since the flag is clearly *used*, make it read-write, and then all my
objections go away (except for a slight worry that the dropping of
/proc/pid/autogroup_nice or whatever it is could break some odd
system app, but I don't worry *too* much about that).

Disabling autogroup is clearly something people might want, since the
code tests for it. So removing the flag entirely seems wrong too. But
if it exists, it should be writable. No?

 Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC GIT PULL] scheduler fix for autogroups

2012-12-01 Thread Mike Galbraith
On Sat, 2012-12-01 at 14:03 -0800, Linus Torvalds wrote:
> On Sat, Dec 1, 2012 at 1:44 PM, Ingo Molnar <mi...@kernel.org> wrote:
>
> > You are not missing anything. That flag is my fault not Mike's:
> > I booted the initial version of that patch but was unsure
> > whether autogroups was enabled - it's a pretty transparent
> > feature. So I figured that having that flag (but readonly) would
> > give us this information definitely.
>
> So what's the advantage of it being read-only at all?
>
> Since the flag is clearly *used*, make it read-write, and then all my
> objections go away (except for a slight worry that the dropping of
> /proc/pid/autogroup_nice or whatever it is could break some odd
> system app, but I don't worry *too* much about that).
>
> Disabling autogroup is clearly something people might want, since the
> code tests for it. So removing the flag entirely seems wrong too. But
> if it exists, it should be writable. No?

No, because turning autogroup off at runtime is what now makes boom.

With Peter's race fix in place, lazy movement (was noop on UP) is gone,
mandating that you either walk the box, moving all tasks when you flick
the switch, or you remove the ability to flick the switch other than
'off' at boot time.  You didn't like the original instant on/off switch,
and I didn't like the thought of making autogroup bigger to fix the
explosion either, so I went for the option of ripping out the switch and the
problematic /proc stuff that really should never have existed in the first place.

-Mike
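
For illustration, the "walk the box" alternative Mike mentions would amount to
something like this hedged sketch, assuming the existing sched_move_task()
helper re-evaluates a task's group placement (this is not code from any posted
patch):

#include <linux/sched.h>

/*
 * Sketch only: when the autogroup switch flips at runtime, every task
 * must be re-parented to its new task_group; otherwise already-running
 * tasks keep their old placement and the group weight accounting breaks.
 */
static void autogroup_move_all_tasks(void)
{
	struct task_struct *g, *t;

	read_lock(&tasklist_lock);
	do_each_thread(g, t) {
		sched_move_task(t);
	} while_each_thread(g, t);
	read_unlock(&tasklist_lock);
}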



[git pull] scheduler fix

2008-01-22 Thread Ingo Molnar

Linus, please pull the latest scheduler-fixes git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched.git

it's a fix for a late-breaking bug: if the root user / admin sets the 
new /sys/uids/*/cpu_share tunable too low from the default 1024 then we 
can crash/hang. [ in sched-devel.git we've already had MIN_GROUP_SHARES 
for a long time to enforce this limit - but it was not backported. ]

Ingo

-->
Ingo Molnar (1):
  sched: group scheduler, set uid share fix

 sched.c |    8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index 37cf07a..e76b11c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7153,6 +7153,14 @@ int sched_group_set_shares(struct task_group *tg, 
unsigned long shares)
 {
int i;
 
+   /*
+* A weight of 0 or 1 can cause arithmetics problems.
+* (The default weight is 1024 - so there's no practical
+*  limitation from this.)
+*/
+   if (shares < 2)
+   shares = 2;
+
	spin_lock(&tg->lock);
if (tg->shares == shares)
goto done;
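For illustration, the "arithmetics problems" the clamp above guards against:
the group share feeds divisions (and reciprocal tables) in the fair scheduler's
load math, so a share of 0 divides by zero and a share of 1 degenerates after
scaling. A hedged user-space sketch of the failure mode, not the kernel's
actual load calculation:

#include <stdio.h>

#define NICE_0_LOAD 1024UL

/*
 * Sketch of a scaling step of the form delta * NICE_0_LOAD / weight,
 * as used conceptually by the fair scheduler: weight == 0 traps with a
 * division by zero, and tiny weights blow up the scaled result.
 */
static unsigned long scale_delta(unsigned long delta, unsigned long weight)
{
	return delta * NICE_0_LOAD / weight;
}

int main(void)
{
	printf("%lu\n", scale_delta(1000, 1024));	/* default share: 1000 */
	printf("%lu\n", scale_delta(1000, 2));		/* clamped minimum: 512000 */
	/* scale_delta(1000, 0) would crash with a division by zero */
	return 0;
}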


Re: [git pull] scheduler fix

2007-11-01 Thread Ingo Molnar

* Guillaume Chazarain <[EMAIL PROTECTED]> wrote:

> 2007/10/30, Ingo Molnar <[EMAIL PROTECTED]>:
> >  fs/proc/array.c   |3 ++-
> >  include/linux/sched.h |2 +-
> >  kernel/fork.c |1 +
> >  3 files changed, 4 insertions(+), 2 deletions(-)
> 
> Hello Ingo,
> 
> do you think it would be possible to include the patch in your git 
> pull request emails? Especially when the patch is small like this one.

yeah, will do that in the future.

Ingo


Re: [git pull] scheduler fix

2007-10-30 Thread Guillaume Chazarain
2007/10/30, Ingo Molnar <[EMAIL PROTECTED]>:
>  fs/proc/array.c   |3 ++-
>  include/linux/sched.h |2 +-
>  kernel/fork.c |1 +
>  3 files changed, 4 insertions(+), 2 deletions(-)

Hello Ingo,

do you think it would be possible to include the patch in your git
pull request emails? Especially when the patch is small like this one.

Jeff showed a suitable script in
http://www.uwsg.iu.edu/hypermail/linux/kernel/0710.1/2218.html

Thanks.

-- 
Guillaume


[git pull] scheduler fix

2007-10-29 Thread Ingo Molnar

Linus, this is a followup git pull request for a single fix:

   git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched.git

Frans Pop has tested a fix from Balbir Singh that (finally) resolves the 
procps CPU accounting bug. The fix you pulled earlier today was correct 
but it solved only half of the problem.

Ingo

-->
Balbir Singh (1):
  sched: fix /proc/<pid>/stat stime/utime monotonicity, part 2

 fs/proc/array.c   |3 ++-
 include/linux/sched.h |2 +-
 kernel/fork.c |1 +
 3 files changed, 4 insertions(+), 2 deletions(-)
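
For illustration, the monotonicity bug is that the utime/stime values derived
for /proc/<pid>/stat could briefly go backwards between reads; the usual cure
is to remember the last value reported for a task and never report less. A
hedged sketch of that idea, not the actual fs/proc/array.c change:

#include <linux/kernel.h>

/* Sketch only: cache the last values handed to user space and clamp. */
struct cputime_cache {
	unsigned long prev_utime;
	unsigned long prev_stime;
};

static void report_cputime(struct cputime_cache *c,
			   unsigned long utime, unsigned long stime,
			   unsigned long *out_utime, unsigned long *out_stime)
{
	/* Never let the reported values move backwards. */
	c->prev_utime = max(c->prev_utime, utime);
	c->prev_stime = max(c->prev_stime, stime);

	*out_utime = c->prev_utime;
	*out_stime = c->prev_stime;
}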

