Commit-ID:  9af6528ee9b682df7f29dbee86fbba0b67eab944
Gitweb:     http://git.kernel.org/tip/9af6528ee9b682df7f29dbee86fbba0b67eab944
Author:     Peter Zijlstra <pet...@infradead.org>
AuthorDate: Tue, 13 Sep 2016 18:37:29 +0200
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Thu, 22 Sep 2016 14:53:45 +0200

sched/core: Optimize __schedule()

Oleg noted that by making do_exit() use __schedule() for the TASK_DEAD
context switch, we can avoid the TASK_DEAD special case currently in
__schedule() because that avoids the extra preempt_disable() from
schedule().

In order to facilitate this, create a do_task_dead() helper which we
place in the scheduler code, such that it can access __schedule().

Also add some __noreturn annotations to the functions, there's no
coming back from do_exit().

Suggested-by: Oleg Nesterov <o...@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Cc: Cheng Chao <cs.os.ker...@gmail.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: a...@linux-foundation.org
Cc: ch...@chris-wilson.co.uk
Cc: t...@kernel.org
Link: 
http://lkml.kernel.org/r/20160913163729.gb5...@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 include/linux/kernel.h |  9 +++------
 include/linux/sched.h  |  2 ++
 kernel/exit.c          | 26 ++------------------------
 kernel/sched/core.c    | 38 +++++++++++++++++++++++++++-----------
 4 files changed, 34 insertions(+), 41 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d96a611..74fd6f0 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -259,17 +259,14 @@ static inline void might_fault(void) { }
 extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(int state);
 __printf(1, 2)
-void panic(const char *fmt, ...)
-       __noreturn __cold;
+void panic(const char *fmt, ...) __noreturn __cold;
 void nmi_panic(struct pt_regs *regs, const char *msg);
 extern void oops_enter(void);
 extern void oops_exit(void);
 void print_oops_end_marker(void);
 extern int oops_may_print(void);
-void do_exit(long error_code)
-       __noreturn;
-void complete_and_exit(struct completion *, long)
-       __noreturn;
+void do_exit(long error_code) __noreturn;
+void complete_and_exit(struct completion *, long) __noreturn;
 
 /* Internal, do not use. */
 int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long 
*res);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d750240..f00ee8e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -448,6 +448,8 @@ static inline void io_schedule(void)
        io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
 }
 
+void __noreturn do_task_dead(void);
+
 struct nsproxy;
 struct user_namespace;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 091a78b..1e1d913 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -725,7 +725,7 @@ static void check_stack_usage(void)
 static inline void check_stack_usage(void) {}
 #endif
 
-void do_exit(long code)
+void __noreturn do_exit(long code)
 {
        struct task_struct *tsk = current;
        int group_dead;
@@ -882,29 +882,7 @@ void do_exit(long code)
        exit_rcu();
        TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
 
-       /*
-        * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
-        * when the following two conditions become true.
-        *   - There is race condition of mmap_sem (It is acquired by
-        *     exit_mm()), and
-        *   - SMI occurs before setting TASK_RUNINNG.
-        *     (or hypervisor of virtual machine switches to other guest)
-        *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
-        *
-        * To avoid it, we have to wait for releasing tsk->pi_lock which
-        * is held by try_to_wake_up()
-        */
-       smp_mb();
-       raw_spin_unlock_wait(&tsk->pi_lock);
-
-       /* causes final put_task_struct in finish_task_switch(). */
-       tsk->state = TASK_DEAD;
-       tsk->flags |= PF_NOFREEZE;      /* tell freezer to ignore us */
-       schedule();
-       BUG();
-       /* Avoid "noreturn function does return".  */
-       for (;;)
-               cpu_relax();    /* For when BUG is null */
+       do_task_dead();
 }
 EXPORT_SYMBOL_GPL(do_exit);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ff4e3c0..b2ec53c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3331,17 +3331,6 @@ static void __sched notrace __schedule(bool preempt)
        rq = cpu_rq(cpu);
        prev = rq->curr;
 
-       /*
-        * do_exit() calls schedule() with preemption disabled as an exception;
-        * however we must fix that up, otherwise the next task will see an
-        * inconsistent (higher) preempt count.
-        *
-        * It also avoids the below schedule_debug() test from complaining
-        * about this.
-        */
-       if (unlikely(prev->state == TASK_DEAD))
-               preempt_enable_no_resched_notrace();
-
        schedule_debug(prev);
 
        if (sched_feat(HRTICK))
@@ -3409,6 +3398,33 @@ static void __sched notrace __schedule(bool preempt)
 }
 STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
 
+void __noreturn do_task_dead(void)
+{
+       /*
+        * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+        * when the following two conditions become true.
+        *   - There is race condition of mmap_sem (It is acquired by
+        *     exit_mm()), and
+        *   - SMI occurs before setting TASK_RUNINNG.
+        *     (or hypervisor of virtual machine switches to other guest)
+        *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
+        *
+        * To avoid it, we have to wait for releasing tsk->pi_lock which
+        * is held by try_to_wake_up()
+        */
+       smp_mb();
+       raw_spin_unlock_wait(&current->pi_lock);
+
+       /* causes final put_task_struct in finish_task_switch(). */
+       __set_current_state(TASK_DEAD);
+       current->flags |= PF_NOFREEZE;  /* tell freezer to ignore us */
+       __schedule(false);
+       BUG();
+       /* Avoid "noreturn function does return".  */
+       for (;;)
+               cpu_relax();    /* For when BUG is null */
+}
+
 static inline void sched_submit_work(struct task_struct *tsk)
 {
        if (!tsk->state || tsk_is_pi_blocked(tsk))

Reply via email to