Great, __schedule() doesn't need to pay any attention to TASK_DEAD now.
on 09/14/2016 12:37 AM, Peter Zijlstra wrote: > On Tue, Sep 13, 2016 at 06:14:27PM +0200, Oleg Nesterov wrote: > >> Me too, and I failed to find something which could be broken... So >> perhaps should make it nop and investigate the new bug reports after >> that. > > Works for me :-) > >> >> Hmm. And preempt_enable_no_resched_notrace() under TASK_DEAD in >> __schedule() should be removed it seems, do_exit() can call __schedule() >> directly. > > > something like so? > > --- > > include/linux/kernel.h | 2 +- > include/linux/sched.h | 2 ++ > kernel/exit.c | 11 ++--------- > kernel/sched/core.c | 23 ++++++++++++----------- > 4 files changed, 17 insertions(+), 21 deletions(-) > > diff --git a/include/linux/kernel.h b/include/linux/kernel.h > index d96a6118d26a..e5bd9cdd2e24 100644 > --- a/include/linux/kernel.h > +++ b/include/linux/kernel.h > @@ -266,7 +266,7 @@ extern void oops_enter(void); > extern void oops_exit(void); > void print_oops_end_marker(void); > extern int oops_may_print(void); > -void do_exit(long error_code) > +void __noreturn do_exit(long error_code) > __noreturn; > void complete_and_exit(struct completion *, long) > __noreturn; > diff --git a/include/linux/sched.h b/include/linux/sched.h > index eb64fcd89e68..b0c818a05b2e 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -448,6 +448,8 @@ static inline void io_schedule(void) > io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); > } > > +void __noreturn do_task_dead(void); > + > struct nsproxy; > struct user_namespace; > > diff --git a/kernel/exit.c b/kernel/exit.c > index 091a78be3b09..d4c12692f766 100644 > --- a/kernel/exit.c > +++ b/kernel/exit.c > @@ -725,7 +725,7 @@ static void check_stack_usage(void) > static inline void check_stack_usage(void) {} > #endif > > -void do_exit(long code) > +void __noreturn do_exit(long code) > { > struct task_struct *tsk = current; > int group_dead; > @@ -897,14 +897,7 @@ void do_exit(long code) > smp_mb(); > raw_spin_unlock_wait(&tsk->pi_lock); > > - 
/* causes final put_task_struct in finish_task_switch(). */ > - tsk->state = TASK_DEAD; > - tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ > - schedule(); > - BUG(); > - /* Avoid "noreturn function does return". */ > - for (;;) > - cpu_relax(); /* For when BUG is null */ > + do_task_dead(); > } > EXPORT_SYMBOL_GPL(do_exit); > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index a0086a5fc008..6034f269000f 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -3327,17 +3327,6 @@ static void __sched notrace __schedule(bool preempt) > rq = cpu_rq(cpu); > prev = rq->curr; > > - /* > - * do_exit() calls schedule() with preemption disabled as an exception; > - * however we must fix that up, otherwise the next task will see an > - * inconsistent (higher) preempt count. > - * > - * It also avoids the below schedule_debug() test from complaining > - * about this. > - */ > - if (unlikely(prev->state == TASK_DEAD)) > - preempt_enable_no_resched_notrace(); > - > schedule_debug(prev); > > if (sched_feat(HRTICK)) > @@ -3404,6 +3393,18 @@ static void __sched notrace __schedule(bool preempt) > balance_callback(rq); > } > > +void __noreturn do_task_dead(void) > +{ > + /* causes final put_task_struct in finish_task_switch(). */ > + __set_current_state(TASK_DEAD); > + current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ > + __schedule(false); > + BUG(); > + /* Avoid "noreturn function does return". */ > + for (;;) > + cpu_relax(); /* For when BUG is null */ > +} > + > static inline void sched_submit_work(struct task_struct *tsk) > { > if (!tsk->state || tsk_is_pi_blocked(tsk)) >