Re: + mm-oom-avoid-printk-iteration-under-rcu.patch added to -mm tree

2019-07-24 Thread Michal Hocko
[Oops, I've cut the Cc list somehow, sorry about that]

On Wed 24-07-19 08:27:27, Michal Hocko wrote:
> Andrew,
> I've had some concerns with regard to this patch - especially the additional
> complexity - and I have to say I am not convinced that this is really
> needed. Our past experience in this area suggests that trickier code
> leads to different corner cases. So I am really reluctant to add more
> complexity without any real-world reports.
> 
> On Tue 23-07-19 16:14:29, Andrew Morton wrote:
> > From: Tetsuo Handa 
> > Subject: mm, oom: avoid printk() iteration under RCU
> > 
> > Currently dump_tasks() might call printk() many thousands of times under
> > RCU, which might take many minutes for slow consoles.  Therefore, split
> > dump_tasks() into three stages: take a snapshot of possible OOM victim
> > candidates under RCU, dump the snapshot from reschedulable context, and
> > destroy the snapshot.
> > 
> > In a future patch, the first stage would be moved to select_bad_process()
> > and the third stage would be moved to after oom_kill_process(), which will
> > simplify refcount handling.
> > 
> > Link: 
> > http://lkml.kernel.org/r/1563360901-8277-1-git-send-email-penguin-ker...@i-love.sakura.ne.jp
> > Signed-off-by: Tetsuo Handa 
> > Cc: Shakeel Butt 
> > Cc: Michal Hocko 
> > Cc: Roman Gushchin 
> > Cc: David Rientjes 
> > Signed-off-by: Andrew Morton 
> > ---
> > 
> >  include/linux/sched.h |1 
> >  mm/oom_kill.c |   67 +++-
> >  2 files changed, 34 insertions(+), 34 deletions(-)
> > 
> > --- a/include/linux/sched.h~mm-oom-avoid-printk-iteration-under-rcu
> > +++ a/include/linux/sched.h
> > @@ -1246,6 +1246,7 @@ struct task_struct {
> >  #ifdef CONFIG_MMU
> > struct task_struct  *oom_reaper_list;
> >  #endif
> > +   struct list_headoom_victim_list;
> >  #ifdef CONFIG_VMAP_STACK
> > struct vm_struct*stack_vm_area;
> >  #endif
> > --- a/mm/oom_kill.c~mm-oom-avoid-printk-iteration-under-rcu
> > +++ a/mm/oom_kill.c
> > @@ -377,36 +377,13 @@ static void select_bad_process(struct oo
> > }
> >  }
> >  
> > -static int dump_task(struct task_struct *p, void *arg)
> > -{
> > -   struct oom_control *oc = arg;
> > -   struct task_struct *task;
> > -
> > -   if (oom_unkillable_task(p))
> > -   return 0;
> >  
> > -   /* p may not have freeable memory in nodemask */
> > -   if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> > -   return 0;
> > -
> > -   task = find_lock_task_mm(p);
> > -   if (!task) {
> > -   /*
> > -* This is a kthread or all of p's threads have already
> > -* detached their mm's.  There's no need to report
> > -* them; they can't be oom killed anyway.
> > -*/
> > -   return 0;
> > +static int add_candidate_task(struct task_struct *p, void *arg)
> > +{
> > +   if (!oom_unkillable_task(p)) {
> > +   get_task_struct(p);
> > +   list_add_tail(&p->oom_victim_list, (struct list_head *) arg);
> > }
> > -
> > -   pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
> > -   task->pid, from_kuid(&init_user_ns, task_uid(task)),
> > -   task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> > -   mm_pgtables_bytes(task->mm),
> > -   get_mm_counter(task->mm, MM_SWAPENTS),
> > -   task->signal->oom_score_adj, task->comm);
> > -   task_unlock(task);
> > -
> > return 0;
> >  }
> >  
> > @@ -422,19 +399,41 @@ static int dump_task(struct task_struct
> >   */
> >  static void dump_tasks(struct oom_control *oc)
> >  {
> > -   pr_info("Tasks state (memory values in pages):\n");
> > -   pr_info("[  pid  ]   uid  tgid total_vm  rss pgtables_bytes 
> > swapents oom_score_adj name\n");
> > +   static LIST_HEAD(list);
> > +   struct task_struct *p;
> > +   struct task_struct *t;
> >  
> > if (is_memcg_oom(oc))
> > -   mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
> > +   mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, &list);
> > else {
> > -   struct task_struct *p;
> > -
> > rcu_read_lock();
> > for_each_process(p)
> > -   dump_task(p, oc);
> > +   add_candidate_task(p, &list);
> > rcu_read_unlock();
> > }
> > +   pr_info("Tasks state (memory values in pages):\n");
> > +   pr_info("[  pid  ]   uid  tgid total_vm  rss pgtables_bytes 
> > swapents oom_score_adj name\n");
> > +   list_for_each_entry(p, &list, oom_victim_list) {
> > +   cond_resched();
> > +   /* p may not have freeable memory in nodemask */
> > +   if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> > +   continue;
> > +   /* All of p's threads might have already detached their mm's. */
> > +   t = find_lock_task_mm(p);
> > +   if (!t)
> > +   continue;
> > +   pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu 

Re: + mm-oom-avoid-printk-iteration-under-rcu.patch added to -mm tree

2019-07-24 Thread Michal Hocko
Andrew,
I've had some concerns with regard to this patch - especially the additional
complexity - and I have to say I am not convinced that this is really
needed. Our past experience in this area suggests that trickier code
leads to different corner cases. So I am really reluctant to add more
complexity without any real-world reports.

On Tue 23-07-19 16:14:29, Andrew Morton wrote:
> From: Tetsuo Handa 
> Subject: mm, oom: avoid printk() iteration under RCU
> 
> Currently dump_tasks() might call printk() many thousands of times under
> RCU, which might take many minutes for slow consoles.  Therefore, split
> dump_tasks() into three stages: take a snapshot of possible OOM victim
> candidates under RCU, dump the snapshot from reschedulable context, and
> destroy the snapshot.
> 
> In a future patch, the first stage would be moved to select_bad_process()
> and the third stage would be moved to after oom_kill_process(), which will
> simplify refcount handling.
> 
> Link: 
> http://lkml.kernel.org/r/1563360901-8277-1-git-send-email-penguin-ker...@i-love.sakura.ne.jp
> Signed-off-by: Tetsuo Handa 
> Cc: Shakeel Butt 
> Cc: Michal Hocko 
> Cc: Roman Gushchin 
> Cc: David Rientjes 
> Signed-off-by: Andrew Morton 
> ---
> 
>  include/linux/sched.h |1 
>  mm/oom_kill.c |   67 +++-
>  2 files changed, 34 insertions(+), 34 deletions(-)
> 
> --- a/include/linux/sched.h~mm-oom-avoid-printk-iteration-under-rcu
> +++ a/include/linux/sched.h
> @@ -1246,6 +1246,7 @@ struct task_struct {
>  #ifdef CONFIG_MMU
>   struct task_struct  *oom_reaper_list;
>  #endif
> + struct list_headoom_victim_list;
>  #ifdef CONFIG_VMAP_STACK
>   struct vm_struct*stack_vm_area;
>  #endif
> --- a/mm/oom_kill.c~mm-oom-avoid-printk-iteration-under-rcu
> +++ a/mm/oom_kill.c
> @@ -377,36 +377,13 @@ static void select_bad_process(struct oo
>   }
>  }
>  
> -static int dump_task(struct task_struct *p, void *arg)
> -{
> - struct oom_control *oc = arg;
> - struct task_struct *task;
> -
> - if (oom_unkillable_task(p))
> - return 0;
>  
> - /* p may not have freeable memory in nodemask */
> - if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> - return 0;
> -
> - task = find_lock_task_mm(p);
> - if (!task) {
> - /*
> -  * This is a kthread or all of p's threads have already
> -  * detached their mm's.  There's no need to report
> -  * them; they can't be oom killed anyway.
> -  */
> - return 0;
> +static int add_candidate_task(struct task_struct *p, void *arg)
> +{
> + if (!oom_unkillable_task(p)) {
> + get_task_struct(p);
> + list_add_tail(&p->oom_victim_list, (struct list_head *) arg);
>   }
> -
> - pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
> - task->pid, from_kuid(&init_user_ns, task_uid(task)),
> - task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> - mm_pgtables_bytes(task->mm),
> - get_mm_counter(task->mm, MM_SWAPENTS),
> - task->signal->oom_score_adj, task->comm);
> - task_unlock(task);
> -
>   return 0;
>  }
>  
> @@ -422,19 +399,41 @@ static int dump_task(struct task_struct
>   */
>  static void dump_tasks(struct oom_control *oc)
>  {
> - pr_info("Tasks state (memory values in pages):\n");
> - pr_info("[  pid  ]   uid  tgid total_vm  rss pgtables_bytes 
> swapents oom_score_adj name\n");
> + static LIST_HEAD(list);
> + struct task_struct *p;
> + struct task_struct *t;
>  
>   if (is_memcg_oom(oc))
> - mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
> + mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, &list);
>   else {
> - struct task_struct *p;
> -
>   rcu_read_lock();
>   for_each_process(p)
> - dump_task(p, oc);
> + add_candidate_task(p, &list);
>   rcu_read_unlock();
>   }
> + pr_info("Tasks state (memory values in pages):\n");
> + pr_info("[  pid  ]   uid  tgid total_vm  rss pgtables_bytes 
> swapents oom_score_adj name\n");
> + list_for_each_entry(p, &list, oom_victim_list) {
> + cond_resched();
> + /* p may not have freeable memory in nodemask */
> + if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> + continue;
> + /* All of p's threads might have already detached their mm's. */
> + t = find_lock_task_mm(p);
> + if (!t)
> + continue;
> + pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
> + t->pid, from_kuid(&init_user_ns, task_uid(t)),
> + t->tgid, t->mm->total_vm, get_mm_rss(t->mm),
> + mm_pgtables_bytes(t->mm),
> +