Re: [PATCH 2/2] oom: rework dump_tasks to optimize memcg-oom situation

2012-11-08 Thread Sha Zhengju

On 11/08/2012 06:34 AM, Michal Hocko wrote:

On Wed 07-11-12 16:41:59, Sha Zhengju wrote:

From: Sha Zhengju

If memcg oom happening, don't scan all system tasks to dump memory state of
eligible tasks, instead we iterates only over the process attached to the oom
memcg and avoid the rcu lock.

you have replaced rcu lock by css_set_lock which is, well, heavier than
rcu. Besides that the patch is not correct because you have excluded
all tasks that are from subgroups because you iterate only through the
top level one.
I am not sure the whole optimization would be a win even if implemented
correctly. Well, we scan through more tasks currently and most of them
are not relevant but then you would need to exclude task_in_mem_cgroup
from oom_unkillable_task and that would be more code churn than the
win.


Thanks for your and David's advice.
This piece is trying to save some expense while dumping memcg tasks, but 
failed to

scanning subgroups by iterating the cgroup. I'm agreed with your cost&win
opinion, so I decide to give up this one. : )


Thanks,
Sha

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] oom: rework dump_tasks to optimize memcg-oom situation

2012-11-08 Thread Sha Zhengju

On 11/08/2012 06:34 AM, Michal Hocko wrote:

On Wed 07-11-12 16:41:59, Sha Zhengju wrote:

From: Sha Zhengju <handai@taobao.com>

If memcg oom happening, don't scan all system tasks to dump memory state of
eligible tasks, instead we iterates only over the process attached to the oom
memcg and avoid the rcu lock.

you have replaced rcu lock by css_set_lock which is, well, heavier than
rcu. Besides that the patch is not correct because you have excluded
all tasks that are from subgroups because you iterate only through the
top level one.
I am not sure the whole optimization would be a win even if implemented
correctly. Well, we scan through more tasks currently and most of them
are not relevant but then you would need to exclude task_in_mem_cgroup
from oom_unkillable_task and that would be more code churn than the
win.


Thanks for your and David's advice.
This piece is trying to save some expense while dumping memcg tasks, but 
failed to

scanning subgroups by iterating the cgroup. I'm agreed with your cost&win
opinion, so I decide to give up this one. : )


Thanks,
Sha

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] oom: rework dump_tasks to optimize memcg-oom situation

2012-11-07 Thread Michal Hocko
On Wed 07-11-12 16:41:59, Sha Zhengju wrote:
> From: Sha Zhengju 
> 
> If memcg oom happening, don't scan all system tasks to dump memory state of
> eligible tasks, instead we iterates only over the process attached to the oom
> memcg and avoid the rcu lock.

you have replaced rcu lock by css_set_lock which is, well, heavier than
rcu. Besides that the patch is not correct because you have excluded
all tasks that are from subgroups because you iterate only through the
top level one.
I am not sure the whole optimization would be a win even if implemented
correctly. Well, we scan through more tasks currently and most of them
are not relevant but then you would need to exclude task_in_mem_cgroup
from oom_unkillable_task and that would be more code churn than the
win.

> Signed-off-by: Sha Zhengju 
> Cc: Michal Hocko 
> Cc: KAMEZAWA Hiroyuki 
> Cc: David Rientjes 
> Cc: Andrew Morton 
> ---
>  include/linux/memcontrol.h |7 +
>  include/linux/oom.h|2 +
>  mm/memcontrol.c|   14 +++
>  mm/oom_kill.c  |   55 ++-
>  4 files changed, 56 insertions(+), 22 deletions(-)
> 
[...]
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 20b5c46..9ba3344 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -57,6 +57,8 @@ extern enum oom_scan_t oom_scan_process_thread(struct 
> task_struct *task,
>   unsigned long totalpages, const nodemask_t *nodemask,
>   bool force_kill);
>  
> +extern inline void dump_per_task(struct task_struct *p,
> + const nodemask_t *nodemask);
>  extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
>   int order, nodemask_t *mask, bool force_kill);
>  extern int register_oom_notifier(struct notifier_block *nb);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 2df5e72..fe648f8 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1665,6 +1665,20 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup 
> *memcg)
>   return min(limit, memsw);
>  }
>  
> +void dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t 
> *nodemask)
> +{
> + struct cgroup_iter it;
> + struct task_struct *task;
> + struct cgroup *cgroup = memcg->css.cgroup;
> +
> + cgroup_iter_start(cgroup, &it);
> + while ((task = cgroup_iter_next(cgroup, &it))) {
> + dump_per_task(task, nodemask);
> + }
> +
> + cgroup_iter_end(cgroup, &it);
> +}
> +
>  static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t 
> gfp_mask,
>int order)
>  {
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 4b8a6dd..aaf6237 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -367,6 +367,32 @@ static struct task_struct *select_bad_process(unsigned 
> int *ppoints,
>   return chosen;
>  }
>  
> +inline void dump_per_task(struct task_struct *p, const nodemask_t *nodemask)
> +{
> + struct task_struct *task;
> +
> + if (oom_unkillable_task(p, NULL, nodemask))
> + return;
> +
> + task = find_lock_task_mm(p);
> + if (!task) {
> + /*
> +  * This is a kthread or all of p's threads have already
> +  * detached their mm's.  There's no need to report
> +  * them; they can't be oom killed anyway.
> +  */
> + return;
> + }
> +
> + pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
> + task->pid, from_kuid(&init_user_ns, task_uid(task)),
> + task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> + task->mm->nr_ptes,
> + get_mm_counter(task->mm, MM_SWAPENTS),
> + task->signal->oom_score_adj, task->comm);
> + task_unlock(task);
> +}
> +
>  /**
>   * dump_tasks - dump current memory state of all system tasks
>   * @memcg: current's memory controller, if constrained
> @@ -381,32 +407,17 @@ static struct task_struct *select_bad_process(unsigned 
> int *ppoints,
>  static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t 
> *nodemask)
>  {
>   struct task_struct *p;
> - struct task_struct *task;
>  
>   pr_info("[ pid ]   uid  tgid total_vm  rss nr_ptes swapents 
> oom_score_adj name\n");
> - rcu_read_lock();
> - for_each_process(p) {
> - if (oom_unkillable_task(p, memcg, nodemask))
> - continue;
> -
> - task = find_lock_task_mm(p);
> - if (!task) {
> - /*
> -  * This is a kthread or all of p's threads have already
> -  * detached their mm's.  There's no need to report
> -  * them; they can't be oom killed anyway.
> -  */
> - continue;
> - }
>  
> - pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
> - task->pid, from_kuid(&init_user_ns, 

Re: [PATCH 2/2] oom: rework dump_tasks to optimize memcg-oom situation

2012-11-07 Thread David Rientjes
On Wed, 7 Nov 2012, Sha Zhengju wrote:

> From: Sha Zhengju 
> 
> If memcg oom happening, don't scan all system tasks to dump memory state of
> eligible tasks, instead we iterates only over the process attached to the oom
> memcg and avoid the rcu lock.
> 

Avoiding the rcu lock isn't actually that impressive here, the cgroup 
iterator will use it's own lock for that memcg.

> Signed-off-by: Sha Zhengju 
> Cc: Michal Hocko 
> Cc: KAMEZAWA Hiroyuki 
> Cc: David Rientjes 
> Cc: Andrew Morton 
> ---
>  include/linux/memcontrol.h |7 +
>  include/linux/oom.h|2 +
>  mm/memcontrol.c|   14 +++
>  mm/oom_kill.c  |   55 ++-
>  4 files changed, 56 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index c91e3c1..4322ca8 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -122,6 +122,8 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec 
> *lruvec, enum lru_list);
>  void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
>  extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
>   struct task_struct *p);
> +extern void dump_tasks_memcg(const struct mem_cgroup *memcg,
> + const nodemask_t *nodemask);

Shouldn't need the nodemask parameter, just have dump_tasks_memcg() pass 
NULL to dump_per_task(), we won't be isolating to tasks with mempolicies 
restricted to a particular set of nodes since we're in the memcg oom path 
here, not the global page allocator oom path.

>  extern void mem_cgroup_replace_page_cache(struct page *oldpage,
>   struct page *newpage);
>  
> @@ -337,6 +339,11 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, 
> struct task_struct *p)
>  {
>  }
>  
> +static inline void
> +dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
> +{
> +}
> +
>  static inline void mem_cgroup_begin_update_page_stat(struct page *page,
>   bool *locked, unsigned long *flags)
>  {
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 20b5c46..9ba3344 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -57,6 +57,8 @@ extern enum oom_scan_t oom_scan_process_thread(struct 
> task_struct *task,
>   unsigned long totalpages, const nodemask_t *nodemask,
>   bool force_kill);
>  
> +extern inline void dump_per_task(struct task_struct *p,
> + const nodemask_t *nodemask);

This is a global symbol, so dump_per_task() doesn't make a lot of sense: 
it would need to be prefixed with "oom_" so perhaps oom_dump_task() is 
better?

>  extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
>   int order, nodemask_t *mask, bool force_kill);
>  extern int register_oom_notifier(struct notifier_block *nb);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 2df5e72..fe648f8 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1665,6 +1665,20 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup 
> *memcg)
>   return min(limit, memsw);
>  }
>  
> +void dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t 
> *nodemask)
> +{
> + struct cgroup_iter it;
> + struct task_struct *task;
> + struct cgroup *cgroup = memcg->css.cgroup;
> +
> + cgroup_iter_start(cgroup, &it);
> + while ((task = cgroup_iter_next(cgroup, &it))) {
> + dump_per_task(task, nodemask);
> + }
> +
> + cgroup_iter_end(cgroup, &it);
> +}
> +
>  static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t 
> gfp_mask,
>int order)
>  {
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 4b8a6dd..aaf6237 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -367,6 +367,32 @@ static struct task_struct *select_bad_process(unsigned 
> int *ppoints,
>   return chosen;
>  }
>  
> +inline void dump_per_task(struct task_struct *p, const nodemask_t *nodemask)

No inline.

> +{
> + struct task_struct *task;
> +
> + if (oom_unkillable_task(p, NULL, nodemask))
> + return;
> +
> + task = find_lock_task_mm(p);
> + if (!task) {
> + /*
> +  * This is a kthread or all of p's threads have already
> +  * detached their mm's.  There's no need to report
> +  * them; they can't be oom killed anyway.
> +  */
> + return;
> + }
> +
> + pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
> + task->pid, from_kuid(&init_user_ns, task_uid(task)),
> + task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> + task->mm->nr_ptes,
> + get_mm_counter(task->mm, MM_SWAPENTS),
> + task->signal->oom_score_adj, task->comm);
> + task_unlock(task);
> +}
> +
>  /**
>   

[PATCH 2/2] oom: rework dump_tasks to optimize memcg-oom situation

2012-11-07 Thread Sha Zhengju
From: Sha Zhengju 

If memcg oom happening, don't scan all system tasks to dump memory state of
eligible tasks, instead we iterates only over the process attached to the oom
memcg and avoid the rcu lock.


Signed-off-by: Sha Zhengju 
Cc: Michal Hocko 
Cc: KAMEZAWA Hiroyuki 
Cc: David Rientjes 
Cc: Andrew Morton 
---
 include/linux/memcontrol.h |7 +
 include/linux/oom.h|2 +
 mm/memcontrol.c|   14 +++
 mm/oom_kill.c  |   55 ++-
 4 files changed, 56 insertions(+), 22 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c91e3c1..4322ca8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -122,6 +122,8 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec 
*lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);
+extern void dump_tasks_memcg(const struct mem_cgroup *memcg,
+   const nodemask_t *nodemask);
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
struct page *newpage);
 
@@ -337,6 +339,11 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct 
task_struct *p)
 {
 }
 
+static inline void
+dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
+{
+}
+
 static inline void mem_cgroup_begin_update_page_stat(struct page *page,
bool *locked, unsigned long *flags)
 {
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 20b5c46..9ba3344 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -57,6 +57,8 @@ extern enum oom_scan_t oom_scan_process_thread(struct 
task_struct *task,
unsigned long totalpages, const nodemask_t *nodemask,
bool force_kill);
 
+extern inline void dump_per_task(struct task_struct *p,
+   const nodemask_t *nodemask);
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *mask, bool force_kill);
 extern int register_oom_notifier(struct notifier_block *nb);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2df5e72..fe648f8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1665,6 +1665,20 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
return min(limit, memsw);
 }
 
+void dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t 
*nodemask)
+{
+   struct cgroup_iter it;
+   struct task_struct *task;
+   struct cgroup *cgroup = memcg->css.cgroup;
+
+   cgroup_iter_start(cgroup, &it);
+   while ((task = cgroup_iter_next(cgroup, &it))) {
+   dump_per_task(task, nodemask);
+   }
+
+   cgroup_iter_end(cgroup, &it);
+}
+
 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 int order)
 {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4b8a6dd..aaf6237 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -367,6 +367,32 @@ static struct task_struct *select_bad_process(unsigned int 
*ppoints,
return chosen;
 }
 
+inline void dump_per_task(struct task_struct *p, const nodemask_t *nodemask)
+{
+   struct task_struct *task;
+
+   if (oom_unkillable_task(p, NULL, nodemask))
+   return;
+
+   task = find_lock_task_mm(p);
+   if (!task) {
+   /*
+* This is a kthread or all of p's threads have already
+* detached their mm's.  There's no need to report
+* them; they can't be oom killed anyway.
+*/
+   return;
+   }
+
+   pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
+   task->pid, from_kuid(&init_user_ns, task_uid(task)),
+   task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+   task->mm->nr_ptes,
+   get_mm_counter(task->mm, MM_SWAPENTS),
+   task->signal->oom_score_adj, task->comm);
+   task_unlock(task);
+}
+
 /**
  * dump_tasks - dump current memory state of all system tasks
  * @memcg: current's memory controller, if constrained
@@ -381,32 +407,17 @@ static struct task_struct *select_bad_process(unsigned 
int *ppoints,
 static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t 
*nodemask)
 {
struct task_struct *p;
-   struct task_struct *task;
 
pr_info("[ pid ]   uid  tgid total_vm  rss nr_ptes swapents 
oom_score_adj name\n");
-   rcu_read_lock();
-   for_each_process(p) {
-   if (oom_unkillable_task(p, memcg, nodemask))
-   continue;
-
-   task = find_lock_task_mm(p);
-   if (!task) {
-   /*
-* This is a kthread or all of p's 

[PATCH 2/2] oom: rework dump_tasks to optimize memcg-oom situation

2012-11-07 Thread Sha Zhengju
From: Sha Zhengju <handai@taobao.com>

If memcg oom happening, don't scan all system tasks to dump memory state of
eligible tasks, instead we iterates only over the process attached to the oom
memcg and avoid the rcu lock.


Signed-off-by: Sha Zhengju <handai@taobao.com>
Cc: Michal Hocko <mho...@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezawa.hir...@jp.fujitsu.com>
Cc: David Rientjes <rient...@google.com>
Cc: Andrew Morton <a...@linux-foundation.org>
---
 include/linux/memcontrol.h |7 +
 include/linux/oom.h|2 +
 mm/memcontrol.c|   14 +++
 mm/oom_kill.c  |   55 ++-
 4 files changed, 56 insertions(+), 22 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c91e3c1..4322ca8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -122,6 +122,8 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec 
*lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);
+extern void dump_tasks_memcg(const struct mem_cgroup *memcg,
+   const nodemask_t *nodemask);
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
struct page *newpage);
 
@@ -337,6 +339,11 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct 
task_struct *p)
 {
 }
 
+static inline void
+dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
+{
+}
+
 static inline void mem_cgroup_begin_update_page_stat(struct page *page,
bool *locked, unsigned long *flags)
 {
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 20b5c46..9ba3344 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -57,6 +57,8 @@ extern enum oom_scan_t oom_scan_process_thread(struct 
task_struct *task,
unsigned long totalpages, const nodemask_t *nodemask,
bool force_kill);
 
+extern inline void dump_per_task(struct task_struct *p,
+   const nodemask_t *nodemask);
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *mask, bool force_kill);
 extern int register_oom_notifier(struct notifier_block *nb);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2df5e72..fe648f8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1665,6 +1665,20 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
return min(limit, memsw);
 }
 
+void dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t 
*nodemask)
+{
+   struct cgroup_iter it;
+   struct task_struct *task;
+   struct cgroup *cgroup = memcg->css.cgroup;
+
+   cgroup_iter_start(cgroup, &it);
+   while ((task = cgroup_iter_next(cgroup, &it))) {
+   dump_per_task(task, nodemask);
+   }
+
+   cgroup_iter_end(cgroup, &it);
+}
+
 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 int order)
 {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4b8a6dd..aaf6237 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -367,6 +367,32 @@ static struct task_struct *select_bad_process(unsigned int 
*ppoints,
return chosen;
 }
 
+inline void dump_per_task(struct task_struct *p, const nodemask_t *nodemask)
+{
+   struct task_struct *task;
+
+   if (oom_unkillable_task(p, NULL, nodemask))
+   return;
+
+   task = find_lock_task_mm(p);
+   if (!task) {
+   /*
+* This is a kthread or all of p's threads have already
+* detached their mm's.  There's no need to report
+* them; they can't be oom killed anyway.
+*/
+   return;
+   }
+
+   pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
+   task->pid, from_kuid(&init_user_ns, task_uid(task)),
+   task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+   task->mm->nr_ptes,
+   get_mm_counter(task->mm, MM_SWAPENTS),
+   task->signal->oom_score_adj, task->comm);
+   task_unlock(task);
+}
+
 /**
  * dump_tasks - dump current memory state of all system tasks
  * @memcg: current's memory controller, if constrained
@@ -381,32 +407,17 @@ static struct task_struct *select_bad_process(unsigned 
int *ppoints,
 static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t 
*nodemask)
 {
struct task_struct *p;
-   struct task_struct *task;
 
	pr_info("[ pid ]   uid  tgid total_vm  rss nr_ptes swapents 
oom_score_adj name\n");
-   rcu_read_lock();
-   for_each_process(p) {
-   if (oom_unkillable_task(p, memcg, nodemask))
-   continue;
-
-   task = 

Re: [PATCH 2/2] oom: rework dump_tasks to optimize memcg-oom situation

2012-11-07 Thread David Rientjes
On Wed, 7 Nov 2012, Sha Zhengju wrote:

 From: Sha Zhengju <handai@taobao.com>
 
 If memcg oom happening, don't scan all system tasks to dump memory state of
 eligible tasks, instead we iterates only over the process attached to the oom
 memcg and avoid the rcu lock.
 

Avoiding the rcu lock isn't actually that impressive here, the cgroup 
iterator will use it's own lock for that memcg.

 Signed-off-by: Sha Zhengju <handai@taobao.com>
 Cc: Michal Hocko <mho...@suse.cz>
 Cc: KAMEZAWA Hiroyuki <kamezawa.hir...@jp.fujitsu.com>
 Cc: David Rientjes <rient...@google.com>
 Cc: Andrew Morton <a...@linux-foundation.org>
 ---
  include/linux/memcontrol.h |7 +
  include/linux/oom.h|2 +
  mm/memcontrol.c|   14 +++
  mm/oom_kill.c  |   55 ++-
  4 files changed, 56 insertions(+), 22 deletions(-)
 
 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
 index c91e3c1..4322ca8 100644
 --- a/include/linux/memcontrol.h
 +++ b/include/linux/memcontrol.h
 @@ -122,6 +122,8 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec 
 *lruvec, enum lru_list);
  void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
  extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
   struct task_struct *p);
 +extern void dump_tasks_memcg(const struct mem_cgroup *memcg,
 + const nodemask_t *nodemask);

Shouldn't need the nodemask parameter, just have dump_tasks_memcg() pass 
NULL to dump_per_task(), we won't be isolating to tasks with mempolicies 
restricted to a particular set of nodes since we're in the memcg oom path 
here, not the global page allocator oom path.

  extern void mem_cgroup_replace_page_cache(struct page *oldpage,
   struct page *newpage);
  
 @@ -337,6 +339,11 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, 
 struct task_struct *p)
  {
  }
  
 +static inline void
 +dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
 +{
 +}
 +
  static inline void mem_cgroup_begin_update_page_stat(struct page *page,
   bool *locked, unsigned long *flags)
  {
 diff --git a/include/linux/oom.h b/include/linux/oom.h
 index 20b5c46..9ba3344 100644
 --- a/include/linux/oom.h
 +++ b/include/linux/oom.h
 @@ -57,6 +57,8 @@ extern enum oom_scan_t oom_scan_process_thread(struct 
 task_struct *task,
   unsigned long totalpages, const nodemask_t *nodemask,
   bool force_kill);
  
 +extern inline void dump_per_task(struct task_struct *p,
 + const nodemask_t *nodemask);

This is a global symbol, so dump_per_task() doesn't make a lot of sense: 
it would need to be prefixed with "oom_" so perhaps oom_dump_task() is 
better?

  extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
   int order, nodemask_t *mask, bool force_kill);
  extern int register_oom_notifier(struct notifier_block *nb);
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 2df5e72..fe648f8 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -1665,6 +1665,20 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup 
 *memcg)
   return min(limit, memsw);
  }
  
 +void dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t 
 *nodemask)
 +{
 + struct cgroup_iter it;
 + struct task_struct *task;
 + struct cgroup *cgroup = memcg->css.cgroup;
 +
 + cgroup_iter_start(cgroup, &it);
 + while ((task = cgroup_iter_next(cgroup, &it))) {
 + dump_per_task(task, nodemask);
 + }
 +
 + cgroup_iter_end(cgroup, &it);
 +}
 +
  static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t 
 gfp_mask,
int order)
  {
 diff --git a/mm/oom_kill.c b/mm/oom_kill.c
 index 4b8a6dd..aaf6237 100644
 --- a/mm/oom_kill.c
 +++ b/mm/oom_kill.c
 @@ -367,6 +367,32 @@ static struct task_struct *select_bad_process(unsigned 
 int *ppoints,
   return chosen;
  }
  
 +inline void dump_per_task(struct task_struct *p, const nodemask_t *nodemask)

No inline.

 +{
 + struct task_struct *task;
 +
 + if (oom_unkillable_task(p, NULL, nodemask))
 + return;
 +
 + task = find_lock_task_mm(p);
 + if (!task) {
 + /*
 +  * This is a kthread or all of p's threads have already
 +  * detached their mm's.  There's no need to report
 +  * them; they can't be oom killed anyway.
 +  */
 + return;
 + }
 +
 + pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
 + task->pid, from_kuid(&init_user_ns, task_uid(task)),
 + task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 + task->mm->nr_ptes,
 + get_mm_counter(task->mm, MM_SWAPENTS),
 + task->signal->oom_score_adj, task->comm);
 + task_unlock(task);
 +}
 +
  /**
   

Re: [PATCH 2/2] oom: rework dump_tasks to optimize memcg-oom situation

2012-11-07 Thread Michal Hocko
On Wed 07-11-12 16:41:59, Sha Zhengju wrote:
 From: Sha Zhengju <handai@taobao.com>
 
 If memcg oom happening, don't scan all system tasks to dump memory state of
 eligible tasks, instead we iterates only over the process attached to the oom
 memcg and avoid the rcu lock.

you have replaced rcu lock by css_set_lock which is, well, heavier than
rcu. Besides that the patch is not correct because you have excluded
all tasks that are from subgroups because you iterate only through the
top level one.
I am not sure the whole optimization would be a win even if implemented
correctly. Well, we scan through more tasks currently and most of them
are not relevant but then you would need to exclude task_in_mem_cgroup
from oom_unkillable_task and that would be more code churn than the
win.

 Signed-off-by: Sha Zhengju <handai@taobao.com>
 Cc: Michal Hocko <mho...@suse.cz>
 Cc: KAMEZAWA Hiroyuki <kamezawa.hir...@jp.fujitsu.com>
 Cc: David Rientjes <rient...@google.com>
 Cc: Andrew Morton <a...@linux-foundation.org>
 ---
  include/linux/memcontrol.h |7 +
  include/linux/oom.h|2 +
  mm/memcontrol.c|   14 +++
  mm/oom_kill.c  |   55 ++-
  4 files changed, 56 insertions(+), 22 deletions(-)
 
[...]
 diff --git a/include/linux/oom.h b/include/linux/oom.h
 index 20b5c46..9ba3344 100644
 --- a/include/linux/oom.h
 +++ b/include/linux/oom.h
 @@ -57,6 +57,8 @@ extern enum oom_scan_t oom_scan_process_thread(struct 
 task_struct *task,
   unsigned long totalpages, const nodemask_t *nodemask,
   bool force_kill);
  
 +extern inline void dump_per_task(struct task_struct *p,
 + const nodemask_t *nodemask);
  extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
   int order, nodemask_t *mask, bool force_kill);
  extern int register_oom_notifier(struct notifier_block *nb);
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 2df5e72..fe648f8 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -1665,6 +1665,20 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup 
 *memcg)
   return min(limit, memsw);
  }
  
 +void dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t 
 *nodemask)
 +{
 + struct cgroup_iter it;
 + struct task_struct *task;
 + struct cgroup *cgroup = memcg->css.cgroup;
 +
 + cgroup_iter_start(cgroup, &it);
 + while ((task = cgroup_iter_next(cgroup, &it))) {
 + dump_per_task(task, nodemask);
 + }
 +
 + cgroup_iter_end(cgroup, &it);
 +}
 +
  static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t 
 gfp_mask,
int order)
  {
 diff --git a/mm/oom_kill.c b/mm/oom_kill.c
 index 4b8a6dd..aaf6237 100644
 --- a/mm/oom_kill.c
 +++ b/mm/oom_kill.c
 @@ -367,6 +367,32 @@ static struct task_struct *select_bad_process(unsigned 
 int *ppoints,
   return chosen;
  }
  
 +inline void dump_per_task(struct task_struct *p, const nodemask_t *nodemask)
 +{
 + struct task_struct *task;
 +
 + if (oom_unkillable_task(p, NULL, nodemask))
 + return;
 +
 + task = find_lock_task_mm(p);
 + if (!task) {
 + /*
 +  * This is a kthread or all of p's threads have already
 +  * detached their mm's.  There's no need to report
 +  * them; they can't be oom killed anyway.
 +  */
 + return;
 + }
 +
 + pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
 + task->pid, from_kuid(&init_user_ns, task_uid(task)),
 + task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 + task->mm->nr_ptes,
 + get_mm_counter(task->mm, MM_SWAPENTS),
 + task->signal->oom_score_adj, task->comm);
 + task_unlock(task);
 +}
 +
  /**
   * dump_tasks - dump current memory state of all system tasks
   * @memcg: current's memory controller, if constrained
 @@ -381,32 +407,17 @@ static struct task_struct *select_bad_process(unsigned 
 int *ppoints,
  static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t 
 *nodemask)
  {
   struct task_struct *p;
 - struct task_struct *task;
  
  pr_info("[ pid ]   uid  tgid total_vm  rss nr_ptes swapents 
 oom_score_adj name\n");
 - rcu_read_lock();
 - for_each_process(p) {
 - if (oom_unkillable_task(p, memcg, nodemask))
 - continue;
 -
 - task = find_lock_task_mm(p);
 - if (!task) {
 - /*
 -  * This is a kthread or all of p's threads have already
 -  * detached their mm's.  There's no need to report
 -  * them; they can't be oom killed anyway.
 -  */
 - continue;
 - }
  
 - pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
 - task->pid,