Re: [RFC][PATCH 5/7] Per-container OOM killer and page reclamation

2007-03-11 Thread Pavel Emelianov
Balbir Singh wrote:
> Hi, Pavel,
> 
> Please find my patch to add LRU behaviour to your latest RSS controller.

Thanks for participation and additional testing :)
I'll include this into next generation of patches.

> Balbir Singh
> Linux Technology Center
> IBM, ISTL
> 
> 
> 
> 
> Add LRU behaviour to the RSS controller patches posted by Pavel Emelianov
> 
>   http://lkml.org/lkml/2007/3/6/198
> 
> which was in turn similar to the RSS controller posted by me
> 
>   http://lkml.org/lkml/2007/2/26/8
> 
> Pavel's patches have a per container list of pages, which helps reduce
> reclaim time of the RSS controller but the per container list of pages is
> in FIFO order. I've implemented active and inactive lists per container to
> help select the right set of pages to reclaim when the container is under
> memory pressure.
> 
> I've tested these patches on a ppc64 machine and they work fine for
> the minimal testing I've done.
> 
> Pavel would you please include these patches in your next iteration.
> 
> Comments, suggestions and further improvements are as always welcome!
> 
> Signed-off-by: <[EMAIL PROTECTED]>
> ---
> 
>  include/linux/rss_container.h |1 
>  mm/rss_container.c|   47 
> +++---
>  mm/swap.c |5 
>  mm/vmscan.c   |3 ++
>  4 files changed, 44 insertions(+), 12 deletions(-)
> 
> diff -puN include/linux/rss_container.h~rss-container-lru2 
> include/linux/rss_container.h
> --- linux-2.6.20/include/linux/rss_container.h~rss-container-lru2 
> 2007-03-09 22:52:56.0 +0530
> +++ linux-2.6.20-balbir/include/linux/rss_container.h 2007-03-10 
> 00:39:59.0 +0530
> @@ -19,6 +19,7 @@ int container_rss_prepare(struct page *,
>  void container_rss_add(struct page_container *);
>  void container_rss_del(struct page_container *);
>  void container_rss_release(struct page_container *);
> +void container_rss_move_lists(struct page *pg, bool active);
>  
>  int mm_init_container(struct mm_struct *mm, struct task_struct *tsk);
>  void mm_free_container(struct mm_struct *mm);
> diff -puN mm/rss_container.c~rss-container-lru2 mm/rss_container.c
> --- linux-2.6.20/mm/rss_container.c~rss-container-lru22007-03-09 
> 22:52:56.0 +0530
> +++ linux-2.6.20-balbir/mm/rss_container.c2007-03-10 02:42:54.0 
> +0530
> @@ -17,7 +17,8 @@ static struct container_subsys rss_subsy
>  
>  struct rss_container {
>   struct res_counter res;
> - struct list_head page_list;
> + struct list_head inactive_list;
> + struct list_head active_list;
>   struct container_subsys_state css;
>  };
>  
> @@ -96,6 +97,26 @@ void container_rss_release(struct page_c
>   kfree(pc);
>  }
>  
> +void container_rss_move_lists(struct page *pg, bool active)
> +{
> + struct rss_container *rss;
> + struct page_container *pc;
> +
> + if (!page_mapped(pg))
> + return;
> +
> + pc = page_container(pg);
> + BUG_ON(!pc);
> + rss = pc->cnt;
> +
> + spin_lock_irq(&rss->res.lock);
> + if (active)
> + list_move(&pc->list, &rss->active_list);
> + else
> + list_move(&pc->list, &rss->inactive_list);
> + spin_unlock_irq(&rss->res.lock);
> +}
> +
>  void container_rss_add(struct page_container *pc)
>  {
>   struct page *pg;
> @@ -105,7 +126,7 @@ void container_rss_add(struct page_conta
>   rss = pc->cnt;
>  
>   spin_lock(&rss->res.lock);
> - list_add(&pc->list, &rss->page_list);
> + list_add(&pc->list, &rss->active_list);
>   spin_unlock(&rss->res.lock);
>  
>   page_container(pg) = pc;
> @@ -141,7 +162,10 @@ unsigned long container_isolate_pages(un
>   struct zone *z;
>  
>   spin_lock_irq(&rss->res.lock);
> - src = &rss->page_list;
> + if (active)
> + src = &rss->active_list;
> + else
> + src = &rss->inactive_list;
>  
>   for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
>   pc = list_entry(src->prev, struct page_container, list);
> @@ -152,13 +176,10 @@ unsigned long container_isolate_pages(un
>  
>   spin_lock(&z->lru_lock);
>   if (PageLRU(page)) {
> - if ((active && PageActive(page)) ||
> - (!active && !PageActive(page))) {
> - if (likely(get_page_unless_zero(page))) {
> - ClearPageLRU(page);
> - nr_taken++;
> - list_move(&page->lru, dst);
> - }
> + if (likely(get_page_unless_zero(page))) {
> + ClearPageLRU(page);
> + nr_taken++;
> + list_move(&page->lru, dst);
>   }
>   }
>   spin_unlock(&z->lru_lock);
> @@ -212,7 +233,8 @@ static int rss_create(struct 

Re: [RFC][PATCH 5/7] Per-container OOM killer and page reclamation

2007-03-11 Thread Pavel Emelianov
Balbir Singh wrote:
 Hi, Pavel,
 
 Please find my patch to add LRU behaviour to your latest RSS controller.

Thanks for participation and additional testing :)
I'll include this into next generation of patches.

 Balbir Singh
 Linux Technology Center
 IBM, ISTL
 
 
 
 
 Add LRU behaviour to the RSS controller patches posted by Pavel Emelianov
 
   http://lkml.org/lkml/2007/3/6/198
 
 which was in turn similar to the RSS controller posted by me
 
   http://lkml.org/lkml/2007/2/26/8
 
 Pavel's patches have a per container list of pages, which helps reduce
 reclaim time of the RSS controller but the per container list of pages is
 in FIFO order. I've implemented active and inactive lists per container to
 help select the right set of pages to reclaim when the container is under
 memory pressure.
 
 I've tested these patches on a ppc64 machine and they work fine for
 the minimal testing I've done.
 
 Pavel would you please include these patches in your next iteration.
 
 Comments, suggestions and further improvements are as always welcome!
 
 Signed-off-by: [EMAIL PROTECTED]
 ---
 
  include/linux/rss_container.h |1 
  mm/rss_container.c|   47 
 +++---
  mm/swap.c |5 
  mm/vmscan.c   |3 ++
  4 files changed, 44 insertions(+), 12 deletions(-)
 
 diff -puN include/linux/rss_container.h~rss-container-lru2 
 include/linux/rss_container.h
 --- linux-2.6.20/include/linux/rss_container.h~rss-container-lru2 
 2007-03-09 22:52:56.0 +0530
 +++ linux-2.6.20-balbir/include/linux/rss_container.h 2007-03-10 
 00:39:59.0 +0530
 @@ -19,6 +19,7 @@ int container_rss_prepare(struct page *,
  void container_rss_add(struct page_container *);
  void container_rss_del(struct page_container *);
  void container_rss_release(struct page_container *);
 +void container_rss_move_lists(struct page *pg, bool active);
  
  int mm_init_container(struct mm_struct *mm, struct task_struct *tsk);
  void mm_free_container(struct mm_struct *mm);
 diff -puN mm/rss_container.c~rss-container-lru2 mm/rss_container.c
 --- linux-2.6.20/mm/rss_container.c~rss-container-lru22007-03-09 
 22:52:56.0 +0530
 +++ linux-2.6.20-balbir/mm/rss_container.c2007-03-10 02:42:54.0 
 +0530
 @@ -17,7 +17,8 @@ static struct container_subsys rss_subsy
  
  struct rss_container {
   struct res_counter res;
 - struct list_head page_list;
 + struct list_head inactive_list;
 + struct list_head active_list;
   struct container_subsys_state css;
  };
  
 @@ -96,6 +97,26 @@ void container_rss_release(struct page_c
   kfree(pc);
  }
  
 +void container_rss_move_lists(struct page *pg, bool active)
 +{
 + struct rss_container *rss;
 + struct page_container *pc;
 +
 + if (!page_mapped(pg))
 + return;
 +
 + pc = page_container(pg);
 + BUG_ON(!pc);
 + rss = pc->cnt;
 +
 + spin_lock_irq(&rss->res.lock);
 + if (active)
 + list_move(&pc->list, &rss->active_list);
 + else
 + list_move(&pc->list, &rss->inactive_list);
 + spin_unlock_irq(&rss->res.lock);
 +}
 +
  void container_rss_add(struct page_container *pc)
  {
   struct page *pg;
 @@ -105,7 +126,7 @@ void container_rss_add(struct page_conta
  rss = pc->cnt;
  
  spin_lock(&rss->res.lock);
 - list_add(&pc->list, &rss->page_list);
 + list_add(&pc->list, &rss->active_list);
  spin_unlock(&rss->res.lock);
  
   page_container(pg) = pc;
 @@ -141,7 +162,10 @@ unsigned long container_isolate_pages(un
   struct zone *z;
  
  spin_lock_irq(&rss->res.lock);
 - src = &rss->page_list;
 + if (active)
 + src = &rss->active_list;
 + else
 + src = &rss->inactive_list;
  
  for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
  pc = list_entry(src->prev, struct page_container, list);
 @@ -152,13 +176,10 @@ unsigned long container_isolate_pages(un
  
  spin_lock(&z->lru_lock);
  if (PageLRU(page)) {
 - if ((active && PageActive(page)) ||
 - (!active && !PageActive(page))) {
 - if (likely(get_page_unless_zero(page))) {
 - ClearPageLRU(page);
 - nr_taken++;
 - list_move(&page->lru, dst);
 - }
 + if (likely(get_page_unless_zero(page))) {
 + ClearPageLRU(page);
 + nr_taken++;
 + list_move(&page->lru, dst);
  }
  }
  spin_unlock(&z->lru_lock);
 @@ -212,7 +233,8 @@ static int rss_create(struct container_s
   return -ENOMEM;
  
  res_counter_init(&rss->res);
 - 

Re: [RFC][PATCH 5/7] Per-container OOM killer and page reclamation

2007-03-09 Thread Balbir Singh

Hi, Pavel,

Please find my patch to add LRU behaviour to your latest RSS controller.

Balbir Singh
Linux Technology Center
IBM, ISTL
Add LRU behaviour to the RSS controller patches posted by Pavel Emelianov

	http://lkml.org/lkml/2007/3/6/198

which was in turn similar to the RSS controller posted by me

	http://lkml.org/lkml/2007/2/26/8

Pavel's patches have a per container list of pages, which helps reduce
reclaim time of the RSS controller but the per container list of pages is
in FIFO order. I've implemented active and inactive lists per container to
help select the right set of pages to reclaim when the container is under
memory pressure.

I've tested these patches on a ppc64 machine and they work fine for
the minimal testing I've done.

Pavel would you please include these patches in your next iteration.

Comments, suggestions and further improvements are as always welcome!

Signed-off-by: <[EMAIL PROTECTED]>
---

 include/linux/rss_container.h |1 
 mm/rss_container.c|   47 +++---
 mm/swap.c |5 
 mm/vmscan.c   |3 ++
 4 files changed, 44 insertions(+), 12 deletions(-)

diff -puN include/linux/rss_container.h~rss-container-lru2 include/linux/rss_container.h
--- linux-2.6.20/include/linux/rss_container.h~rss-container-lru2	2007-03-09 22:52:56.0 +0530
+++ linux-2.6.20-balbir/include/linux/rss_container.h	2007-03-10 00:39:59.0 +0530
@@ -19,6 +19,7 @@ int container_rss_prepare(struct page *,
 void container_rss_add(struct page_container *);
 void container_rss_del(struct page_container *);
 void container_rss_release(struct page_container *);
+void container_rss_move_lists(struct page *pg, bool active);
 
 int mm_init_container(struct mm_struct *mm, struct task_struct *tsk);
 void mm_free_container(struct mm_struct *mm);
diff -puN mm/rss_container.c~rss-container-lru2 mm/rss_container.c
--- linux-2.6.20/mm/rss_container.c~rss-container-lru2	2007-03-09 22:52:56.0 +0530
+++ linux-2.6.20-balbir/mm/rss_container.c	2007-03-10 02:42:54.0 +0530
@@ -17,7 +17,8 @@ static struct container_subsys rss_subsy
 
 struct rss_container {
 	struct res_counter res;
-	struct list_head page_list;
+	struct list_head inactive_list;
+	struct list_head active_list;
 	struct container_subsys_state css;
 };
 
@@ -96,6 +97,26 @@ void container_rss_release(struct page_c
 	kfree(pc);
 }
 
+void container_rss_move_lists(struct page *pg, bool active)
+{
+	struct rss_container *rss;
+	struct page_container *pc;
+
+	if (!page_mapped(pg))
+		return;
+
+	pc = page_container(pg);
+	BUG_ON(!pc);
+	rss = pc->cnt;
+
+	spin_lock_irq(&rss->res.lock);
+	if (active)
+		list_move(&pc->list, &rss->active_list);
+	else
+		list_move(&pc->list, &rss->inactive_list);
+	spin_unlock_irq(&rss->res.lock);
+}
+
 void container_rss_add(struct page_container *pc)
 {
 	struct page *pg;
@@ -105,7 +126,7 @@ void container_rss_add(struct page_conta
 	rss = pc->cnt;
 
 	spin_lock(&rss->res.lock);
-	list_add(&pc->list, &rss->page_list);
+	list_add(&pc->list, &rss->active_list);
 	spin_unlock(&rss->res.lock);
 
 	page_container(pg) = pc;
@@ -141,7 +162,10 @@ unsigned long container_isolate_pages(un
 	struct zone *z;
 
 	spin_lock_irq(&rss->res.lock);
-	src = &rss->page_list;
+	if (active)
+		src = &rss->active_list;
+	else
+		src = &rss->inactive_list;
 
 	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
 		pc = list_entry(src->prev, struct page_container, list);
@@ -152,13 +176,10 @@ unsigned long container_isolate_pages(un
 
 		spin_lock(&z->lru_lock);
 		if (PageLRU(page)) {
-			if ((active && PageActive(page)) ||
-			    (!active && !PageActive(page))) {
-				if (likely(get_page_unless_zero(page))) {
-					ClearPageLRU(page);
-					nr_taken++;
-					list_move(&page->lru, dst);
-				}
+			if (likely(get_page_unless_zero(page))) {
+				ClearPageLRU(page);
+				nr_taken++;
+				list_move(&page->lru, dst);
 			}
 		}
 		spin_unlock(&z->lru_lock);
@@ -212,7 +233,8 @@ static int rss_create(struct container_s
 		return -ENOMEM;
 
 	res_counter_init(&rss->res);
-	INIT_LIST_HEAD(&rss->page_list);
+	INIT_LIST_HEAD(&rss->inactive_list);
+	INIT_LIST_HEAD(&rss->active_list);
 	cont->subsys[rss_subsys.subsys_id] = &rss->css;
 	return 0;
 }
@@ -284,7 +306,8 @@ static __init int rss_create_early(struc
 
 	rss = &init_rss_container;
 	res_counter_init(&rss->res);
-	INIT_LIST_HEAD(&rss->page_list);
+	INIT_LIST_HEAD(&rss->inactive_list);
+	INIT_LIST_HEAD(&rss->active_list);
 	cont->subsys[rss_subsys.subsys_id] = &rss->css;
 	ss->create = rss_create;
 	return 0;
diff -puN mm/vmscan.c~rss-container-lru2 mm/vmscan.c
--- linux-2.6.20/mm/vmscan.c~rss-container-lru2	2007-03-09 22:52:56.0 +0530
+++ linux-2.6.20-balbir/mm/vmscan.c	2007-03-10 00:42:35.0 +0530
@@ -1142,6 +1142,7 @@ static unsigned long container_shrink_pa
 			else
 add_page_to_inactive_list(z, page);
 			spin_unlock_irq(&z->lru_lock);
+			container_rss_move_lists(page, false);
 
 			put_page(page);
 		}
@@ -1191,6 +1192,7 @@ static void container_shrink_pages_activ
 		list_move(&page->lru, &z->inactive_list);
 		

Re: [RFC][PATCH 5/7] Per-container OOM killer and page reclamation

2007-03-09 Thread Balbir Singh

Hi, Pavel,

Please find my patch to add LRU behaviour to your latest RSS controller.

Balbir Singh
Linux Technology Center
IBM, ISTL
Add LRU behaviour to the RSS controller patches posted by Pavel Emelianov

	http://lkml.org/lkml/2007/3/6/198

which was in turn similar to the RSS controller posted by me

	http://lkml.org/lkml/2007/2/26/8

Pavel's patches have a per container list of pages, which helps reduce
reclaim time of the RSS controller but the per container list of pages is
in FIFO order. I've implemented active and inactive lists per container to
help select the right set of pages to reclaim when the container is under
memory pressure.

I've tested these patches on a ppc64 machine and they work fine for
the minimal testing I've done.

Pavel would you please include these patches in your next iteration.

Comments, suggestions and further improvements are as always welcome!

Signed-off-by: [EMAIL PROTECTED]
---

 include/linux/rss_container.h |1 
 mm/rss_container.c|   47 +++---
 mm/swap.c |5 
 mm/vmscan.c   |3 ++
 4 files changed, 44 insertions(+), 12 deletions(-)

diff -puN include/linux/rss_container.h~rss-container-lru2 include/linux/rss_container.h
--- linux-2.6.20/include/linux/rss_container.h~rss-container-lru2	2007-03-09 22:52:56.0 +0530
+++ linux-2.6.20-balbir/include/linux/rss_container.h	2007-03-10 00:39:59.0 +0530
@@ -19,6 +19,7 @@ int container_rss_prepare(struct page *,
 void container_rss_add(struct page_container *);
 void container_rss_del(struct page_container *);
 void container_rss_release(struct page_container *);
+void container_rss_move_lists(struct page *pg, bool active);
 
 int mm_init_container(struct mm_struct *mm, struct task_struct *tsk);
 void mm_free_container(struct mm_struct *mm);
diff -puN mm/rss_container.c~rss-container-lru2 mm/rss_container.c
--- linux-2.6.20/mm/rss_container.c~rss-container-lru2	2007-03-09 22:52:56.0 +0530
+++ linux-2.6.20-balbir/mm/rss_container.c	2007-03-10 02:42:54.0 +0530
@@ -17,7 +17,8 @@ static struct container_subsys rss_subsy
 
 struct rss_container {
 	struct res_counter res;
-	struct list_head page_list;
+	struct list_head inactive_list;
+	struct list_head active_list;
 	struct container_subsys_state css;
 };
 
@@ -96,6 +97,26 @@ void container_rss_release(struct page_c
 	kfree(pc);
 }
 
+void container_rss_move_lists(struct page *pg, bool active)
+{
+	struct rss_container *rss;
+	struct page_container *pc;
+
+	if (!page_mapped(pg))
+		return;
+
+	pc = page_container(pg);
+	BUG_ON(!pc);
+	rss = pc->cnt;
+
+	spin_lock_irq(&rss->res.lock);
+	if (active)
+		list_move(&pc->list, &rss->active_list);
+	else
+		list_move(&pc->list, &rss->inactive_list);
+	spin_unlock_irq(&rss->res.lock);
+}
+
 void container_rss_add(struct page_container *pc)
 {
 	struct page *pg;
@@ -105,7 +126,7 @@ void container_rss_add(struct page_conta
 	rss = pc->cnt;
 
 	spin_lock(&rss->res.lock);
-	list_add(&pc->list, &rss->page_list);
+	list_add(&pc->list, &rss->active_list);
 	spin_unlock(&rss->res.lock);
 
 	page_container(pg) = pc;
@@ -141,7 +162,10 @@ unsigned long container_isolate_pages(un
 	struct zone *z;
 
 	spin_lock_irq(&rss->res.lock);
-	src = &rss->page_list;
+	if (active)
+		src = &rss->active_list;
+	else
+		src = &rss->inactive_list;
 
 	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
 		pc = list_entry(src->prev, struct page_container, list);
@@ -152,13 +176,10 @@ unsigned long container_isolate_pages(un
 
 		spin_lock(&z->lru_lock);
 		if (PageLRU(page)) {
-			if ((active && PageActive(page)) ||
-			    (!active && !PageActive(page))) {
-				if (likely(get_page_unless_zero(page))) {
-					ClearPageLRU(page);
-					nr_taken++;
-					list_move(&page->lru, dst);
-				}
+			if (likely(get_page_unless_zero(page))) {
+				ClearPageLRU(page);
+				nr_taken++;
+				list_move(&page->lru, dst);
 			}
 		}
 		spin_unlock(&z->lru_lock);
@@ -212,7 +233,8 @@ static int rss_create(struct container_s
 		return -ENOMEM;
 
 	res_counter_init(&rss->res);
-	INIT_LIST_HEAD(&rss->page_list);
+	INIT_LIST_HEAD(&rss->inactive_list);
+	INIT_LIST_HEAD(&rss->active_list);
 	cont->subsys[rss_subsys.subsys_id] = &rss->css;
 	return 0;
 }
@@ -284,7 +306,8 @@ static __init int rss_create_early(struc
 
 	rss = &init_rss_container;
 	res_counter_init(&rss->res);
-	INIT_LIST_HEAD(&rss->page_list);
+	INIT_LIST_HEAD(&rss->inactive_list);
+	INIT_LIST_HEAD(&rss->active_list);
 	cont->subsys[rss_subsys.subsys_id] = &rss->css;
 	ss-create = rss_create;
 	return 0;
diff -puN mm/vmscan.c~rss-container-lru2 mm/vmscan.c
--- linux-2.6.20/mm/vmscan.c~rss-container-lru2	2007-03-09 22:52:56.0 +0530
+++ linux-2.6.20-balbir/mm/vmscan.c	2007-03-10 00:42:35.0 +0530
@@ -1142,6 +1142,7 @@ static unsigned long container_shrink_pa
 			else
 add_page_to_inactive_list(z, page);
 			spin_unlock_irq(&z->lru_lock);
+			container_rss_move_lists(page, false);
 
 			put_page(page);
 		}
@@ -1191,6 +1192,7 @@ static void 

[RFC][PATCH 5/7] Per-container OOM killer and page reclamation

2007-03-06 Thread Pavel Emelianov
* container_try_to_free_pages() walks containers
  page list and tries to shrink pages. This is based
  on try_to_free_pages() and Co code.
  Called from core code when no resource left at the
  moment of page touching.

* container_out_of_memory() selects a process to be
  killed which mm_struct belongs to container in question.
  Called from core code when no resources left and no
  pages were reclaimed.
diff -upr linux-2.6.20.orig/mm/oom_kill.c linux-2.6.20-0/mm/oom_kill.c
--- linux-2.6.20.orig/mm/oom_kill.c 2007-03-06 13:33:28.0 +0300
+++ linux-2.6.20-0/mm/oom_kill.c2007-03-06 13:33:28.0 +0300
@@ -24,6 +24,7 @@
 #include <linux/cpuset.h>
 #include <linux/module.h>
 #include <linux/notifier.h>
+#include <linux/rss_container.h>
 
 int sysctl_panic_on_oom;
 /* #define DEBUG */
@@ -47,7 +48,8 @@ int sysctl_panic_on_oom;
  *of least surprise ... (be careful when you change it)
  */
 
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime,
+   struct rss_container *rss)
 {
unsigned long points, cpu_time, run_time, s;
struct mm_struct *mm;
@@ -60,6 +62,13 @@ unsigned long badness(struct task_struct
return 0;
}
 
+#ifdef CONFIG_RSS_CONTAINER
+   if (rss != NULL && mm->rss_container != rss) {
+   task_unlock(p);
+   return 0;
+   }
+#endif
+
/*
 * The memory size of the process is the basis for the badness.
 */
@@ -200,7 +209,8 @@ static inline int constrained_alloc(stru
  *
  * (not docbooked, we don't want this one cluttering up the manual)
  */
-static struct task_struct *select_bad_process(unsigned long *ppoints)
+static struct task_struct *select_bad_process(unsigned long *ppoints,
+   struct rss_container *rss)
 {
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
@@ -254,7 +264,7 @@ static struct task_struct *select_bad_pr
if (p->oomkilladj == OOM_DISABLE)
continue;
 
-   points = badness(p, uptime.tv_sec);
+   points = badness(p, uptime.tv_sec, rss);
if (points > *ppoints || !chosen) {
chosen = p;
*ppoints = points;
@@ -435,7 +445,7 @@ retry:
 * Rambo mode: Shoot down a process and hope it solves whatever
 * issues we may have.
 */
-   p = select_bad_process(&points);
+   p = select_bad_process(&points, NULL);
 
if (PTR_ERR(p) == -1UL)
goto out;
@@ -464,3 +474,27 @@ out:
if (!test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
 }
+
+#ifdef CONFIG_RSS_CONTAINER
+void container_out_of_memory(struct rss_container *rss)
+{
+   unsigned long points = 0;
+   struct task_struct *p;
+
+   container_lock();
+   read_lock(&tasklist_lock);
+retry:
+   p = select_bad_process(&points, rss);
+   if (PTR_ERR(p) == -1UL)
+   goto out;
+
+   if (!p)
+   p = current;
+
+   if (oom_kill_process(p, points, "Container out of memory"))
+   goto retry;
+out:
+   read_unlock(&tasklist_lock);
+   container_unlock();
+}
+#endif
diff -upr linux-2.6.20.orig/mm/vmscan.c linux-2.6.20-0/mm/vmscan.c
--- linux-2.6.20.orig/mm/vmscan.c   2007-02-04 21:44:54.0 +0300
+++ linux-2.6.20-0/mm/vmscan.c  2007-03-06 13:33:28.0 +0300
@@ -45,6 +45,8 @@
 
 #include "internal.h"
 
+#include <linux/rss_container.h>
+
 struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -1097,6 +1099,194 @@ out:
return ret;
 }
 
+#ifdef CONFIG_RSS_CONTAINER
+/*
+ * These are containers' inactive and active pages shrinkers.
+ * Thes works like shrink_inactive_list() and shrink_active_list()
+ *
+ * Two main differences is that container_isolate_pages() is used to isolate
+ * pages, and that reclaim_mapped is considered to be 1 as hitting BC
+ * limit implies we have to shrink _mapped_ pages
+ */
+static unsigned long container_shrink_pages_inactive(unsigned long max_scan,
+   struct rss_container *rss, struct scan_control *sc)
+{
+   LIST_HEAD(page_list);
+   unsigned long nr_scanned = 0;
+   unsigned long nr_reclaimed = 0;
+
+   do {
+   struct page *page;
+   unsigned long nr_taken;
+   unsigned long nr_scan;
+   struct zone *z;
+
+   nr_taken = container_isolate_pages(sc->swap_cluster_max, rss,
+   &page_list, 0, &nr_scan);
+
+   nr_scanned += nr_scan;
+   nr_reclaimed += shrink_page_list(&page_list, sc);
+   if (nr_taken == 0)
+   goto done;
+
+   while (!list_empty(&page_list)) {
+   page = lru_to_page(&page_list);
+   z = page_zone(page);
+
+   

[RFC][PATCH 5/7] Per-container OOM killer and page reclamation

2007-03-06 Thread Pavel Emelianov
* container_try_to_free_pages() walks containers
  page list and tries to shrink pages. This is based
  on try_to_free_pages() and Co code.
  Called from core code when no resource left at the
  moment of page touching.

* container_out_of_memory() selects a process to be
  killed which mm_struct belongs to container in question.
  Called from core code when no resources left and no
  pages were reclaimed.
diff -upr linux-2.6.20.orig/mm/oom_kill.c linux-2.6.20-0/mm/oom_kill.c
--- linux-2.6.20.orig/mm/oom_kill.c 2007-03-06 13:33:28.0 +0300
+++ linux-2.6.20-0/mm/oom_kill.c2007-03-06 13:33:28.0 +0300
@@ -24,6 +24,7 @@
 #include <linux/cpuset.h>
 #include <linux/module.h>
 #include <linux/notifier.h>
+#include <linux/rss_container.h>
 
 int sysctl_panic_on_oom;
 /* #define DEBUG */
@@ -47,7 +48,8 @@ int sysctl_panic_on_oom;
  *of least surprise ... (be careful when you change it)
  */
 
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime,
+   struct rss_container *rss)
 {
unsigned long points, cpu_time, run_time, s;
struct mm_struct *mm;
@@ -60,6 +62,13 @@ unsigned long badness(struct task_struct
return 0;
}
 
+#ifdef CONFIG_RSS_CONTAINER
+   if (rss != NULL && mm->rss_container != rss) {
+   task_unlock(p);
+   return 0;
+   }
+#endif
+
/*
 * The memory size of the process is the basis for the badness.
 */
@@ -200,7 +209,8 @@ static inline int constrained_alloc(stru
  *
  * (not docbooked, we don't want this one cluttering up the manual)
  */
-static struct task_struct *select_bad_process(unsigned long *ppoints)
+static struct task_struct *select_bad_process(unsigned long *ppoints,
+   struct rss_container *rss)
 {
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
@@ -254,7 +264,7 @@ static struct task_struct *select_bad_pr
 if (p->oomkilladj == OOM_DISABLE)
 continue;
 
-   points = badness(p, uptime.tv_sec);
+   points = badness(p, uptime.tv_sec, rss);
 if (points > *ppoints || !chosen) {
chosen = p;
*ppoints = points;
@@ -435,7 +445,7 @@ retry:
 * Rambo mode: Shoot down a process and hope it solves whatever
 * issues we may have.
 */
-   p = select_bad_process(&points);
+   p = select_bad_process(&points, NULL);
 
if (PTR_ERR(p) == -1UL)
goto out;
@@ -464,3 +474,27 @@ out:
if (!test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
 }
+
+#ifdef CONFIG_RSS_CONTAINER
+void container_out_of_memory(struct rss_container *rss)
+{
+   unsigned long points = 0;
+   struct task_struct *p;
+
+   container_lock();
+   read_lock(&tasklist_lock);
+retry:
+   p = select_bad_process(&points, rss);
+   if (PTR_ERR(p) == -1UL)
+   goto out;
+
+   if (!p)
+   p = current;
+
+   if (oom_kill_process(p, points, "Container out of memory"))
+   goto retry;
+out:
+   read_unlock(&tasklist_lock);
+   container_unlock();
+}
+#endif
diff -upr linux-2.6.20.orig/mm/vmscan.c linux-2.6.20-0/mm/vmscan.c
--- linux-2.6.20.orig/mm/vmscan.c   2007-02-04 21:44:54.0 +0300
+++ linux-2.6.20-0/mm/vmscan.c  2007-03-06 13:33:28.0 +0300
@@ -45,6 +45,8 @@
 
 #include "internal.h"
 
+#include <linux/rss_container.h>
+
 struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -1097,6 +1099,194 @@ out:
return ret;
 }
 
+#ifdef CONFIG_RSS_CONTAINER
+/*
+ * These are containers' inactive and active pages shrinkers.
+ * Thes works like shrink_inactive_list() and shrink_active_list()
+ *
+ * Two main differences is that container_isolate_pages() is used to isolate
+ * pages, and that reclaim_mapped is considered to be 1 as hitting BC
+ * limit implies we have to shrink _mapped_ pages
+ */
+static unsigned long container_shrink_pages_inactive(unsigned long max_scan,
+   struct rss_container *rss, struct scan_control *sc)
+{
+   LIST_HEAD(page_list);
+   unsigned long nr_scanned = 0;
+   unsigned long nr_reclaimed = 0;
+
+   do {
+   struct page *page;
+   unsigned long nr_taken;
+   unsigned long nr_scan;
+   struct zone *z;
+
+   nr_taken = container_isolate_pages(sc->swap_cluster_max, rss,
+   &page_list, 0, &nr_scan);
+
+   nr_scanned += nr_scan;
+   nr_reclaimed += shrink_page_list(&page_list, sc);
+   if (nr_taken == 0)
+   goto done;
+
+   while (!list_empty(&page_list)) {
+