[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-30 Thread Andrea Righi
On Fri, Mar 05, 2010 at 10:12:34AM +0900, Daisuke Nishimura wrote:
 On Thu,  4 Mar 2010 11:40:14 +0100, Andrea Righi ari...@develer.com wrote:
  Infrastructure to account dirty pages per cgroup and add dirty limit
  interfaces in the cgroupfs:
  
   - Direct write-out: memory.dirty_ratio, memory.dirty_bytes
  
   - Background write-out: memory.dirty_background_ratio, 
  memory.dirty_background_bytes
  
  Signed-off-by: Andrea Righi ari...@develer.com
  ---
   include/linux/memcontrol.h |   80 -
   mm/memcontrol.c|  420 
  +++-
   2 files changed, 450 insertions(+), 50 deletions(-)
  
  diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
  index 1f9b119..cc3421b 100644
  --- a/include/linux/memcontrol.h
  +++ b/include/linux/memcontrol.h
  @@ -19,12 +19,66 @@
   
   #ifndef _LINUX_MEMCONTROL_H
   #define _LINUX_MEMCONTROL_H
  +
  +#include <linux/writeback.h>
   #include <linux/cgroup.h>
  +
   struct mem_cgroup;
   struct page_cgroup;
   struct page;
   struct mm_struct;
   
  +/* Cgroup memory statistics items exported to the kernel */
  +enum mem_cgroup_page_stat_item {
  +   MEMCG_NR_DIRTYABLE_PAGES,
  +   MEMCG_NR_RECLAIM_PAGES,
  +   MEMCG_NR_WRITEBACK,
  +   MEMCG_NR_DIRTY_WRITEBACK_PAGES,
  +};
  +
  +/* Dirty memory parameters */
  +struct dirty_param {
  +   int dirty_ratio;
  +   unsigned long dirty_bytes;
  +   int dirty_background_ratio;
  +   unsigned long dirty_background_bytes;
  +};
  +
  +/*
  + * Statistics for memory cgroup.
  + */
  +enum mem_cgroup_stat_index {
  +   /*
  +* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
  +*/
  +   MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
  +   MEM_CGROUP_STAT_RSS,   /* # of pages charged as anon rss */
  +   MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
  +   MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
  +   MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
  +   MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
  +   MEM_CGROUP_EVENTS,  /* incremented at every  pagein/pageout */
  +   MEM_CGROUP_STAT_FILE_DIRTY,   /* # of dirty pages in page cache */
  +   MEM_CGROUP_STAT_WRITEBACK,   /* # of pages under writeback */
  +   MEM_CGROUP_STAT_WRITEBACK_TEMP,   /* # of pages under writeback using
  +   temporary buffers */
  +   MEM_CGROUP_STAT_UNSTABLE_NFS,   /* # of NFS unstable pages */
  +
  +   MEM_CGROUP_STAT_NSTATS,
  +};
  +
 I must have said it earlier, but I don't think exporting all of these flags
 is a good idea.
 Can you export only mem_cgroup_page_stat_item(of course, need to add 
 MEMCG_NR_FILE_MAPPED)?
 We can translate mem_cgroup_page_stat_item to mem_cgroup_stat_index by simple 
 arithmetic
 if you define MEM_CGROUP_STAT_FILE_MAPPED..MEM_CGROUP_STAT_UNSTABLE_NFS 
 sequentially.

Agreed.
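
For reference, the translation by simple arithmetic could look like this (a
sketch only, assuming the exported enum gains MEMCG_NR_FILE_MAPPED and mirrors
the order of the MEM_CGROUP_STAT_FILE_MAPPED..MEM_CGROUP_STAT_UNSTABLE_NFS
block; the helper below is hypothetical, not part of the posted patch):

/*
 * Sketch: valid only if both enums keep the corresponding entries in the
 * same, contiguous order.
 */
static inline enum mem_cgroup_stat_index
memcg_page_stat_to_stat_index(enum mem_cgroup_page_stat_item item)
{
	return MEM_CGROUP_STAT_FILE_MAPPED + (item - MEMCG_NR_FILE_MAPPED);
}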

 
  +/*
  + * TODO: provide a validation check routine. And retry if validation
  + * fails.
  + */
  +static inline void get_global_dirty_param(struct dirty_param *param)
  +{
  +   param->dirty_ratio = vm_dirty_ratio;
  +   param->dirty_bytes = vm_dirty_bytes;
  +   param->dirty_background_ratio = dirty_background_ratio;
  +   param->dirty_background_bytes = dirty_background_bytes;
  +}
  +
   #ifdef CONFIG_CGROUP_MEM_RES_CTLR
   /*
* All charge functions with gfp_mask should use GFP_KERNEL or
  @@ -117,6 +171,10 @@ extern void mem_cgroup_print_oom_info(struct 
  mem_cgroup *memcg,
   extern int do_swap_account;
   #endif
   
  +extern bool mem_cgroup_has_dirty_limit(void);
  +extern void get_dirty_param(struct dirty_param *param);
  +extern s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item);
  +
   static inline bool mem_cgroup_disabled(void)
   {
  if (mem_cgroup_subsys.disabled)
  @@ -125,7 +183,8 @@ static inline bool mem_cgroup_disabled(void)
   }
   
   extern bool mem_cgroup_oom_called(struct task_struct *task);
  -void mem_cgroup_update_file_mapped(struct page *page, int val);
  +void mem_cgroup_update_stat(struct page *page,
  +   enum mem_cgroup_stat_index idx, int val);
   unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
  gfp_t gfp_mask, int nid,
  int zid);
  @@ -300,8 +359,8 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, 
  struct task_struct *p)
   {
   }
   
  -static inline void mem_cgroup_update_file_mapped(struct page *page,
  -   int val)
  +static inline void mem_cgroup_update_stat(struct page *page,
  +   enum mem_cgroup_stat_index idx, int val)
   {
   }
   
  @@ -312,6 +371,21 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct 
  zone *zone, int order,
  return 0;
   }
   
  +static inline bool mem_cgroup_has_dirty_limit(void)
  +{
  +   return false;
  +}
  +
  +static inline void get_dirty_param(struct 

[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-30 Thread Andrea Righi
On Fri, Mar 05, 2010 at 10:58:55AM +0900, KAMEZAWA Hiroyuki wrote:
 On Fri, 5 Mar 2010 10:12:34 +0900
 Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:
 
  On Thu,  4 Mar 2010 11:40:14 +0100, Andrea Righi ari...@develer.com wrote:
   Infrastructure to account dirty pages per cgroup and add dirty limit
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void 
   *data)
{
 int *val = data;
   @@ -1275,34 +1423,70 @@ static void record_last_oom(struct mem_cgroup 
   *mem)
}

/*
   - * Currently used to update mapped file statistics, but the routine can 
   be
   - * generalized to update other statistics as well.
   + * Generalized routine to update file cache's status for memcg.
   + *
  + * Before calling this, mapping->tree_lock should be held and preemption is
  + * disabled.  Then, it's guaranteed that the page is not uncharged while we
  + * access page_cgroup. We can make use of that.
 */
 IIUC, mapping->tree_lock is held with irq disabled, so I think saying
 "mapping->tree_lock should be held with irq disabled" would be enough.
 And, as far as I can see, callers of this function have not ensured this
 yet in [4/4].
  
  how about:
  
  void mem_cgroup_update_stat_locked(...)
  {
  ...
  }
  
  void mem_cgroup_update_stat_unlocked(mapping, ...)
  {
   spin_lock_irqsave(&mapping->tree_lock, ...);
  mem_cgroup_update_stat_locked();
  spin_unlock_irqrestore(...);
  }
 
 Rather than tree_lock, lock_page_cgroup() can be used if tree_lock is not 
 held.
 
   lock_page_cgroup();
   mem_cgroup_update_stat_locked();
   unlock_page_cgroup();
 
 Andrea-san, FILE_MAPPED is updated without tree_lock, at least. You can't depend
 on migration_lock for FILE_MAPPED.

Right. I'll consider this in the next version of the patch.

Thanks,
-Andrea


[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-08 Thread Daisuke Nishimura
On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki 
kamezawa.hir...@jp.fujitsu.com wrote:
 On Mon, 8 Mar 2010 11:17:24 +0900
 Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:
 
   But IIRC, clear_writeback is done under treelock No ?
   
  The place where NR_WRITEBACK is updated is out of tree_lock.
  
 1311 int test_clear_page_writeback(struct page *page)
 1312 {
 1313 struct address_space *mapping = page_mapping(page);
 1314 int ret;
 1315
 1316 if (mapping) {
 1317 struct backing_dev_info *bdi = mapping->backing_dev_info;
 1318 unsigned long flags;
 1319
 1320 spin_lock_irqsave(&mapping->tree_lock, flags);
 1321 ret = TestClearPageWriteback(page);
 1322 if (ret) {
 1323 radix_tree_tag_clear(&mapping->page_tree,
 1324 page_index(page),
 1325 PAGECACHE_TAG_WRITEBACK);
 1326 if (bdi_cap_account_writeback(bdi)) {
 1327 __dec_bdi_stat(bdi, BDI_WRITEBACK);
 1328 __bdi_writeout_inc(bdi);
 1329 }
 1330 }
 1331 spin_unlock_irqrestore(&mapping->tree_lock, flags);
 1332 } else {
 1333 ret = TestClearPageWriteback(page);
 1334 }
 1335 if (ret)
 1336 dec_zone_page_state(page, NR_WRITEBACK);
 1337 return ret;
 1338 }
 
 We can move this up to under tree_lock. Considering memcg, all our targets have
 a mapping.
 
 If we newly account bounce-buffers (for NILFS, FUSE, etc.), which have no
 ->mapping, we need a much more complex new charge/uncharge theory.
 
 But yes, adding a new lock scheme seems complicated. (Sorry Andrea.)
 My concern is performance. We may need some new re-implementation of
 locks/migrate/charge/uncharge.
 
I agree. Performance is my concern too.

I made a patch below and measured the time (average of 10 times) of a kernel build
on tmpfs (make -j8 on an 8-CPU machine with a 2.6.33 defconfig).

before
- root cgroup: 190.47 sec
- child cgroup: 192.81 sec

after
- root cgroup: 191.06 sec
- child cgroup: 193.06 sec

Hmm... about 0.3% slower for root, 0.1% slower for child.

===
From: Daisuke Nishimura nishim...@mxp.nes.nec.co.jp

In the current implementation, we don't have to disable irqs at lock_page_cgroup()
because the lock is never acquired in interrupt context.
But we are going to do so in a later patch, so this patch encloses all of the
lock_page_cgroup()/unlock_page_cgroup() pairs with local_irq_disable()/local_irq_enable().

Signed-off-by: Daisuke Nishimura nishim...@mxp.nes.nec.co.jp
---
 mm/memcontrol.c |   17 +
 1 files changed, 17 insertions(+), 0 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 02ea959..e5ae1a1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1359,6 +1359,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int 
val)
if (unlikely(!pc))
return;
 
+   local_irq_disable();
lock_page_cgroup(pc);
mem = pc->mem_cgroup;
if (!mem)
@@ -1374,6 +1375,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int 
val)
 
 done:
unlock_page_cgroup(pc);
+   local_irq_enable();
 }
 
 /*
@@ -1711,6 +1713,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct 
page *page)
VM_BUG_ON(!PageLocked(page));
 
pc = lookup_page_cgroup(page);
+   local_irq_disable();
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
mem = pc->mem_cgroup;
@@ -1726,6 +1729,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct 
page *page)
rcu_read_unlock();
}
unlock_page_cgroup(pc);
+   local_irq_enable();
return mem;
 }
 
@@ -1742,9 +1746,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup 
*mem,
if (!mem)
return;
 
+   local_irq_disable();
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
+   local_irq_enable();
mem_cgroup_cancel_charge(mem);
return;
}
@@ -1775,6 +1781,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup 
*mem,
mem_cgroup_charge_statistics(mem, pc, true);
 
unlock_page_cgroup(pc);
+   local_irq_enable();
/*
 * charge_statistics updated event counter. Then, check it.
 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -1844,12 +1851,14 @@ static int mem_cgroup_move_account(struct page_cgroup 
*pc,
struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
int ret = -EINVAL;
+   local_irq_disable();

[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-08 Thread KAMEZAWA Hiroyuki
On Mon, 8 Mar 2010 17:07:11 +0900
Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:

 On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki 
 kamezawa.hir...@jp.fujitsu.com wrote:
  On Mon, 8 Mar 2010 11:17:24 +0900
  Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:
  
But IIRC, clear_writeback is done under treelock No ?

   The place where NR_WRITEBACK is updated is out of tree_lock.
   
  1311 int test_clear_page_writeback(struct page *page)
  1312 {
  1313 struct address_space *mapping = page_mapping(page);
  1314 int ret;
  1315
  1316 if (mapping) {
  1317 struct backing_dev_info *bdi = 
   mapping-backing_dev_info;
  1318 unsigned long flags;
  1319
  1320 spin_lock_irqsave(mapping-tree_lock, flags);
  1321 ret = TestClearPageWriteback(page);
  1322 if (ret) {
  1323 radix_tree_tag_clear(mapping-page_tree,
  1324 page_index(page),
  1325 
   PAGECACHE_TAG_WRITEBACK);
  1326 if (bdi_cap_account_writeback(bdi)) {
  1327 __dec_bdi_stat(bdi, 
   BDI_WRITEBACK);
  1328 __bdi_writeout_inc(bdi);
  1329 }
  1330 }
  1331 spin_unlock_irqrestore(mapping-tree_lock, 
   flags);
  1332 } else {
  1333 ret = TestClearPageWriteback(page);
  1334 }
  1335 if (ret)
  1336 dec_zone_page_state(page, NR_WRITEBACK);
  1337 return ret;
  1338 }
  
  We can move this up to under tree_lock. Considering memcg, all our target 
  has mapping.
  
  If we newly account bounce-buffers (for NILFS, FUSE, etc..), which has no 
  -mapping,
  we need much more complex new charge/uncharge theory.
  
  But yes, adding new lock scheme seems complicated. (Sorry Andrea.)
  My concerns is performance. We may need somehing new re-implementation of
  locks/migrate/charge/uncharge.
  
 I agree. Performance is my concern too.
 
 I made a patch below and measured the time(average of 10 times) of kernel 
 build
 on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
 
 before
 - root cgroup: 190.47 sec
 - child cgroup: 192.81 sec
 
 after
 - root cgroup: 191.06 sec
 - child cgroup: 193.06 sec
 
 Hmm... about 0.3% slower for root, 0.1% slower for child.
 

Hmm... acceptable? (sounds like it's within the error range)

BTW, why local_irq_disable()?
Isn't local_irq_save()/restore() better?

Thanks,
-Kame

 ===
 From: Daisuke Nishimura nishim...@mxp.nes.nec.co.jp
 
 In current implementation, we don't have to disable irq at lock_page_cgroup()
 because the lock is never acquired in interrupt context.
 But we are going to do it in later patch, so this patch encloses all of
 lock_page_cgroup()/unlock_page_cgroup() with irq_disabled()/irq_enabled().
 
 Signed-off-by: Daisuke Nishimura nishim...@mxp.nes.nec.co.jp
 ---
  mm/memcontrol.c |   17 +
  1 files changed, 17 insertions(+), 0 deletions(-)
 
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 02ea959..e5ae1a1 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -1359,6 +1359,7 @@ void mem_cgroup_update_file_mapped(struct page *page, 
 int val)
   if (unlikely(!pc))
   return;
  
 + local_irq_disable();
   lock_page_cgroup(pc);
   mem = pc-mem_cgroup;
   if (!mem)
 @@ -1374,6 +1375,7 @@ void mem_cgroup_update_file_mapped(struct page *page, 
 int val)
  
  done:
   unlock_page_cgroup(pc);
 + local_irq_enable();
  }
  
  /*
 @@ -1711,6 +1713,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct 
 page *page)
   VM_BUG_ON(!PageLocked(page));
  
   pc = lookup_page_cgroup(page);
 + local_irq_disable();
   lock_page_cgroup(pc);
   if (PageCgroupUsed(pc)) {
   mem = pc-mem_cgroup;
 @@ -1726,6 +1729,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct 
 page *page)
   rcu_read_unlock();
   }
   unlock_page_cgroup(pc);
 + local_irq_enable();
   return mem;
  }
  
 @@ -1742,9 +1746,11 @@ static void __mem_cgroup_commit_charge(struct 
 mem_cgroup *mem,
   if (!mem)
   return;
  
 + local_irq_disable();
   lock_page_cgroup(pc);
   if (unlikely(PageCgroupUsed(pc))) {
   unlock_page_cgroup(pc);
 + local_irq_enable();
   mem_cgroup_cancel_charge(mem);
   return;
   }
 @@ -1775,6 +1781,7 @@ static void __mem_cgroup_commit_charge(struct 
 mem_cgroup *mem,
   mem_cgroup_charge_statistics(mem, pc, true);
  
   unlock_page_cgroup(pc);
 + local_irq_enable();
   /*
* charge_statistics updated event counter. Then, check it.
* 

[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-08 Thread Daisuke Nishimura
On Mon, 8 Mar 2010 17:31:00 +0900, KAMEZAWA Hiroyuki 
kamezawa.hir...@jp.fujitsu.com wrote:
 On Mon, 8 Mar 2010 17:07:11 +0900
 Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:
 
  On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki 
  kamezawa.hir...@jp.fujitsu.com wrote:
   On Mon, 8 Mar 2010 11:17:24 +0900
   Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:
   
 But IIRC, clear_writeback is done under treelock No ?
 
The place where NR_WRITEBACK is updated is out of tree_lock.

   1311 int test_clear_page_writeback(struct page *page)
   1312 {
   1313 struct address_space *mapping = page_mapping(page);
   1314 int ret;
   1315
   1316 if (mapping) {
   1317 struct backing_dev_info *bdi = 
mapping-backing_dev_info;
   1318 unsigned long flags;
   1319
   1320 spin_lock_irqsave(mapping-tree_lock, flags);
   1321 ret = TestClearPageWriteback(page);
   1322 if (ret) {
   1323 
radix_tree_tag_clear(mapping-page_tree,
   1324 
page_index(page),
   1325 
PAGECACHE_TAG_WRITEBACK);
   1326 if (bdi_cap_account_writeback(bdi)) {
   1327 __dec_bdi_stat(bdi, 
BDI_WRITEBACK);
   1328 __bdi_writeout_inc(bdi);
   1329 }
   1330 }
   1331 spin_unlock_irqrestore(mapping-tree_lock, 
flags);
   1332 } else {
   1333 ret = TestClearPageWriteback(page);
   1334 }
   1335 if (ret)
   1336 dec_zone_page_state(page, NR_WRITEBACK);
   1337 return ret;
   1338 }
   
   We can move this up to under tree_lock. Considering memcg, all our target 
   has mapping.
   
   If we newly account bounce-buffers (for NILFS, FUSE, etc..), which has no 
   -mapping,
   we need much more complex new charge/uncharge theory.
   
   But yes, adding new lock scheme seems complicated. (Sorry Andrea.)
   My concerns is performance. We may need somehing new re-implementation of
   locks/migrate/charge/uncharge.
   
  I agree. Performance is my concern too.
  
  I made a patch below and measured the time(average of 10 times) of kernel 
  build
  on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
  
  before
  - root cgroup: 190.47 sec
  - child cgroup: 192.81 sec
  
  after
  - root cgroup: 191.06 sec
  - child cgroup: 193.06 sec
  
  Hmm... about 0.3% slower for root, 0.1% slower for child.
  
 
 Hmm...accepatable ? (sounds it's in error-range)
 
 BTW, why local_irq_disable() ? 
 local_irq_save()/restore() isn't better ?
 
I don't have any strong reason. All of lock_page_cgroup() is *now* called with
irqs enabled, so I just used disable()/enable() instead of save()/restore().
I think disable()/enable() is better in those cases because we don't need to
save/restore the eflags register with pushf/popf, but I don't have any numbers;
there wouldn't be a big difference in performance anyway.
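
For reference, the two variants under discussion look roughly like this (a
sketch only; the wrapper names are hypothetical and not from any posted patch):

/* Variant 1: local_irq_disable()/enable(). Assumes irqs are enabled at
 * every call site; unconditionally re-enables them afterwards. */
static void memcg_update_stat_irq_off(struct page_cgroup *pc)
{
	local_irq_disable();
	lock_page_cgroup(pc);
	/* ... update the per-memcg counter ... */
	unlock_page_cgroup(pc);
	local_irq_enable();
}

/* Variant 2: local_irq_save()/restore(). Also safe when the caller already
 * has irqs disabled (e.g. under mapping->tree_lock), at the cost of
 * saving/restoring eflags (pushf/popf). */
static void memcg_update_stat_irq_save(struct page_cgroup *pc)
{
	unsigned long flags;

	local_irq_save(flags);
	lock_page_cgroup(pc);
	/* ... update the per-memcg counter ... */
	unlock_page_cgroup(pc);
	local_irq_restore(flags);
}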


Thanks,
Daisuke Nishimura.


[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-08 Thread KAMEZAWA Hiroyuki
On Tue, 9 Mar 2010 01:12:52 +0100
Andrea Righi ari...@develer.com wrote:

 On Mon, Mar 08, 2010 at 05:31:00PM +0900, KAMEZAWA Hiroyuki wrote:
  On Mon, 8 Mar 2010 17:07:11 +0900
  Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:
  
   On Mon, 8 Mar 2010 11:37:11 +0900, KAMEZAWA Hiroyuki 
   kamezawa.hir...@jp.fujitsu.com wrote:
On Mon, 8 Mar 2010 11:17:24 +0900
Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:

  But IIRC, clear_writeback is done under treelock No ?
  
 The place where NR_WRITEBACK is updated is out of tree_lock.
 
1311 int test_clear_page_writeback(struct page *page)
1312 {
1313 struct address_space *mapping = page_mapping(page);
1314 int ret;
1315
1316 if (mapping) {
1317 struct backing_dev_info *bdi = 
 mapping-backing_dev_info;
1318 unsigned long flags;
1319
1320 spin_lock_irqsave(mapping-tree_lock, flags);
1321 ret = TestClearPageWriteback(page);
1322 if (ret) {
1323 
 radix_tree_tag_clear(mapping-page_tree,
1324 
 page_index(page),
1325 
 PAGECACHE_TAG_WRITEBACK);
1326 if (bdi_cap_account_writeback(bdi)) {
1327 __dec_bdi_stat(bdi, 
 BDI_WRITEBACK);
1328 __bdi_writeout_inc(bdi);
1329 }
1330 }
1331 spin_unlock_irqrestore(mapping-tree_lock, 
 flags);
1332 } else {
1333 ret = TestClearPageWriteback(page);
1334 }
1335 if (ret)
1336 dec_zone_page_state(page, NR_WRITEBACK);
1337 return ret;
1338 }

We can move this up to under tree_lock. Considering memcg, all our 
target has mapping.

If we newly account bounce-buffers (for NILFS, FUSE, etc..), which has 
no -mapping,
we need much more complex new charge/uncharge theory.

But yes, adding new lock scheme seems complicated. (Sorry Andrea.)
My concerns is performance. We may need somehing new re-implementation 
of
locks/migrate/charge/uncharge.

   I agree. Performance is my concern too.
   
   I made a patch below and measured the time(average of 10 times) of kernel 
   build
   on tmpfs(make -j8 on 8 CPU machine with 2.6.33 defconfig).
   
   before
   - root cgroup: 190.47 sec
   - child cgroup: 192.81 sec
   
   after
   - root cgroup: 191.06 sec
   - child cgroup: 193.06 sec
   
   Hmm... about 0.3% slower for root, 0.1% slower for child.
   
  
  Hmm...accepatable ? (sounds it's in error-range)
  
  BTW, why local_irq_disable() ? 
  local_irq_save()/restore() isn't better ?
 
 Probably because it avoids the overhead of saving the flags? 
maybe.

 Anyway, it would make the code much more readable...
 
ok.

please go ahead in this direction. Nishimura-san, would you post an
independent patch? If not, Andrea-san, please.

Thanks,
-Kame



[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-08 Thread KAMEZAWA Hiroyuki
On Tue, 9 Mar 2010 09:18:45 +0900
Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:

 On Mon, 8 Mar 2010 17:31:00 +0900, KAMEZAWA Hiroyuki 
 kamezawa.hir...@jp.fujitsu.com wrote:
  On Mon, 8 Mar 2010 17:07:11 +0900
  Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:

  Hmm...accepatable ? (sounds it's in error-range)
  
  BTW, why local_irq_disable() ? 
  local_irq_save()/restore() isn't better ?
  
 I don't have any strong reason. All of lock_page_cgroup() is *now* called w/o 
 irq disabled,
 so I used just disable()/enable() instead of save()/restore().

My point is, this will be used under tree_lock soon.

Thanks,
-Kame



[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-08 Thread Daisuke Nishimura
On Tue, 9 Mar 2010 09:20:54 +0900, KAMEZAWA Hiroyuki 
kamezawa.hir...@jp.fujitsu.com wrote:
 On Tue, 9 Mar 2010 09:18:45 +0900
 Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:
 
  On Mon, 8 Mar 2010 17:31:00 +0900, KAMEZAWA Hiroyuki 
  kamezawa.hir...@jp.fujitsu.com wrote:
   On Mon, 8 Mar 2010 17:07:11 +0900
   Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:
 
   Hmm...accepatable ? (sounds it's in error-range)
   
   BTW, why local_irq_disable() ? 
   local_irq_save()/restore() isn't better ?
   
  I don't have any strong reason. All of lock_page_cgroup() is *now* called 
  w/o irq disabled,
  so I used just disable()/enable() instead of save()/restore().
 
 My point is, this will be used under treelock soon.
 
I agree.

I'll update the patch using save()/restore(), and repost later.


Thanks,
Daisuke Nishimura.


[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-07 Thread Daisuke Nishimura
 +/*
 + * mem_cgroup_update_page_stat_locked() - update memcg file cache's 
 accounting
 + * @page:the page involved in a file cache operation.
 + * @idx: the particular file cache statistic.
 + * @charge:  true to increment, false to decrement the statistic specified
 + *   by @idx.
 + *
 + * Update memory cgroup file cache's accounting from a locked context.
 + *
 + * NOTE: must be called with mapping->tree_lock held.
 + */
 +void mem_cgroup_update_page_stat_locked(struct page *page,
 + enum mem_cgroup_write_page_stat_item idx, bool charge)
 +{
 + struct address_space *mapping = page_mapping(page);
 + struct page_cgroup *pc;
 +
 + if (mem_cgroup_disabled())
 + return;
 + WARN_ON_ONCE(!irqs_disabled());
 + WARN_ON_ONCE(mapping && !spin_is_locked(&mapping->tree_lock));
 +
I think this is the wrong place to insert an assertion.
The problem with the page cgroup lock is that it can be interrupted in the
current implementation.
So,

a) it must not be acquired under another lock which can be acquired in
   interrupt context, such as mapping->tree_lock, to avoid:

      context1                        context2
                                      lock_page_cgroup(pcA)
      spin_lock_irq(tree_lock)
      lock_page_cgroup(pcA)           <interrupted>
        => fail                       spin_lock_irqsave(tree_lock)
                                        => fail

b) it must not be acquired in interrupt context to avoid:

      lock_page_cgroup(pcA)
      <interrupted>
      lock_page_cgroup(pcA)
        => fail

I think something like this would be better:

@@ -83,8 +83,14 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
	return page_zonenum(pc->page);
 }

+#include <linux/irqflags.h>
+#include <linux/hardirq.h>
 static inline void lock_page_cgroup(struct page_cgroup *pc)
 {
+#ifdef CONFIG_DEBUG_VM
+   WARN_ON_ONCE(irqs_disabled());
+   WARN_ON_ONCE(in_interrupt());
+#endif
	bit_spin_lock(PCG_LOCK, &pc->flags);
 }

 + pc = lookup_page_cgroup(page);
 + if (unlikely(!pc) || !PageCgroupUsed(pc))
 + return;
 + mem_cgroup_update_page_stat(pc, idx, charge);
 +}
 +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_locked);
 +
 +/*
 + * mem_cgroup_update_page_stat_unlocked() - update memcg file cache's 
 accounting
 + * @page:the page involved in a file cache operation.
 + * @idx: the particular file cache statistic.
 + * @charge:  true to increment, false to decrement the statistic specified
 + *   by @idx.
 + *
 + * Update memory cgroup file cache's accounting from an unlocked context.
 + */
 +void mem_cgroup_update_page_stat_unlocked(struct page *page,
 + enum mem_cgroup_write_page_stat_item idx, bool charge)
 +{
 + struct page_cgroup *pc;
 +
 + if (mem_cgroup_disabled())
 + return;
 + pc = lookup_page_cgroup(page);
 + if (unlikely(!pc) || !PageCgroupUsed(pc))
 + return;
 + lock_page_cgroup(pc);
 + mem_cgroup_update_page_stat(pc, idx, charge);
   unlock_page_cgroup(pc);
  }
 +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_unlocked);
  
IIUC, test_clear_page_writeback() (at least) can be called in interrupt context.
This means lock_page_cgroup() is called in interrupt context, that is,
case b) above can happen.
Hmm... I don't have any good idea for now except disabling irqs around the page
cgroup lock to avoid all of this mess.


Thanks,
Daisuke Nishimura.


[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-07 Thread KAMEZAWA Hiroyuki
On Mon, 8 Mar 2010 10:44:47 +0900
Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:

  +/*
  + * mem_cgroup_update_page_stat_locked() - update memcg file cache's 
  accounting
  + * @page:  the page involved in a file cache operation.
  + * @idx:   the particular file cache statistic.
  + * @charge:true to increment, false to decrement the statistic 
  specified
  + * by @idx.
  + *
  + * Update memory cgroup file cache's accounting from a locked context.
  + *
  + * NOTE: must be called with mapping-tree_lock held.
  + */
  +void mem_cgroup_update_page_stat_locked(struct page *page,
  +   enum mem_cgroup_write_page_stat_item idx, bool charge)
  +{
  +   struct address_space *mapping = page_mapping(page);
  +   struct page_cgroup *pc;
  +
  +   if (mem_cgroup_disabled())
  +   return;
  +   WARN_ON_ONCE(!irqs_disabled());
  +   WARN_ON_ONCE(mapping  !spin_is_locked(mapping-tree_lock));
  +
 I think this is a wrong place to insert assertion.
 The problem about page cgroup lock is that it can be interrupted in current 
 implementation.
 So,
 
 a) it must not be aquired under another lock which can be aquired in 
 interrupt context,
such as mapping-tree_lock, to avoid:
 
   context1context2
   lock_page_cgroup(pcA)
   spin_lock_irq(tree_lock)
   lock_page_cgroup(pcA)   interrupted
   =fail  spin_lock_irqsave(tree_lock)
   =fail
 
 b) it must not be aquired in interrupt context to avoid:
 
   lock_page_cgroup(pcA)
   interrupted
   lock_page_cgroup(pcA)
   =fail
 
 I think something like this would be better:
 
 @@ -83,8 +83,14 @@ static inline enum zone_type page_cgroup_zid(struct 
 page_cgroup *pc)
 return page_zonenum(pc-page);
  }
 
 +#include linux/irqflags.h
 +#include linux/hardirq.h
  static inline void lock_page_cgroup(struct page_cgroup *pc)
  {
 +#ifdef CONFIG_DEBUG_VM
 +   WARN_ON_ONCE(irqs_disabled());
 +   WARN_ON_ONCE(in_interrupt());
 +#endif
 bit_spin_lock(PCG_LOCK, pc-flags);
  }
 
  +   pc = lookup_page_cgroup(page);
  +   if (unlikely(!pc) || !PageCgroupUsed(pc))
  +   return;
  +   mem_cgroup_update_page_stat(pc, idx, charge);
  +}
  +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_locked);
  +
  +/*
  + * mem_cgroup_update_page_stat_unlocked() - update memcg file cache's 
  accounting
  + * @page:  the page involved in a file cache operation.
  + * @idx:   the particular file cache statistic.
  + * @charge:true to increment, false to decrement the statistic 
  specified
  + * by @idx.
  + *
  + * Update memory cgroup file cache's accounting from an unlocked context.
  + */
  +void mem_cgroup_update_page_stat_unlocked(struct page *page,
  +   enum mem_cgroup_write_page_stat_item idx, bool charge)
  +{
  +   struct page_cgroup *pc;
  +
  +   if (mem_cgroup_disabled())
  +   return;
  +   pc = lookup_page_cgroup(page);
  +   if (unlikely(!pc) || !PageCgroupUsed(pc))
  +   return;
  +   lock_page_cgroup(pc);
  +   mem_cgroup_update_page_stat(pc, idx, charge);
  unlock_page_cgroup(pc);
   }
  +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_unlocked);
   
 IIUC, test_clear_page_writeback(at least) can be called under interrupt 
 context.
 This means lock_page_cgroup() is called under interrupt context, that is,
 the case b) above can happen.
 hmm... I don't have any good idea for now except disabling irq around page 
 cgroup lock
 to avoid all of these mess things.
 

Hmm... simply IRQ-off for all updates?
But IIRC, clear_writeback is done under tree_lock, no?

Thanks,
-Kame

 
 Thanks,
 Daisuke Nishimura.
 



[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-07 Thread Daisuke Nishimura
On Mon, 8 Mar 2010 10:56:41 +0900, KAMEZAWA Hiroyuki 
kamezawa.hir...@jp.fujitsu.com wrote:
 On Mon, 8 Mar 2010 10:44:47 +0900
 Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:
 
   +/*
   + * mem_cgroup_update_page_stat_locked() - update memcg file cache's 
   accounting
   + * @page:the page involved in a file cache operation.
   + * @idx: the particular file cache statistic.
   + * @charge:  true to increment, false to decrement the statistic 
   specified
   + *   by @idx.
   + *
   + * Update memory cgroup file cache's accounting from a locked context.
   + *
   + * NOTE: must be called with mapping-tree_lock held.
   + */
   +void mem_cgroup_update_page_stat_locked(struct page *page,
   + enum mem_cgroup_write_page_stat_item idx, bool charge)
   +{
   + struct address_space *mapping = page_mapping(page);
   + struct page_cgroup *pc;
   +
   + if (mem_cgroup_disabled())
   + return;
   + WARN_ON_ONCE(!irqs_disabled());
   + WARN_ON_ONCE(mapping  !spin_is_locked(mapping-tree_lock));
   +
  I think this is a wrong place to insert assertion.
  The problem about page cgroup lock is that it can be interrupted in current 
  implementation.
  So,
  
  a) it must not be aquired under another lock which can be aquired in 
  interrupt context,
 such as mapping-tree_lock, to avoid:
  
  context1context2
  lock_page_cgroup(pcA)
  spin_lock_irq(tree_lock)
  lock_page_cgroup(pcA)   interrupted
  =fail  spin_lock_irqsave(tree_lock)
  =fail
  
  b) it must not be aquired in interrupt context to avoid:
  
  lock_page_cgroup(pcA)
  interrupted
  lock_page_cgroup(pcA)
  =fail
  
  I think something like this would be better:
  
  @@ -83,8 +83,14 @@ static inline enum zone_type page_cgroup_zid(struct 
  page_cgroup *pc)
  return page_zonenum(pc-page);
   }
  
  +#include linux/irqflags.h
  +#include linux/hardirq.h
   static inline void lock_page_cgroup(struct page_cgroup *pc)
   {
  +#ifdef CONFIG_DEBUG_VM
  +   WARN_ON_ONCE(irqs_disabled());
  +   WARN_ON_ONCE(in_interrupt());
  +#endif
  bit_spin_lock(PCG_LOCK, pc-flags);
   }
  
   + pc = lookup_page_cgroup(page);
   + if (unlikely(!pc) || !PageCgroupUsed(pc))
   + return;
   + mem_cgroup_update_page_stat(pc, idx, charge);
   +}
   +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_locked);
   +
   +/*
   + * mem_cgroup_update_page_stat_unlocked() - update memcg file cache's 
   accounting
   + * @page:the page involved in a file cache operation.
   + * @idx: the particular file cache statistic.
   + * @charge:  true to increment, false to decrement the statistic 
   specified
   + *   by @idx.
   + *
   + * Update memory cgroup file cache's accounting from an unlocked context.
   + */
   +void mem_cgroup_update_page_stat_unlocked(struct page *page,
   + enum mem_cgroup_write_page_stat_item idx, bool charge)
   +{
   + struct page_cgroup *pc;
   +
   + if (mem_cgroup_disabled())
   + return;
   + pc = lookup_page_cgroup(page);
   + if (unlikely(!pc) || !PageCgroupUsed(pc))
   + return;
   + lock_page_cgroup(pc);
   + mem_cgroup_update_page_stat(pc, idx, charge);
 unlock_page_cgroup(pc);
}
   +EXPORT_SYMBOL_GPL(mem_cgroup_update_page_stat_unlocked);

  IIUC, test_clear_page_writeback(at least) can be called under interrupt 
  context.
  This means lock_page_cgroup() is called under interrupt context, that is,
  the case b) above can happen.
  hmm... I don't have any good idea for now except disabling irq around page 
  cgroup lock
  to avoid all of these mess things.
  
 
 Hmm...simply IRQ-off for all updates ?
I think so, in the current code.
But after these changes, we must use local_irq_save()/restore()
instead of local_irq_disable()/enable() in mem_cgroup_update_page_stat().

 But IIRC, clear_writeback is done under treelock No ?
 
The place where NR_WRITEBACK is updated is out of tree_lock.

   1311 int test_clear_page_writeback(struct page *page)
   1312 {
   1313 struct address_space *mapping = page_mapping(page);
   1314 int ret;
   1315
   1316 if (mapping) {
   1317 struct backing_dev_info *bdi = mapping->backing_dev_info;
   1318 unsigned long flags;
   1319
   1320 spin_lock_irqsave(&mapping->tree_lock, flags);
   1321 ret = TestClearPageWriteback(page);
   1322 if (ret) {
   1323 radix_tree_tag_clear(&mapping->page_tree,
   1324 page_index(page),
   1325 PAGECACHE_TAG_WRITEBACK);
   1326 if (bdi_cap_account_writeback(bdi)) {
   1327   

[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-07 Thread KAMEZAWA Hiroyuki
On Mon, 8 Mar 2010 11:17:24 +0900
Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:

  But IIRC, clear_writeback is done under treelock No ?
  
 The place where NR_WRITEBACK is updated is out of tree_lock.
 
1311 int test_clear_page_writeback(struct page *page)
1312 {
1313 struct address_space *mapping = page_mapping(page);
1314 int ret;
1315
1316 if (mapping) {
1317 struct backing_dev_info *bdi = mapping->backing_dev_info;
1318 unsigned long flags;
1319
1320 spin_lock_irqsave(&mapping->tree_lock, flags);
1321 ret = TestClearPageWriteback(page);
1322 if (ret) {
1323 radix_tree_tag_clear(&mapping->page_tree,
1324 page_index(page),
1325 PAGECACHE_TAG_WRITEBACK);
1326 if (bdi_cap_account_writeback(bdi)) {
1327 __dec_bdi_stat(bdi, BDI_WRITEBACK);
1328 __bdi_writeout_inc(bdi);
1329 }
1330 }
1331 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1332 } else {
1333 ret = TestClearPageWriteback(page);
1334 }
1335 if (ret)
1336 dec_zone_page_state(page, NR_WRITEBACK);
1337 return ret;
1338 }

We can move this up to under tree_lock. Considering memcg, all our targets have
a mapping.

If we newly account bounce-buffers (for NILFS, FUSE, etc.), which have no
->mapping, we need a much more complex new charge/uncharge theory.

But yes, adding a new lock scheme seems complicated. (Sorry Andrea.)
My concern is performance. We may need some new re-implementation of
locks/migrate/charge/uncharge.
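
For illustration, the move being suggested would look roughly like this (a
sketch only, not a posted patch; keeping the !mapping branch unchanged is an
assumption):

int test_clear_page_writeback(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	int ret;

	if (mapping) {
		struct backing_dev_info *bdi = mapping->backing_dev_info;
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		ret = TestClearPageWriteback(page);
		if (ret) {
			radix_tree_tag_clear(&mapping->page_tree,
					page_index(page),
					PAGECACHE_TAG_WRITEBACK);
			if (bdi_cap_account_writeback(bdi)) {
				__dec_bdi_stat(bdi, BDI_WRITEBACK);
				__bdi_writeout_inc(bdi);
			}
			/* moved up: now runs under tree_lock with irqs off */
			dec_zone_page_state(page, NR_WRITEBACK);
		}
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
	} else {
		ret = TestClearPageWriteback(page);
		if (ret)
			dec_zone_page_state(page, NR_WRITEBACK);
	}
	return ret;
}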

Thanks,
-Kame





[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-04 Thread Kirill A. Shutemov
On Thu, Mar 4, 2010 at 12:40 PM, Andrea Righi ari...@develer.com wrote:
 Infrastructure to account dirty pages per cgroup and add dirty limit
 interfaces in the cgroupfs:

  - Direct write-out: memory.dirty_ratio, memory.dirty_bytes

  - Background write-out: memory.dirty_background_ratio, 
 memory.dirty_background_bytes

 Signed-off-by: Andrea Righi ari...@develer.com
 ---
  include/linux/memcontrol.h |   80 -
  mm/memcontrol.c            |  420 
 +++-
  2 files changed, 450 insertions(+), 50 deletions(-)

 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
 index 1f9b119..cc3421b 100644
 --- a/include/linux/memcontrol.h
 +++ b/include/linux/memcontrol.h
 @@ -19,12 +19,66 @@

  #ifndef _LINUX_MEMCONTROL_H
  #define _LINUX_MEMCONTROL_H
 +
 +#include linux/writeback.h
  #include linux/cgroup.h
 +
  struct mem_cgroup;
  struct page_cgroup;
  struct page;
  struct mm_struct;

 +/* Cgroup memory statistics items exported to the kernel */
 +enum mem_cgroup_page_stat_item {
 +       MEMCG_NR_DIRTYABLE_PAGES,
 +       MEMCG_NR_RECLAIM_PAGES,
 +       MEMCG_NR_WRITEBACK,
 +       MEMCG_NR_DIRTY_WRITEBACK_PAGES,
 +};
 +
 +/* Dirty memory parameters */
 +struct dirty_param {
 +       int dirty_ratio;
 +       unsigned long dirty_bytes;
 +       int dirty_background_ratio;
 +       unsigned long dirty_background_bytes;
 +};
 +
 +/*
 + * Statistics for memory cgroup.
 + */
 +enum mem_cgroup_stat_index {
 +       /*
 +        * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
 +        */
 +       MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
 +       MEM_CGROUP_STAT_RSS,       /* # of pages charged as anon rss */
 +       MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
 +       MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
 +       MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
 +       MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 +       MEM_CGROUP_EVENTS,      /* incremented at every  pagein/pageout */
 +       MEM_CGROUP_STAT_FILE_DIRTY,   /* # of dirty pages in page cache */
 +       MEM_CGROUP_STAT_WRITEBACK,   /* # of pages under writeback */
 +       MEM_CGROUP_STAT_WRITEBACK_TEMP,   /* # of pages under writeback using
 +                                               temporary buffers */
 +       MEM_CGROUP_STAT_UNSTABLE_NFS,   /* # of NFS unstable pages */
 +
 +       MEM_CGROUP_STAT_NSTATS,
 +};
 +
 +/*
 + * TODO: provide a validation check routine. And retry if validation
 + * fails.
 + */
 +static inline void get_global_dirty_param(struct dirty_param *param)
 +{
 +       param->dirty_ratio = vm_dirty_ratio;
 +       param->dirty_bytes = vm_dirty_bytes;
 +       param->dirty_background_ratio = dirty_background_ratio;
 +       param->dirty_background_bytes = dirty_background_bytes;
 +}
 +
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
  /*
  * All charge functions with gfp_mask should use GFP_KERNEL or
 @@ -117,6 +171,10 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup 
 *memcg,
  extern int do_swap_account;
  #endif

 +extern bool mem_cgroup_has_dirty_limit(void);
 +extern void get_dirty_param(struct dirty_param *param);
 +extern s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item);
 +
  static inline bool mem_cgroup_disabled(void)
  {
        if (mem_cgroup_subsys.disabled)
 @@ -125,7 +183,8 @@ static inline bool mem_cgroup_disabled(void)
  }

  extern bool mem_cgroup_oom_called(struct task_struct *task);
 -void mem_cgroup_update_file_mapped(struct page *page, int val);
 +void mem_cgroup_update_stat(struct page *page,
 +                       enum mem_cgroup_stat_index idx, int val);
  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                                                gfp_t gfp_mask, int nid,
                                                int zid);
 @@ -300,8 +359,8 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, 
 struct task_struct *p)
  {
  }

 -static inline void mem_cgroup_update_file_mapped(struct page *page,
 -                                                       int val)
 +static inline void mem_cgroup_update_stat(struct page *page,
 +                       enum mem_cgroup_stat_index idx, int val)
  {
  }

 @@ -312,6 +371,21 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone 
 *zone, int order,
        return 0;
  }

 +static inline bool mem_cgroup_has_dirty_limit(void)
 +{
 +       return false;
 +}
 +
 +static inline void get_dirty_param(struct dirty_param *param)
 +{
 +       get_global_dirty_param(param);
 +}
 +
 +static inline s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
 +{
 +       return -ENOSYS;
 +}
 +
  #endif /* CONFIG_CGROUP_MEM_CONT */

  #endif /* _LINUX_MEMCONTROL_H */
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 497b6f7..9842e7b 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -73,28 +73,23 @@ static int really_do_swap_account __initdata = 1; /* for 
 

[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-04 Thread Daisuke Nishimura
On Thu,  4 Mar 2010 11:40:14 +0100, Andrea Righi ari...@develer.com wrote:
 Infrastructure to account dirty pages per cgroup and add dirty limit
 interfaces in the cgroupfs:
 
  - Direct write-out: memory.dirty_ratio, memory.dirty_bytes
 
  - Background write-out: memory.dirty_background_ratio, 
 memory.dirty_background_bytes
 
 Signed-off-by: Andrea Righi ari...@develer.com
 ---
  include/linux/memcontrol.h |   80 -
  mm/memcontrol.c|  420 
 +++-
  2 files changed, 450 insertions(+), 50 deletions(-)
 
 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
 index 1f9b119..cc3421b 100644
 --- a/include/linux/memcontrol.h
 +++ b/include/linux/memcontrol.h
 @@ -19,12 +19,66 @@
  
  #ifndef _LINUX_MEMCONTROL_H
  #define _LINUX_MEMCONTROL_H
 +
 +#include linux/writeback.h
  #include linux/cgroup.h
 +
  struct mem_cgroup;
  struct page_cgroup;
  struct page;
  struct mm_struct;
  
 +/* Cgroup memory statistics items exported to the kernel */
 +enum mem_cgroup_page_stat_item {
 + MEMCG_NR_DIRTYABLE_PAGES,
 + MEMCG_NR_RECLAIM_PAGES,
 + MEMCG_NR_WRITEBACK,
 + MEMCG_NR_DIRTY_WRITEBACK_PAGES,
 +};
 +
 +/* Dirty memory parameters */
 +struct dirty_param {
 + int dirty_ratio;
 + unsigned long dirty_bytes;
 + int dirty_background_ratio;
 + unsigned long dirty_background_bytes;
 +};
 +
 +/*
 + * Statistics for memory cgroup.
 + */
 +enum mem_cgroup_stat_index {
 + /*
 +  * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
 +  */
 + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
 + MEM_CGROUP_STAT_RSS,   /* # of pages charged as anon rss */
 + MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
 + MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
 + MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
 + MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 + MEM_CGROUP_EVENTS,  /* incremented at every  pagein/pageout */
 + MEM_CGROUP_STAT_FILE_DIRTY,   /* # of dirty pages in page cache */
 + MEM_CGROUP_STAT_WRITEBACK,   /* # of pages under writeback */
 + MEM_CGROUP_STAT_WRITEBACK_TEMP,   /* # of pages under writeback using
 + temporary buffers */
 + MEM_CGROUP_STAT_UNSTABLE_NFS,   /* # of NFS unstable pages */
 +
 + MEM_CGROUP_STAT_NSTATS,
 +};
 +
I must have said it earlier, but I don't think exporting all of these flags
is a good idea.
Can you export only mem_cgroup_page_stat_item(of course, need to add 
MEMCG_NR_FILE_MAPPED)?
We can translate mem_cgroup_page_stat_item to mem_cgroup_stat_index by simple 
arithmetic
if you define MEM_CGROUP_STAT_FILE_MAPPED..MEM_CGROUP_STAT_UNSTABLE_NFS 
sequentially.

 +/*
 + * TODO: provide a validation check routine. And retry if validation
 + * fails.
 + */
 +static inline void get_global_dirty_param(struct dirty_param *param)
 +{
 + param->dirty_ratio = vm_dirty_ratio;
 + param->dirty_bytes = vm_dirty_bytes;
 + param->dirty_background_ratio = dirty_background_ratio;
 + param->dirty_background_bytes = dirty_background_bytes;
 +}
 +
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
  /*
   * All charge functions with gfp_mask should use GFP_KERNEL or
 @@ -117,6 +171,10 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup 
 *memcg,
  extern int do_swap_account;
  #endif
  
 +extern bool mem_cgroup_has_dirty_limit(void);
 +extern void get_dirty_param(struct dirty_param *param);
 +extern s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item);
 +
  static inline bool mem_cgroup_disabled(void)
  {
   if (mem_cgroup_subsys.disabled)
 @@ -125,7 +183,8 @@ static inline bool mem_cgroup_disabled(void)
  }
  
  extern bool mem_cgroup_oom_called(struct task_struct *task);
 -void mem_cgroup_update_file_mapped(struct page *page, int val);
 +void mem_cgroup_update_stat(struct page *page,
 + enum mem_cgroup_stat_index idx, int val);
  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
   gfp_t gfp_mask, int nid,
   int zid);
 @@ -300,8 +359,8 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, 
 struct task_struct *p)
  {
  }
  
 -static inline void mem_cgroup_update_file_mapped(struct page *page,
 - int val)
 +static inline void mem_cgroup_update_stat(struct page *page,
 + enum mem_cgroup_stat_index idx, int val)
  {
  }
  
 @@ -312,6 +371,21 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone 
 *zone, int order,
   return 0;
  }
  
 +static inline bool mem_cgroup_has_dirty_limit(void)
 +{
 + return false;
 +}
 +
 +static inline void get_dirty_param(struct dirty_param *param)
 +{
 + get_global_dirty_param(param);
 +}
 +
 +static inline s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
 

[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-04 Thread KAMEZAWA Hiroyuki
On Fri, 5 Mar 2010 10:12:34 +0900
Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:

 On Thu,  4 Mar 2010 11:40:14 +0100, Andrea Righi ari...@develer.com wrote:
  Infrastructure to account dirty pages per cgroup and add dirty limit
   static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
   {
  int *val = data;
  @@ -1275,34 +1423,70 @@ static void record_last_oom(struct mem_cgroup *mem)
   }
   
   /*
  - * Currently used to update mapped file statistics, but the routine can be
  - * generalized to update other statistics as well.
  + * Generalized routine to update file cache's status for memcg.
  + *
  + * Before calling this, mapping->tree_lock should be held and preemption is
  + * disabled.  Then, it's guaranteed that the page is not uncharged while we
  + * access page_cgroup. We can make use of that.
 */
 IIUC, mapping->tree_lock is held with irq disabled, so I think saying
 "mapping->tree_lock should be held with irq disabled" would be enough.
 And, as far as I can see, callers of this function have not ensured this yet
 in [4/4].
 
 how about:
 
   void mem_cgroup_update_stat_locked(...)
   {
   ...
   }
 
   void mem_cgroup_update_stat_unlocked(mapping, ...)
   {
    spin_lock_irqsave(&mapping->tree_lock, ...);
   mem_cgroup_update_stat_locked();
   spin_unlock_irqrestore(...);
   }

Rather than tree_lock, lock_page_cgroup() can be used if tree_lock is not held.

lock_page_cgroup();
mem_cgroup_update_stat_locked();
unlock_page_cgroup();

Andrea-san, FILE_MAPPED is updated without tree_lock, at least. You can't depend
on migration_lock for FILE_MAPPED.


 
  -void mem_cgroup_update_file_mapped(struct page *page, int val)
  +void mem_cgroup_update_stat(struct page *page,
  +   enum mem_cgroup_stat_index idx, int val)
   {
 I prefer void mem_cgroup_update_page_stat(struct page *, enum
 mem_cgroup_page_stat_item, ..)
 as I said above.
 
  struct mem_cgroup *mem;
  struct page_cgroup *pc;
   
  +   if (mem_cgroup_disabled())
  +   return;
  pc = lookup_page_cgroup(page);
  -   if (unlikely(!pc))
  +   if (unlikely(!pc) || !PageCgroupUsed(pc))
  return;
   
   -   lock_page_cgroup(pc);
   -   mem = pc->mem_cgroup;
   -   if (!mem)
   -   goto done;
   -
   -   if (!PageCgroupUsed(pc))
   -   goto done;
   -
   +   lock_page_cgroup_migrate(pc);
   /*
   -* Preemption is already disabled. We can use __this_cpu_xxx
   -*/
   -   __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
   -
   -done:
   -   unlock_page_cgroup(pc);
   +   * It's guaranteed that this page is never uncharged.
   +   * The only racy problem is moving account among memcgs.
   +   */
   +   switch (idx) {
   +   case MEM_CGROUP_STAT_FILE_MAPPED:
   +   if (val > 0)
   +   SetPageCgroupFileMapped(pc);
   +   else
   +   ClearPageCgroupFileMapped(pc);
   +   break;
   +   case MEM_CGROUP_STAT_FILE_DIRTY:
   +   if (val > 0)
   +   SetPageCgroupDirty(pc);
   +   else
   +   ClearPageCgroupDirty(pc);
   +   break;
   +   case MEM_CGROUP_STAT_WRITEBACK:
   +   if (val > 0)
   +   SetPageCgroupWriteback(pc);
   +   else
   +   ClearPageCgroupWriteback(pc);
   +   break;
   +   case MEM_CGROUP_STAT_WRITEBACK_TEMP:
   +   if (val > 0)
   +   SetPageCgroupWritebackTemp(pc);
   +   else
   +   ClearPageCgroupWritebackTemp(pc);
   +   break;
   +   case MEM_CGROUP_STAT_UNSTABLE_NFS:
   +   if (val > 0)
   +   SetPageCgroupUnstableNFS(pc);
   +   else
   +   ClearPageCgroupUnstableNFS(pc);
   +   break;
   +   default:
   +   BUG();
   +   break;
   +   }
   +   mem = pc->mem_cgroup;
   +   if (likely(mem))
   +   __this_cpu_add(mem->stat->count[idx], val);
   +   unlock_page_cgroup_migrate(pc);
    }
   +EXPORT_SYMBOL_GPL(mem_cgroup_update_stat);
   
   /*
* size of first charge trial. 32 comes from vmscan.c's magic value.
  @@ -1701,6 +1885,45 @@ static void __mem_cgroup_commit_charge(struct 
  mem_cgroup *mem,
  memcg_check_events(mem, pc-page);
   }
   
  +/*
  + * Update file cache accounted statistics on task migration.
  + *
  + * TODO: We don't move charges of file (including shmem/tmpfs) pages for 
  now.
  + * So, at the moment this function simply returns without updating 
  accounted
  + * statistics, because we deal only with anonymous pages here.
  + */
 This function is not unique to task migration. It's called from rmdir() too.
 So this comment isn't needed.
 
  +static void __mem_cgroup_update_file_stat(struct page_cgroup *pc,
  +   struct mem_cgroup *from, struct mem_cgroup *to)
  +{
   +   struct page *page = pc->page;
  +
  +   if (!page_mapped(page) || 

[Devel] Re: [PATCH -mmotm 3/4] memcg: dirty pages accounting and limiting infrastructure

2010-03-04 Thread Balbir Singh
* KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com [2010-03-05 10:58:55]:

 On Fri, 5 Mar 2010 10:12:34 +0900
 Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:
 
  On Thu,  4 Mar 2010 11:40:14 +0100, Andrea Righi ari...@develer.com wrote:
   Infrastructure to account dirty pages per cgroup and add dirty limit
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void 
   *data)
{
 int *val = data;
   @@ -1275,34 +1423,70 @@ static void record_last_oom(struct mem_cgroup 
   *mem)
}

/*
   - * Currently used to update mapped file statistics, but the routine can 
   be
   - * generalized to update other statistics as well.
   + * Generalized routine to update file cache's status for memcg.
   + *
   + * Before calling this, mapping-tree_lock should be held and preemption 
   is
   + * disabled.  Then, it's guarnteed that the page is not uncharged while 
   we
   + * access page_cgroup. We can make use of that.
 */
  IIUC, mapping-tree_lock is held with irq disabled, so I think 
  mapping-tree_lock
  should be held with irq disabled would be enouth.
  And, as far as I can see, callers of this function have not ensured this 
  yet in [4/4].
  
  how about:
  
  void mem_cgroup_update_stat_locked(...)
  {
  ...
  }
  
  void mem_cgroup_update_stat_unlocked(mapping, ...)
  {
  spin_lock_irqsave(mapping-tree_lock, ...);
  mem_cgroup_update_stat_locked();
  spin_unlock_irqrestore(...);
  }
 
 Rather than tree_lock, lock_page_cgroup() can be used if tree_lock is not 
 held.
 
   lock_page_cgroup();
   mem_cgroup_update_stat_locked();
   unlock_page_cgroup();
 
 Andrea-san, FILE_MAPPED is updated without treelock, at least. You can't 
 depend
 on migration_lock about FILE_MAPPED.


FILE_MAPPED is updated under the pte lock in the rmap context and under the
page_cgroup lock within update_file_mapped.
 

-- 
Three Cheers,
Balbir