[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote:
  @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask)
*/
   dirty_thresh += dirty_thresh / 10;  /* wh... */
   
  -if (global_page_state(NR_UNSTABLE_NFS) +
  -   global_page_state(NR_WRITEBACK) >= dirty_thresh)
  -   break;
  -congestion_wait(BLK_RW_ASYNC, HZ/10);
  +
  +   dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
  +   if (dirty < 0)
  +   dirty = global_page_state(NR_UNSTABLE_NFS) +
  +   global_page_state(NR_WRITEBACK);
 
 dirty is unsigned long. As mentioned last time, above will never be true?
 In general these patches look ok to me. I will do some testing with these.

Re-introduced the same bug. My bad. :(

The value returned from mem_cgroup_page_stat() can be negative, i.e.
when memory cgroup is disabled. We could simply use a long for dirty;
the unit is in number of pages, so s64 should be enough. Or cast dirty
to long only for the check (see below).
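
To illustrate the pitfall (a minimal sketch, not from the patch): with an
unsigned type the error check is dead code, because the comparison is done
as unsigned.

	unsigned long dirty = (unsigned long)-ENOMEM; /* e.g. memcg disabled */

	if (dirty < 0)		/* always false: unsigned comparison */
		;
	if ((long)dirty < 0)	/* true: the cast makes the sign visible */
		dirty = global_page_state(NR_UNSTABLE_NFS) +
			global_page_state(NR_WRITEBACK);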

Thanks!
-Andrea

Signed-off-by: Andrea Righi ari...@develer.com
---
 mm/page-writeback.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d83f41c..dbee976 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 
 
dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
-   if (dirty < 0)
+   if ((long)dirty < 0)
dirty = global_page_state(NR_UNSTABLE_NFS) +
global_page_state(NR_WRITEBACK);
if (dirty >= dirty_thresh)


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote:
 On Mon,  1 Mar 2010 22:23:40 +0100
 Andrea Righi ari...@develer.com wrote:
 
  Apply the cgroup dirty pages accounting and limiting infrastructure to
  the opportune kernel functions.
  
  Signed-off-by: Andrea Righi ari...@develer.com
 
 Seems nice.
 
 Hmm. the last problem is moving account between memcg.
 
 Right ?

Correct. This was actually the last item of the TODO list. Anyway, I'm
still considering if it's correct to move dirty pages when a task is
migrated from a cgroup to another. Currently, dirty pages just remain in
the original cgroup and are flushed depending on the original cgroup
settings. That is not totally wrong... at least moving the dirty pages
between memcgs should be optional (move_charge_at_immigrate?).

Thanks for your ack and the detailed review!

-Andrea


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Tue, Mar 02, 2010 at 12:11:10PM +0200, Kirill A. Shutemov wrote:
 On Mon, Mar 1, 2010 at 11:23 PM, Andrea Righi ari...@develer.com wrote:
  Apply the cgroup dirty pages accounting and limiting infrastructure to
  the opportune kernel functions.
 
  Signed-off-by: Andrea Righi ari...@develer.com
  ---
   fs/fuse/file.c      |    5 +++
   fs/nfs/write.c      |    4 ++
   fs/nilfs2/segment.c |   10 +-
   mm/filemap.c        |    1 +
   mm/page-writeback.c |   84 --
   mm/rmap.c           |    4 +-
   mm/truncate.c       |    2 +
   7 files changed, 76 insertions(+), 34 deletions(-)
 
  diff --git a/fs/fuse/file.c b/fs/fuse/file.c
  index a9f5e13..dbbdd53 100644
  --- a/fs/fuse/file.c
  +++ b/fs/fuse/file.c
  @@ -11,6 +11,7 @@
   #include <linux/pagemap.h>
   #include <linux/slab.h>
   #include <linux/kernel.h>
  +#include <linux/memcontrol.h>
   #include <linux/sched.h>
   #include <linux/module.h>
 
  @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 
         list_del(&req->writepages_entry);
         dec_bdi_stat(bdi, BDI_WRITEBACK);
  +       mem_cgroup_update_stat(req->pages[0],
  +                       MEM_CGROUP_STAT_WRITEBACK_TEMP, -1);
         dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
         bdi_writeout_inc(bdi);
         wake_up(&fi->page_waitq);
  @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page)
         req->inode = inode;
 
         inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
  +       mem_cgroup_update_stat(tmp_page,
  +                       MEM_CGROUP_STAT_WRITEBACK_TEMP, 1);
         inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
         end_page_writeback(page);
 
  diff --git a/fs/nfs/write.c b/fs/nfs/write.c
  index b753242..7316f7a 100644
  --- a/fs/nfs/write.c
  +++ b/fs/nfs/write.c
  @@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req)
                         req->wb_index,
                         NFS_PAGE_TAG_COMMIT);
         spin_unlock(&inode->i_lock);
  +       mem_cgroup_update_stat(req->wb_page, MEM_CGROUP_STAT_UNSTABLE_NFS, 1);
         inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
         inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE);
         __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
  @@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req)
         struct page *page = req->wb_page;
 
         if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
  +               mem_cgroup_update_stat(page, MEM_CGROUP_STAT_UNSTABLE_NFS, -1);
                 dec_zone_page_state(page, NR_UNSTABLE_NFS);
                 dec_bdi_stat(page->mapping->backing_dev_info, BDI_UNSTABLE);
                 return 1;
  @@ -1273,6 +1275,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
                 req = nfs_list_entry(head->next);
                 nfs_list_remove_request(req);
                 nfs_mark_request_commit(req);
  +               mem_cgroup_update_stat(req->wb_page,
  +                               MEM_CGROUP_STAT_UNSTABLE_NFS, -1);
                 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
                 dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
                                 BDI_UNSTABLE);
  diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
  index ada2f1b..aef6d13 100644
  --- a/fs/nilfs2/segment.c
  +++ b/fs/nilfs2/segment.c
  @@ -1660,8 +1660,11 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
         } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
         kunmap_atomic(kaddr, KM_USER0);
 
  -       if (!TestSetPageWriteback(clone_page))
  +       if (!TestSetPageWriteback(clone_page)) {
  +               mem_cgroup_update_stat(clone_page,
 
 s/clone_page/page/

mmh... shouldn't we use the same page used by TestSetPageWriteback() and
inc_zone_page_state()?

 
 And #include <linux/memcontrol.h> is missed.

OK.

I'll apply your fixes and post a new version.

Thanks for reviewing,
-Andrea


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Tue, Mar 02, 2010 at 01:09:24PM +0200, Kirill A. Shutemov wrote:
 On Tue, Mar 2, 2010 at 1:02 PM, Andrea Righi ari...@develer.com wrote:
  On Tue, Mar 02, 2010 at 12:11:10PM +0200, Kirill A. Shutemov wrote:
  On Mon, Mar 1, 2010 at 11:23 PM, Andrea Righi ari...@develer.com wrote:
   Apply the cgroup dirty pages accounting and limiting infrastructure to
   the opportune kernel functions.
  
   Signed-off-by: Andrea Righi ari...@develer.com
   ---
    fs/fuse/file.c      |    5 +++
    fs/nfs/write.c      |    4 ++
    fs/nilfs2/segment.c |   10 +-
    mm/filemap.c        |    1 +
    mm/page-writeback.c |   84 --
    mm/rmap.c           |    4 +-
    mm/truncate.c       |    2 +
    7 files changed, 76 insertions(+), 34 deletions(-)
  
   diff --git a/fs/fuse/file.c b/fs/fuse/file.c
   index a9f5e13..dbbdd53 100644
   --- a/fs/fuse/file.c
   +++ b/fs/fuse/file.c
   @@ -11,6 +11,7 @@
    #include <linux/pagemap.h>
    #include <linux/slab.h>
    #include <linux/kernel.h>
   +#include <linux/memcontrol.h>
    #include <linux/sched.h>
    #include <linux/module.h>
  
   @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
  
          list_del(&req->writepages_entry);
          dec_bdi_stat(bdi, BDI_WRITEBACK);
   +       mem_cgroup_update_stat(req->pages[0],
   +                       MEM_CGROUP_STAT_WRITEBACK_TEMP, -1);
          dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
          bdi_writeout_inc(bdi);
          wake_up(&fi->page_waitq);
   @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page)
          req->inode = inode;
  
          inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
   +       mem_cgroup_update_stat(tmp_page,
   +                       MEM_CGROUP_STAT_WRITEBACK_TEMP, 1);
          inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
          end_page_writeback(page);
  
   diff --git a/fs/nfs/write.c b/fs/nfs/write.c
   index b753242..7316f7a 100644
   --- a/fs/nfs/write.c
   +++ b/fs/nfs/write.c
   @@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req)
                          req->wb_index,
                          NFS_PAGE_TAG_COMMIT);
          spin_unlock(&inode->i_lock);
   +       mem_cgroup_update_stat(req->wb_page, MEM_CGROUP_STAT_UNSTABLE_NFS, 1);
          inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
          inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE);
          __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
   @@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req)
          struct page *page = req->wb_page;
  
          if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
   +               mem_cgroup_update_stat(page, MEM_CGROUP_STAT_UNSTABLE_NFS, -1);
                  dec_zone_page_state(page, NR_UNSTABLE_NFS);
                  dec_bdi_stat(page->mapping->backing_dev_info, BDI_UNSTABLE);
                  return 1;
   @@ -1273,6 +1275,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
                  req = nfs_list_entry(head->next);
                  nfs_list_remove_request(req);
                  nfs_mark_request_commit(req);
   +               mem_cgroup_update_stat(req->wb_page,
   +                               MEM_CGROUP_STAT_UNSTABLE_NFS, -1);
                  dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
                  dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
                                  BDI_UNSTABLE);
   diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
   index ada2f1b..aef6d13 100644
   --- a/fs/nilfs2/segment.c
   +++ b/fs/nilfs2/segment.c
   @@ -1660,8 +1660,11 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
          } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
          kunmap_atomic(kaddr, KM_USER0);
  
   -       if (!TestSetPageWriteback(clone_page))
   +       if (!TestSetPageWriteback(clone_page)) {
   +               mem_cgroup_update_stat(clone_page,
 
  s/clone_page/page/
 
  mmh... shouldn't we use the same page used by TestSetPageWriteback() and
  inc_zone_page_state()?
 
 Sorry, I've commented on the wrong hunk. It's for the next one.

Yes. Good catch! Will fix in the next version.

Thanks,
-Andrea


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Tue, Mar 02, 2010 at 02:48:56PM +0100, Peter Zijlstra wrote:
 On Mon, 2010-03-01 at 22:23 +0100, Andrea Righi wrote:
  Apply the cgroup dirty pages accounting and limiting infrastructure to
  the opportune kernel functions.
  
  Signed-off-by: Andrea Righi ari...@develer.com
  ---
 
  diff --git a/mm/page-writeback.c b/mm/page-writeback.c
  index 5a0f8f3..d83f41c 100644
  --- a/mm/page-writeback.c
  +++ b/mm/page-writeback.c
  @@ -137,13 +137,14 @@ static struct prop_descriptor vm_dirties;
*/
   static int calc_period_shift(void)
   {
  -   unsigned long dirty_total;
  +   unsigned long dirty_total, dirty_bytes;
   
  -   if (vm_dirty_bytes)
  -   dirty_total = vm_dirty_bytes / PAGE_SIZE;
  +   dirty_bytes = mem_cgroup_dirty_bytes();
  +   if (dirty_bytes)
 
 So you don't think 0 is a valid max dirty amount?

A value of 0 means disabled. It's used to select between dirty_ratio
and dirty_bytes. It's the same for the global vm_dirty_* parameters.

 
  +   dirty_total = dirty_bytes / PAGE_SIZE;
  else
  -   dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
  -   100;
  +   dirty_total = (mem_cgroup_dirty_ratio() *
  +   determine_dirtyable_memory()) / 100;
  return 2 + ilog2(dirty_total - 1);
   }
   
  @@ -408,14 +409,16 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
*/
   unsigned long determine_dirtyable_memory(void)
   {
  -   unsigned long x;
  -
  -   x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
  +   unsigned long memory;
  +   s64 memcg_memory;
   
  +   memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
  if (!vm_highmem_is_dirtyable)
  -   x -= highmem_dirtyable_memory(x);
  -
  -   return x + 1;   /* Ensure that we never return 0 */
  +   memory -= highmem_dirtyable_memory(memory);
  +   memcg_memory = mem_cgroup_page_stat(MEMCG_NR_DIRTYABLE_PAGES);
  +   if (memcg_memory < 0)
 
 And here you somehow return negative?
 
  +   return memory + 1;
  +   return min((unsigned long)memcg_memory, memory + 1);
   }
   
   void
  @@ -423,26 +426,28 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
   unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
   {
  unsigned long background;
  -   unsigned long dirty;
  +   unsigned long dirty, dirty_bytes, dirty_background;
  unsigned long available_memory = determine_dirtyable_memory();
  struct task_struct *tsk;
   
  -   if (vm_dirty_bytes)
  -   dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
  +   dirty_bytes = mem_cgroup_dirty_bytes();
  +   if (dirty_bytes)
 
 zero not valid again
 
  +   dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
  else {
  int dirty_ratio;
   
  -   dirty_ratio = vm_dirty_ratio;
  +   dirty_ratio = mem_cgroup_dirty_ratio();
   if (dirty_ratio < 5)
  dirty_ratio = 5;
  dirty = (dirty_ratio * available_memory) / 100;
  }
   
  -   if (dirty_background_bytes)
  -   background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
  +   dirty_background = mem_cgroup_dirty_background_bytes();
  +   if (dirty_background)
 
 idem
 
  +   background = DIV_ROUND_UP(dirty_background, PAGE_SIZE);
  else
  -   background = (dirty_background_ratio * available_memory) / 100;
  -
  +   background = (mem_cgroup_dirty_background_ratio() *
  +   available_memory) / 100;
  if (background = dirty)
  background = dirty / 2;
  tsk = current;
  @@ -508,9 +513,13 @@ static void balance_dirty_pages(struct address_space *mapping,
  get_dirty_limits(&background_thresh, &dirty_thresh,
  &bdi_thresh, bdi);
   
  -   nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
  +   nr_reclaimable = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES);
  +   nr_writeback = mem_cgroup_page_stat(MEMCG_NR_WRITEBACK);
  +   if ((nr_reclaimable < 0) || (nr_writeback < 0)) {
  +   nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
  global_page_state(NR_UNSTABLE_NFS);
 
 ??? why would a page_state be negative.. I see you return -ENOMEM on !
 cgroup, but how can one specify no dirty limit with this compiled in?
 
  -   nr_writeback = global_page_state(NR_WRITEBACK);
  +   nr_writeback = global_page_state(NR_WRITEBACK);
  +   }
   
  bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY);
  if (bdi_cap_account_unstable(bdi)) {
  @@ -611,10 +620,12 @@ static void balance_dirty_pages(struct address_space *mapping,
   * In normal mode, we start background writeout at the lower
   * background_thresh, to keep the amount of dirty memory low.
   */
  +   nr_reclaimable = 

[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Tue, Mar 02, 2010 at 07:20:26PM +0530, Balbir Singh wrote:
 * KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com [2010-03-02 17:23:16]:
 
  On Tue, 2 Mar 2010 09:01:58 +0100
  Andrea Righi ari...@develer.com wrote:
  
   On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote:
On Mon,  1 Mar 2010 22:23:40 +0100
Andrea Righi ari...@develer.com wrote:

 Apply the cgroup dirty pages accounting and limiting infrastructure to
 the opportune kernel functions.
 
 Signed-off-by: Andrea Righi ari...@develer.com

Seems nice.

Hmm. the last problem is moving account between memcg.

Right ?
   
   Correct. This was actually the last item of the TODO list. Anyway, I'm
   still considering if it's correct to move dirty pages when a task is
   migrated from a cgroup to another. Currently, dirty pages just remain in
   the original cgroup and are flushed depending on the original cgroup
   settings. That is not totally wrong... at least moving the dirty pages
   between memcgs should be optional (move_charge_at_immigrate?).
   
  
 My concern is 
  - migration between memcg is already supported
 - at task move
 - at rmdir
 
 Then, if you leave DIRTY_PAGE accounting to the original cgroup,
 the new cgroup (migration target)'s dirty page accounting may
 go negative, or hold an incorrect value. Please check the FILE_MAPPED
 implementation in __mem_cgroup_move_account()
 
 As
	if (page_mapped(page) && !PageAnon(page)) {
		/* Update mapped_file data for mem_cgroup */
		preempt_disable();
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}
 then, FILE_MAPPED never goes negative.
 
 
 Absolutely! I am not sure how complex dirty memory migration will be,
 but one way of working around it would be to disable migration of
 charges when the feature is enabled (dirty* is set in the memory
 cgroup). We might need additional logic to allow that to happen. 

I've started to look at dirty memory migration. First attempt is to add
DIRTY, WRITEBACK, etc. to page_cgroup flags and handle them in
__mem_cgroup_move_account(). Probably I'll have something ready for the
next version of the patch. I still need to figure out if this can work as
expected...
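
The idea, mirroring the FILE_MAPPED snippet quoted above, would be roughly
(a sketch only; flag and stat names are still tentative):

	/* in __mem_cgroup_move_account(), with a new per-page-cgroup flag */
	if (PageCgroupAcctDirty(pc)) {
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_DIRTY]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_DIRTY]);
	}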

-Andrea


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Tue, Mar 02, 2010 at 10:05:29AM -0500, Vivek Goyal wrote:
 On Mon, Mar 01, 2010 at 11:18:31PM +0100, Andrea Righi wrote:
  On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote:
@@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask)
  */
 dirty_thresh += dirty_thresh / 10;  /* wh... */
 
-if (global_page_state(NR_UNSTABLE_NFS) +
-   global_page_state(NR_WRITEBACK) >= dirty_thresh)
-   break;
-congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+   dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
+   if (dirty < 0)
+   dirty = global_page_state(NR_UNSTABLE_NFS) +
+   global_page_state(NR_WRITEBACK);
   
   dirty is unsigned long. As mentioned last time, above will never be true?
   In general these patches look ok to me. I will do some testing with these.
  
  Re-introduced the same bug. My bad. :(
  
  The value returned from mem_cgroup_page_stat() can be negative, i.e.
  when memory cgroup is disabled. We could simply use a long for dirty;
  the unit is in number of pages, so s64 should be enough. Or cast dirty
  to long only for the check (see below).
  
  Thanks!
  -Andrea
  
  Signed-off-by: Andrea Righi ari...@develer.com
  ---
   mm/page-writeback.c |2 +-
   1 files changed, 1 insertions(+), 1 deletions(-)
  
  diff --git a/mm/page-writeback.c b/mm/page-writeback.c
  index d83f41c..dbee976 100644
  --- a/mm/page-writeback.c
  +++ b/mm/page-writeback.c
  @@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
   
   
  dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
  -   if (dirty < 0)
  +   if ((long)dirty < 0)
 
 This will also be problematic as on 32bit systems, your upper limit of
 dirty memory will be 2G?
 
 I guess, I will prefer one of the two.
 
 - return the error code from function and pass a pointer to store stats
   in as function argument.
 
 - Or Peter's suggestion of checking mem_cgroup_has_dirty_limit() and if
   per cgroup dirty control is enabled, then use per cgroup stats. In that
   case you don't have to return negative values.
 
   Only tricky part will be careful accounting so that none of the stats go
   negative in corner cases of migration etc.

What do you think about Peter's suggestion + the locking stuff? (see the
previous email). Otherwise, I'll choose the other solution: passing a
pointer and always returning the error code is not bad.
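
Just to make that alternative concrete, a hypothetical signature (sketch
only, names assumed):

	/* returns 0 on success, a negative error (e.g. -ENOMEM) otherwise */
	int mem_cgroup_page_stat(enum memcg_page_stat_item item, s64 *val);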

Thanks,
-Andrea


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Tue, Mar 02, 2010 at 06:59:32PM -0500, Vivek Goyal wrote:
 On Tue, Mar 02, 2010 at 11:22:48PM +0100, Andrea Righi wrote:
  On Tue, Mar 02, 2010 at 10:05:29AM -0500, Vivek Goyal wrote:
   On Mon, Mar 01, 2010 at 11:18:31PM +0100, Andrea Righi wrote:
On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote:
  @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask)
*/
   dirty_thresh += dirty_thresh / 10;  /* wh... */
   
  -if (global_page_state(NR_UNSTABLE_NFS) +
  -   global_page_state(NR_WRITEBACK) >= dirty_thresh)
  -   break;
  -congestion_wait(BLK_RW_ASYNC, HZ/10);
  +
  +   dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
  +   if (dirty < 0)
  +   dirty = global_page_state(NR_UNSTABLE_NFS) +
  +   global_page_state(NR_WRITEBACK);
 
 dirty is unsigned long. As mentioned last time, above will never be true?
 In general these patches look ok to me. I will do some testing with these.

Re-introduced the same bug. My bad. :(

The value returned from mem_cgroup_page_stat() can be negative, i.e.
when memory cgroup is disabled. We could simply use a long for dirty;
the unit is in number of pages, so s64 should be enough. Or cast dirty
to long only for the check (see below).

Thanks!
-Andrea

Signed-off-by: Andrea Righi ari...@develer.com
---
 mm/page-writeback.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d83f41c..dbee976 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 
 
dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
-   if (dirty < 0)
+   if ((long)dirty < 0)
   
  This will also be problematic as on 32bit systems, your upper limit of
  dirty memory will be 2G?
   
   I guess, I will prefer one of the two.
   
   - return the error code from function and pass a pointer to store stats
 in as function argument.
   
   - Or Peter's suggestion of checking mem_cgroup_has_dirty_limit() and if
 per cgroup dirty control is enabled, then use per cgroup stats. In that
 case you don't have to return negative values.
   
  Only tricky part will be careful accounting so that none of the stats go
 negative in corner cases of migration etc.
  
  What do you think about Peter's suggestion + the locking stuff? (see the
  previous email). Otherwise, I'll choose the other solution: passing a
  pointer and always returning the error code is not bad.
  
 
 Ok, so you are worried that by the time we finish the mem_cgroup_has_dirty_limit()
 call, the task might change cgroup, and later we might call
 mem_cgroup_get_page_stat() on a different cgroup altogether, which might or
 might not have dirty limits specified?

Correct.

 
 But in what cases do you not want to use the memory cgroup specified limit? I
 thought cgroup disabled was the only case where we need to use global
 limits. Otherwise a memory cgroup will have either dirty_bytes specified
 or by default inherit the global dirty_ratio, which is a valid number. If
 that's the case then you don't have to take rcu_lock() outside
 get_page_stat()?
 
 IOW, apart from cgroup being disabled, what are the other cases where you
 expect to not use the cgroup's page stat and use global stats?

At boot, when mem_cgroup_from_task() may return NULL. But this is not
related to the RCU acquisition.

Anyway, probably the RCU protection is not so critical for this
particular case, and we can simply get rid of it. In this way we can
easily implement the interface proposed by Peter.
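
Something like this in the writeout path (a minimal sketch of that
interface, untested):

	if (mem_cgroup_has_dirty_limit())
		dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
	else
		dirty = global_page_state(NR_UNSTABLE_NFS) +
			global_page_state(NR_WRITEBACK);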

-Andrea


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Wed, Mar 03, 2010 at 08:21:07AM +0900, Daisuke Nishimura wrote:
 On Tue, 2 Mar 2010 23:18:23 +0100, Andrea Righi ari...@develer.com wrote:
  On Tue, Mar 02, 2010 at 07:20:26PM +0530, Balbir Singh wrote:
   * KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com [2010-03-02 17:23:16]:
   
On Tue, 2 Mar 2010 09:01:58 +0100
Andrea Righi ari...@develer.com wrote:

 On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote:
  On Mon,  1 Mar 2010 22:23:40 +0100
  Andrea Righi ari...@develer.com wrote:
  
   Apply the cgroup dirty pages accounting and limiting 
   infrastructure to
   the opportune kernel functions.
   
   Signed-off-by: Andrea Righi ari...@develer.com
  
  Seems nice.
  
  Hmm. the last problem is moving account between memcg.
  
  Right ?
 
 Correct. This was actually the last item of the TODO list. Anyway, I'm
 still considering if it's correct to move dirty pages when a task is
 migrated from a cgroup to another. Currently, dirty pages just remain 
 in
 the original cgroup and are flushed depending on the original cgroup
 settings. That is not totally wrong... at least moving the dirty pages
 between memcgs should be optional (move_charge_at_immigrate?).
 

My concern is 
 - migration between memcg is already supported
- at task move
- at rmdir

Then, if you leave DIRTY_PAGE accounting to the original cgroup,
the new cgroup (migration target)'s dirty page accounting may
go negative, or hold an incorrect value. Please check the FILE_MAPPED
implementation in __mem_cgroup_move_account()

As
	if (page_mapped(page) && !PageAnon(page)) {
		/* Update mapped_file data for mem_cgroup */
		preempt_disable();
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}
then, FILE_MAPPED never goes negative.
   
   
   Absolutely! I am not sure how complex dirty memory migration will be,
   but one way of working around it would be to disable migration of
   charges when the feature is enabled (dirty* is set in the memory
   cgroup). We might need additional logic to allow that to happen. 
  
  I've started to look at dirty memory migration. First attempt is to add
  DIRTY, WRITEBACK, etc. to page_cgroup flags and handle them in
  __mem_cgroup_move_account(). Probably I'll have something ready for the
  next version of the patch. I still need to figure if this can work as
  expected...
  
I agree it's the right direction (in fact, I have been planning to post a patch
in that direction), so I leave it to you.
Can you add a PCG_FILE_MAPPED flag too? I think this flag can be handled in the
same way as the other flags you're trying to add, and we can change
"if (page_mapped(page) && !PageAnon(page))" to "if (PageCgroupFileMapped(pc))"
in __mem_cgroup_move_account(). It would be cleaner than the current code, IMHO.

OK, sounds good to me. I'll introduce PCG_FILE_MAPPED in the next
version.
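
For reference, with such a flag the move path would look roughly like this
(a sketch based on the code quoted above, assuming the flag tracks the
FILE_MAPPED state):

	/* in __mem_cgroup_move_account() */
	if (PageCgroupFileMapped(pc)) {
		preempt_disable();
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}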

Thanks,
-Andrea


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Wed, Mar 03, 2010 at 05:21:32PM +0900, KAMEZAWA Hiroyuki wrote:
 On Wed, 3 Mar 2010 15:15:49 +0900
 KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com wrote:
 
  Agreed.
  Let's try how we can write the code in a clean way. (we have time ;)
  For now, to me, IRQ disabling while lock_page_cgroup() seems to be a little
  overkill. What I really want is lockless code... but it seems impossible
  under the current implementation.
  
  I wonder if the fact that the page is never uncharged under us can give us
  some chances... Hmm.
  
 
 How about this ? Basically, I don't like duplicating information...so,
 # of new pcg_flags may be able to be reduced.
 
 I'm glad this can be a hint for Andrea-san.

Many thanks! I already wrote pretty much the same code, but at this point I
think I'll just apply and test this one. ;)

-Andrea

 
 ==
 ---
  include/linux/page_cgroup.h |   44 -
  mm/memcontrol.c |   91 +++-
  2 files changed, 132 insertions(+), 3 deletions(-)
 
 Index: mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
 ===
 --- mmotm-2.6.33-Mar2.orig/include/linux/page_cgroup.h
 +++ mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
 @@ -39,6 +39,11 @@ enum {
   PCG_CACHE, /* charged as cache */
   PCG_USED, /* this object is in use. */
   PCG_ACCT_LRU, /* page has been accounted for */
 + PCG_MIGRATE_LOCK, /* used for mutual execution of account migration */
 + PCG_ACCT_DIRTY,
 + PCG_ACCT_WB,
 + PCG_ACCT_WB_TEMP,
 + PCG_ACCT_UNSTABLE,
  };
  
  #define TESTPCGFLAG(uname, lname)\
 @@ -73,6 +78,23 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
  TESTPCGFLAG(AcctLRU, ACCT_LRU)
  TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
  
 +SETPCGFLAG(AcctDirty, ACCT_DIRTY);
 +CLEARPCGFLAG(AcctDirty, ACCT_DIRTY);
 +TESTPCGFLAG(AcctDirty, ACCT_DIRTY);
 +
 +SETPCGFLAG(AcctWB, ACCT_WB);
 +CLEARPCGFLAG(AcctWB, ACCT_WB);
 +TESTPCGFLAG(AcctWB, ACCT_WB);
 +
 +SETPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
 +CLEARPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
 +TESTPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
 +
 +SETPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
 +CLEARPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
 +TESTPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
 +
 +
  static inline int page_cgroup_nid(struct page_cgroup *pc)
  {
   return page_to_nid(pc->page);
 @@ -82,7 +104,9 @@ static inline enum zone_type page_cgroup
  {
   return page_zonenum(pc->page);
  }
 -
 +/*
 + * lock_page_cgroup() should not be held under mapping->tree_lock
 + */
  static inline void lock_page_cgroup(struct page_cgroup *pc)
  {
   bit_spin_lock(PCG_LOCK, &pc->flags);
 @@ -93,6 +117,24 @@ static inline void unlock_page_cgroup(st
   bit_spin_unlock(PCG_LOCK, &pc->flags);
  }
  
 +/*
 + * Lock order is
 + *   lock_page_cgroup()
 + *   lock_page_cgroup_migrate()
 + * This lock is not a lock for charge/uncharge but for account moving,
 + * i.e. overwriting pc->mem_cgroup. The lock owner should guarantee by
 + * itself that the page is not uncharged while we hold this.
 + */
 +static inline void lock_page_cgroup_migrate(struct page_cgroup *pc)
 +{
 + bit_spin_lock(PCG_MIGRATE_LOCK, &pc->flags);
 +}
 +
 +static inline void unlock_page_cgroup_migrate(struct page_cgroup *pc)
 +{
 + bit_spin_unlock(PCG_MIGRATE_LOCK, &pc->flags);
 +}
 +
  #else /* CONFIG_CGROUP_MEM_RES_CTLR */
  struct page_cgroup;
  
 Index: mmotm-2.6.33-Mar2/mm/memcontrol.c
 ===
 --- mmotm-2.6.33-Mar2.orig/mm/memcontrol.c
 +++ mmotm-2.6.33-Mar2/mm/memcontrol.c
 @@ -87,6 +87,10 @@ enum mem_cgroup_stat_index {
   MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
   MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
   MEM_CGROUP_EVENTS,  /* incremented at every  pagein/pageout */
 + MEM_CGROUP_STAT_DIRTY,
 + MEM_CGROUP_STAT_WBACK,
 + MEM_CGROUP_STAT_WBACK_TEMP,
 + MEM_CGROUP_STAT_UNSTABLE_NFS,
  
   MEM_CGROUP_STAT_NSTATS,
  };
 @@ -1360,6 +1364,86 @@ done:
  }
  
  /*
 + * Update file cache's status for memcg. Before calling this,
 + * mapping->tree_lock should be held and preemption is disabled.
 + * Then, it's guaranteed that the page is not uncharged while we
 + * access page_cgroup. We can make use of that.
 + */
 +void mem_cgroup_update_stat_locked(struct page *page, int idx, bool set)
 +{
 + struct page_cgroup *pc;
 + struct mem_cgroup *mem;
 +
 + pc = lookup_page_cgroup(page);
 + /* Not accounted ? */
 + if (!PageCgroupUsed(pc))
 + return;
 + lock_page_cgroup_migrate(pc);
 + /*
 +  * It's guaranteed that this page is never uncharged.
 +  * The only racy problem is moving account among memcgs.
 +  */
 + switch (idx) {
 + case MEM_CGROUP_STAT_DIRTY:
 + if (set)
 + SetPageCgroupAcctDirty(pc);
 + else
 + ClearPageCgroupAcctDirty(pc);
 + 

[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Wed, Mar 03, 2010 at 12:47:03PM +0100, Andrea Righi wrote:
 On Tue, Mar 02, 2010 at 06:59:32PM -0500, Vivek Goyal wrote:
  On Tue, Mar 02, 2010 at 11:22:48PM +0100, Andrea Righi wrote:
   On Tue, Mar 02, 2010 at 10:05:29AM -0500, Vivek Goyal wrote:
On Mon, Mar 01, 2010 at 11:18:31PM +0100, Andrea Righi wrote:
 On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote:
   @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 */
dirty_thresh += dirty_thresh / 10;  /* wh... */

   -if (global_page_state(NR_UNSTABLE_NFS) +
   - global_page_state(NR_WRITEBACK) >= dirty_thresh)
   - break;
   -congestion_wait(BLK_RW_ASYNC, HZ/10);
   +
   + dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
   + if (dirty < 0)
   + dirty = global_page_state(NR_UNSTABLE_NFS) +
   + global_page_state(NR_WRITEBACK);
  
  dirty is unsigned long. As mentioned last time, above will never be true?
  In general these patches look ok to me. I will do some testing with these.
 
 Re-introduced the same bug. My bad. :(
 
 The value returned from mem_cgroup_page_stat() can be negative, i.e.
 when memory cgroup is disabled. We could simply use a long for dirty;
 the unit is in number of pages, so s64 should be enough. Or cast dirty
 to long only for the check (see below).
 
 Thanks!
 -Andrea
 
 Signed-off-by: Andrea Righi ari...@develer.com
 ---
  mm/page-writeback.c |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)
 
 diff --git a/mm/page-writeback.c b/mm/page-writeback.c
 index d83f41c..dbee976 100644
 --- a/mm/page-writeback.c
 +++ b/mm/page-writeback.c
 @@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
  
  
dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
 - if (dirty < 0)
 + if ((long)dirty < 0)

This will also be problematic as on 32bit systems, your upper limit of
dirty memory will be 2G?

I guess, I will prefer one of the two.

- return the error code from function and pass a pointer to store stats
  in as function argument.

- Or Peter's suggestion of checking mem_cgroup_has_dirty_limit() and if
  per cgroup dirty control is enabled, then use per cgroup stats. In 
that
  case you don't have to return negative values.

  Only tricky part will be careful accounting so that none of the stats go
  negative in corner cases of migration etc.
   
   What do you think about Peter's suggestion + the locking stuff? (see the
   previous email). Otherwise, I'll choose the other solution: passing a
   pointer and always returning the error code is not bad.
   
  
  Ok, so you are worried that by the time we finish the mem_cgroup_has_dirty_limit()
  call, the task might change cgroup, and later we might call
  mem_cgroup_get_page_stat() on a different cgroup altogether, which might or
  might not have dirty limits specified?
 
 Correct.
 
  
  But in what cases do you not want to use the memory cgroup specified limit? I
  thought cgroup disabled was the only case where we need to use global
  limits. Otherwise a memory cgroup will have either dirty_bytes specified
  or by default inherit the global dirty_ratio, which is a valid number. If
  that's the case then you don't have to take rcu_lock() outside
  get_page_stat()?
  
  IOW, apart from cgroup being disabled, what are the other cases where you
  expect to not use the cgroup's page stat and use global stats?
 
 At boot, when mem_cgroup_from_task() may return NULL. But this is not
 related to the RCU acquisition.

Nevermind. You're right. In any case even if a task is migrated to a
different cgroup it will always have mem_cgroup_has_dirty_limit() ==
true.

So RCU protection is not needed outside these functions.

OK, I'll go with Peter's suggestion.

Thanks!
-Andrea


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Wed, Mar 03, 2010 at 11:07:35AM +0100, Peter Zijlstra wrote:
 On Tue, 2010-03-02 at 23:14 +0100, Andrea Righi wrote:
  
  I agree mem_cgroup_has_dirty_limit() is nicer. But we must do that under
  RCU, so something like:
  
  rcu_read_lock();
  if (mem_cgroup_has_dirty_limit())
  mem_cgroup_get_page_stat()
  else
  global_page_state()
  rcu_read_unlock();
  
  That is bad when mem_cgroup_has_dirty_limit() always returns false
  (e.g., when memory cgroups are disabled). So I fall back to the old
  interface.
 
 Why is it that mem_cgroup_has_dirty_limit() needs RCU when
 mem_cgroup_get_page_stat() doesn't? That is, simply make
 mem_cgroup_has_dirty_limit() not require RCU in the same way
 *_get_page_stat() doesn't either.

OK, I agree we can get rid of RCU protection here (see my previous
email).

BTW the point was that after mem_cgroup_has_dirty_limit() the task might
be moved to another cgroup, but also in this case mem_cgroup_has_dirty_limit()
will always be true, so mem_cgroup_get_page_stat() is always coherent.

 
  What do you think about:
  
  mem_cgroup_lock();
  if (mem_cgroup_has_dirty_limit())
  mem_cgroup_get_page_stat()
  else
  global_page_state()
  mem_cgroup_unlock();
  
  Where mem_cgroup_read_lock/unlock() simply expand to nothing when
  memory cgroups are disabled.
 
 I think you're engineering the wrong way around.
 
   
   That allows for a 0 dirty limit (which should work and basically makes
   all io synchronous).
  
  IMHO it is better to reserve 0 for the special value disabled like the
  global settings. A synchronous IO can also be achieved using a dirty
  limit of 1.
 
 Why?! 0 clearly states no writeback cache, IOW sync writes; a 1
 byte/page writeback cache effectively reduces to the same thing, but it's
 not the same thing conceptually. If you want to put the size and enable
 into a single variable pick -1 for disable or so.

I might agree, and actually I prefer this solution... but in this way we
would use a different interface with respect to the equivalent vm_dirty_ratio
/ vm_dirty_bytes global settings (as well as dirty_background_ratio /
dirty_background_bytes).

IMHO it's better to use the same interface to avoid user
misunderstandings.
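
For reference, this is the global convention these knobs follow (a sketch of
the existing get_dirty_limits() logic quoted earlier in the thread):

	if (dirty_bytes)	/* 0 means "unset": fall back to the ratio */
		dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
	else
		dirty = (dirty_ratio * available_memory) / 100;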

Thanks,
-Andrea


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-30 Thread Andrea Righi
On Wed, Mar 03, 2010 at 05:21:32PM +0900, KAMEZAWA Hiroyuki wrote:
 On Wed, 3 Mar 2010 15:15:49 +0900
 KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com wrote:
 
  Agreed.
  Let's try how we can write the code in a clean way. (we have time ;)
  For now, to me, IRQ disabling while lock_page_cgroup() seems to be a little
  overkill. What I really want is lockless code... but it seems impossible
  under the current implementation.
  
  I wonder if the fact that the page is never uncharged under us can give us
  some chances... Hmm.
  
 
 How about this ? Basically, I don't like duplicating information...so,
 # of new pcg_flags may be able to be reduced.
 
 I'm glad this can be a hint for Andrea-san.
 
 ==
 ---
  include/linux/page_cgroup.h |   44 -
  mm/memcontrol.c |   91 +++-
  2 files changed, 132 insertions(+), 3 deletions(-)
 
 Index: mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
 ===
 --- mmotm-2.6.33-Mar2.orig/include/linux/page_cgroup.h
 +++ mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
 @@ -39,6 +39,11 @@ enum {
   PCG_CACHE, /* charged as cache */
   PCG_USED, /* this object is in use. */
   PCG_ACCT_LRU, /* page has been accounted for */
 + PCG_MIGRATE_LOCK, /* used for mutual execution of account migration */
 + PCG_ACCT_DIRTY,
 + PCG_ACCT_WB,
 + PCG_ACCT_WB_TEMP,
 + PCG_ACCT_UNSTABLE,
  };
  
  #define TESTPCGFLAG(uname, lname)\
 @@ -73,6 +78,23 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
  TESTPCGFLAG(AcctLRU, ACCT_LRU)
  TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
  
 +SETPCGFLAG(AcctDirty, ACCT_DIRTY);
 +CLEARPCGFLAG(AcctDirty, ACCT_DIRTY);
 +TESTPCGFLAG(AcctDirty, ACCT_DIRTY);
 +
 +SETPCGFLAG(AcctWB, ACCT_WB);
 +CLEARPCGFLAG(AcctWB, ACCT_WB);
 +TESTPCGFLAG(AcctWB, ACCT_WB);
 +
 +SETPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
 +CLEARPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
 +TESTPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
 +
 +SETPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
 +CLEARPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
 +TESTPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
 +
 +
  static inline int page_cgroup_nid(struct page_cgroup *pc)
  {
    return page_to_nid(pc->page);
  @@ -82,7 +104,9 @@ static inline enum zone_type page_cgroup
   {
    return page_zonenum(pc->page);
   }
  -
  +/*
  + * lock_page_cgroup() should not be held under mapping->tree_lock
  + */
   static inline void lock_page_cgroup(struct page_cgroup *pc)
   {
    bit_spin_lock(PCG_LOCK, &pc->flags);
  @@ -93,6 +117,24 @@ static inline void unlock_page_cgroup(st
    bit_spin_unlock(PCG_LOCK, &pc->flags);
  }
  
 +/*
 + * Lock order is
 + *   lock_page_cgroup()
 + *   lock_page_cgroup_migrate()
  + * This lock is not a lock for charge/uncharge but for account moving,
  + * i.e. overwriting pc->mem_cgroup. The lock owner should guarantee by
  + * itself that the page is not uncharged while we hold this.
  + */
  +static inline void lock_page_cgroup_migrate(struct page_cgroup *pc)
  +{
  + bit_spin_lock(PCG_MIGRATE_LOCK, &pc->flags);
  +}
  +
  +static inline void unlock_page_cgroup_migrate(struct page_cgroup *pc)
  +{
  + bit_spin_unlock(PCG_MIGRATE_LOCK, &pc->flags);
 +}
 +
  #else /* CONFIG_CGROUP_MEM_RES_CTLR */
  struct page_cgroup;
  
 Index: mmotm-2.6.33-Mar2/mm/memcontrol.c
 ===
 --- mmotm-2.6.33-Mar2.orig/mm/memcontrol.c
 +++ mmotm-2.6.33-Mar2/mm/memcontrol.c
 @@ -87,6 +87,10 @@ enum mem_cgroup_stat_index {
   MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
   MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
   MEM_CGROUP_EVENTS,  /* incremented at every  pagein/pageout */
 + MEM_CGROUP_STAT_DIRTY,
 + MEM_CGROUP_STAT_WBACK,
 + MEM_CGROUP_STAT_WBACK_TEMP,
 + MEM_CGROUP_STAT_UNSTABLE_NFS,
  
   MEM_CGROUP_STAT_NSTATS,
  };
 @@ -1360,6 +1364,86 @@ done:
  }
  
  /*
 + * Update file cache's status for memcg. Before calling this,
  + * mapping->tree_lock should be held and preemption is disabled.
  + * Then, it's guaranteed that the page is not uncharged while we
 + * access page_cgroup. We can make use of that.
 + */
 +void mem_cgroup_update_stat_locked(struct page *page, int idx, bool set)
 +{
 + struct page_cgroup *pc;
 + struct mem_cgroup *mem;
 +
 + pc = lookup_page_cgroup(page);
 + /* Not accounted ? */
 + if (!PageCgroupUsed(pc))
 + return;
 + lock_page_cgroup_migrate(pc);
 + /*
  +  * It's guaranteed that this page is never uncharged.
 +  * The only racy problem is moving account among memcgs.
 +  */
 + switch (idx) {
 + case MEM_CGROUP_STAT_DIRTY:
 + if (set)
 + SetPageCgroupAcctDirty(pc);
 + else
 + ClearPageCgroupAcctDirty(pc);
 + break;
 + case MEM_CGROUP_STAT_WBACK:
 + if (set)
 + SetPageCgroupAcctWB(pc);
 + 

[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-03 Thread KAMEZAWA Hiroyuki
On Wed, 3 Mar 2010 15:15:49 +0900
KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com wrote:

 Agreed.
 Let's try how we can write the code in a clean way. (we have time ;)
 For now, to me, IRQ disabling while lock_page_cgroup() seems to be a little
 overkill. What I really want is lockless code... but it seems impossible
 under the current implementation.
 
 I wonder if the fact that the page is never uncharged under us can give us
 some chances... Hmm.
 

How about this ? Basically, I don't like duplicating information...so,
# of new pcg_flags may be able to be reduced.

I'm glad this can be a hint for Andrea-san.

==
---
 include/linux/page_cgroup.h |   44 -
 mm/memcontrol.c |   91 +++-
 2 files changed, 132 insertions(+), 3 deletions(-)

Index: mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
===
--- mmotm-2.6.33-Mar2.orig/include/linux/page_cgroup.h
+++ mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
@@ -39,6 +39,11 @@ enum {
PCG_CACHE, /* charged as cache */
PCG_USED, /* this object is in use. */
PCG_ACCT_LRU, /* page has been accounted for */
+   PCG_MIGRATE_LOCK, /* used for mutual execution of account migration */
+   PCG_ACCT_DIRTY,
+   PCG_ACCT_WB,
+   PCG_ACCT_WB_TEMP,
+   PCG_ACCT_UNSTABLE,
 };
 
 #define TESTPCGFLAG(uname, lname)  \
@@ -73,6 +78,23 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
 TESTPCGFLAG(AcctLRU, ACCT_LRU)
 TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
 
+SETPCGFLAG(AcctDirty, ACCT_DIRTY);
+CLEARPCGFLAG(AcctDirty, ACCT_DIRTY);
+TESTPCGFLAG(AcctDirty, ACCT_DIRTY);
+
+SETPCGFLAG(AcctWB, ACCT_WB);
+CLEARPCGFLAG(AcctWB, ACCT_WB);
+TESTPCGFLAG(AcctWB, ACCT_WB);
+
+SETPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
+CLEARPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
+TESTPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
+
+SETPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
+CLEARPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
+TESTPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
+
+
 static inline int page_cgroup_nid(struct page_cgroup *pc)
 {
return page_to_nid(pc->page);
@@ -82,7 +104,9 @@ static inline enum zone_type page_cgroup
 {
return page_zonenum(pc->page);
 }
-
+/*
+ * lock_page_cgroup() should not be held under mapping->tree_lock
+ */
 static inline void lock_page_cgroup(struct page_cgroup *pc)
 {
bit_spin_lock(PCG_LOCK, &pc->flags);
@@ -93,6 +117,24 @@ static inline void unlock_page_cgroup(st
bit_spin_unlock(PCG_LOCK, &pc->flags);
 }
 
+/*
+ * Lock order is
+ * lock_page_cgroup()
+ * lock_page_cgroup_migrate()
+ * This lock is not a lock for charge/uncharge but for account moving,
+ * i.e. overwriting pc->mem_cgroup. The lock owner should guarantee by
+ * itself that the page is not uncharged while we hold this.
+ */
+static inline void lock_page_cgroup_migrate(struct page_cgroup *pc)
+{
+   bit_spin_lock(PCG_MIGRATE_LOCK, &pc->flags);
+}
+
+static inline void unlock_page_cgroup_migrate(struct page_cgroup *pc)
+{
+   bit_spin_unlock(PCG_MIGRATE_LOCK, &pc->flags);
+}
+
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct page_cgroup;
 
Index: mmotm-2.6.33-Mar2/mm/memcontrol.c
===
--- mmotm-2.6.33-Mar2.orig/mm/memcontrol.c
+++ mmotm-2.6.33-Mar2/mm/memcontrol.c
@@ -87,6 +87,10 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_EVENTS,  /* incremented at every  pagein/pageout */
+   MEM_CGROUP_STAT_DIRTY,
+   MEM_CGROUP_STAT_WBACK,
+   MEM_CGROUP_STAT_WBACK_TEMP,
+   MEM_CGROUP_STAT_UNSTABLE_NFS,
 
MEM_CGROUP_STAT_NSTATS,
 };
@@ -1360,6 +1364,86 @@ done:
 }
 
 /*
+ * Update file cache's status for memcg. Before calling this,
+ * mapping->tree_lock should be held and preemption is disabled.
+ * Then, it's guaranteed that the page is not uncharged while we
+ * access page_cgroup. We can make use of that.
+ */
+void mem_cgroup_update_stat_locked(struct page *page, int idx, bool set)
+{
+   struct page_cgroup *pc;
+   struct mem_cgroup *mem;
+
+   pc = lookup_page_cgroup(page);
+   /* Not accounted ? */
+   if (!PageCgroupUsed(pc))
+   return;
+   lock_page_cgroup_migrate(pc);
+   /*
+* It's guaranteed that this page is never uncharged.
+* The only racy problem is moving account among memcgs.
+*/
+   switch (idx) {
+   case MEM_CGROUP_STAT_DIRTY:
+   if (set)
+   SetPageCgroupAcctDirty(pc);
+   else
+   ClearPageCgroupAcctDirty(pc);
+   break;
+   case MEM_CGROUP_STAT_WBACK:
+   if (set)
+   SetPageCgroupAcctWB(pc);
+   else
+   ClearPageCgroupAcctWB(pc);
+   break;
+   case MEM_CGROUP_STAT_WBACK_TEMP:
+

[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-03 Thread Daisuke Nishimura
On Wed, 3 Mar 2010 23:03:19 +0100, Andrea Righi ari...@develer.com wrote:
 On Wed, Mar 03, 2010 at 05:21:32PM +0900, KAMEZAWA Hiroyuki wrote:
  On Wed, 3 Mar 2010 15:15:49 +0900
  KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com wrote:
  
   Agreed.
   Let's try how we can write the code in a clean way. (we have time ;)
   For now, to me, IRQ disabling while lock_page_cgroup() seems to be a
   little overkill. What I really want is lockless code... but it seems
   impossible under the current implementation.
   
   I wonder if the fact that the page is never uncharged under us can give
   us some chances... Hmm.
   
  
  How about this ? Basically, I don't like duplicating information...so,
  # of new pcg_flags may be able to be reduced.
  
  I'm glad this can be a hint for Andrea-san.
  
  ==
  ---
   include/linux/page_cgroup.h |   44 -
   mm/memcontrol.c |   91 +++-
   2 files changed, 132 insertions(+), 3 deletions(-)
  
  Index: mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
  ===
  --- mmotm-2.6.33-Mar2.orig/include/linux/page_cgroup.h
  +++ mmotm-2.6.33-Mar2/include/linux/page_cgroup.h
  @@ -39,6 +39,11 @@ enum {
  PCG_CACHE, /* charged as cache */
  PCG_USED, /* this object is in use. */
  PCG_ACCT_LRU, /* page has been accounted for */
  +   PCG_MIGRATE_LOCK, /* used for mutual execution of account migration */
  +   PCG_ACCT_DIRTY,
  +   PCG_ACCT_WB,
  +   PCG_ACCT_WB_TEMP,
  +   PCG_ACCT_UNSTABLE,
   };
   
   #define TESTPCGFLAG(uname, lname)  \
  @@ -73,6 +78,23 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
   TESTPCGFLAG(AcctLRU, ACCT_LRU)
   TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
   
  +SETPCGFLAG(AcctDirty, ACCT_DIRTY);
  +CLEARPCGFLAG(AcctDirty, ACCT_DIRTY);
  +TESTPCGFLAG(AcctDirty, ACCT_DIRTY);
  +
  +SETPCGFLAG(AcctWB, ACCT_WB);
  +CLEARPCGFLAG(AcctWB, ACCT_WB);
  +TESTPCGFLAG(AcctWB, ACCT_WB);
  +
  +SETPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
  +CLEARPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
  +TESTPCGFLAG(AcctWBTemp, ACCT_WB_TEMP);
  +
  +SETPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
  +CLEARPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
  +TESTPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE);
  +
  +
   static inline int page_cgroup_nid(struct page_cgroup *pc)
   {
  return page_to_nid(pc->page);
  @@ -82,7 +104,9 @@ static inline enum zone_type page_cgroup
   {
  return page_zonenum(pc->page);
   }
  -
  +/*
  + * lock_page_cgroup() should not be held under mapping->tree_lock
  + */
   static inline void lock_page_cgroup(struct page_cgroup *pc)
   {
  bit_spin_lock(PCG_LOCK, &pc->flags);
  @@ -93,6 +117,24 @@ static inline void unlock_page_cgroup(st
  bit_spin_unlock(PCG_LOCK, &pc->flags);
   }
   
  +/*
  + * Lock order is
  + * lock_page_cgroup()
  + * lock_page_cgroup_migrate()
  + * This lock is not a lock for charge/uncharge but for account moving,
  + * i.e. overwriting pc->mem_cgroup. The lock owner should guarantee by
  + * itself that the page is not uncharged while we hold this.
  + */
  +static inline void lock_page_cgroup_migrate(struct page_cgroup *pc)
  +{
  +   bit_spin_lock(PCG_MIGRATE_LOCK, &pc->flags);
  +}
  +
  +static inline void unlock_page_cgroup_migrate(struct page_cgroup *pc)
  +{
  +   bit_spin_unlock(PCG_MIGRATE_LOCK, &pc->flags);
  +}
  +
   #else /* CONFIG_CGROUP_MEM_RES_CTLR */
   struct page_cgroup;
   
  Index: mmotm-2.6.33-Mar2/mm/memcontrol.c
  ===
  --- mmotm-2.6.33-Mar2.orig/mm/memcontrol.c
  +++ mmotm-2.6.33-Mar2/mm/memcontrol.c
  @@ -87,6 +87,10 @@ enum mem_cgroup_stat_index {
  MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
  MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
  MEM_CGROUP_EVENTS,  /* incremented at every  pagein/pageout */
  +   MEM_CGROUP_STAT_DIRTY,
  +   MEM_CGROUP_STAT_WBACK,
  +   MEM_CGROUP_STAT_WBACK_TEMP,
  +   MEM_CGROUP_STAT_UNSTABLE_NFS,
   
  MEM_CGROUP_STAT_NSTATS,
   };
  @@ -1360,6 +1364,86 @@ done:
   }
   
   /*
  + * Update file cache's status for memcg. Before calling this,
  + * mapping->tree_lock should be held and preemption is disabled.
  + * Then, it's guaranteed that the page is not uncharged while we
  + * access page_cgroup. We can make use of that.
  + */
  +void mem_cgroup_update_stat_locked(struct page *page, int idx, bool set)
  +{
  +   struct page_cgroup *pc;
  +   struct mem_cgroup *mem;
  +
  +   pc = lookup_page_cgroup(page);
  +   /* Not accounted ? */
  +   if (!PageCgroupUsed(pc))
  +   return;
  +   lock_page_cgroup_migrate(pc);
  +   /*
  +* It's guaranteed that this page is never uncharged.
  +* The only racy problem is moving account among memcgs.
  +*/
  +   switch (idx) {
  +   case MEM_CGROUP_STAT_DIRTY:
  +   if (set)
  +   SetPageCgroupAcctDirty(pc);
  +   else
  +   

[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-03 Thread KAMEZAWA Hiroyuki
On Wed, 3 Mar 2010 23:03:19 +0100
Andrea Righi ari...@develer.com wrote:

 On Wed, Mar 03, 2010 at 05:21:32PM +0900, KAMEZAWA Hiroyuki wrote:
  On Wed, 3 Mar 2010 15:15:49 +0900
  KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com wrote:
 
  +   preempt_disable();
  +   lock_page_cgroup_migrate(pc);
  page = pc->page;
  if (page_mapped(page) && !PageAnon(page)) {
  /* Update mapped_file data for mem_cgroup */
  -   preempt_disable();
  __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
  __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
  -   preempt_enable();
  }
  mem_cgroup_charge_statistics(from, pc, false);
  +   move_acct_information(from, to, pc);
 
 Kame-san, a question. According to is_target_pte_for_mc() it seems we
 don't move file pages across cgroups for now. 

yes. It's just in plan.

 If !PageAnon(page) we just return 0 and the page won't be selected for 
 migration in
 mem_cgroup_move_charge_pte_range().
 
 So, if I've understood correctly, the code is correct in perspective, but
 right now it's unnecessary. File pages are not moved on task migration
 across cgroups and, at the moment, there's no way for file page
 accounted statistics to go negative.
 
 Or am I missing something?
 

At rmdir(), the remaining file caches in a cgroup are moved to its parent;
i.e., all file caches are moved to the parent at rmdir().

This behavior is for avoiding losing too much file cache when removing a cgroup.

Thanks,
-Kame




[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Daisuke Nishimura
On Tue, 2 Mar 2010 09:01:58 +0100, Andrea Righi ari...@develer.com wrote:
 On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote:
  On Mon,  1 Mar 2010 22:23:40 +0100
  Andrea Righi ari...@develer.com wrote:
  
   Apply the cgroup dirty pages accounting and limiting infrastructure to
   the opportune kernel functions.
   
   Signed-off-by: Andrea Righi ari...@develer.com
  
  Seems nice.
  
  Hmm. the last problem is moving account between memcg.
  
  Right ?
 
 Correct. This was actually the last item of the TODO list. Anyway, I'm
 still considering if it's correct to move dirty pages when a task is
 migrated from a cgroup to another. Currently, dirty pages just remain in
 the original cgroup and are flushed depending on the original cgroup
 settings. That is not totally wrong... at least moving the dirty pages
 between memcgs should be optional (move_charge_at_immigrate?).
 
FYI, I'm planning to add file-cache and shmem/tmpfs support for the
move_charge feature for 2.6.35.
But, hmm, it would be complicated if we try to move the dirty accounting too.


Thanks,
Daisuke Nishimura.


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread KAMEZAWA Hiroyuki
On Tue, 2 Mar 2010 09:01:58 +0100
Andrea Righi ari...@develer.com wrote:

 On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote:
  On Mon,  1 Mar 2010 22:23:40 +0100
  Andrea Righi ari...@develer.com wrote:
  
   Apply the cgroup dirty pages accounting and limiting infrastructure to
   the opportune kernel functions.
   
   Signed-off-by: Andrea Righi ari...@develer.com
  
  Seems nice.
  
  Hmm. the last problem is moving account between memcg.
  
  Right ?
 
 Correct. This was actually the last item of the TODO list. Anyway, I'm
 still considering if it's correct to move dirty pages when a task is
 migrated from a cgroup to another. Currently, dirty pages just remain in
 the original cgroup and are flushed depending on the original cgroup
 settings. That is not totally wrong... at least moving the dirty pages
 between memcgs should be optional (move_charge_at_immigrate?).
 

My concern is
 - migration between memcg is already supported
	- at task move
	- at rmdir

Then, if you leave DIRTY_PAGE accounting to the original cgroup,
the new cgroup (migration target)'s dirty page accounting may
go negative, or hold an incorrect value. Please check the FILE_MAPPED
implementation in __mem_cgroup_move_account()

As
	if (page_mapped(page) && !PageAnon(page)) {
		/* Update mapped_file data for mem_cgroup */
		preempt_disable();
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}
then, FILE_MAPPED never goes negative.


Thanks,
-Kame

 Thanks for your ack and the detailed review!
 
 -Andrea
 


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Kirill A. Shutemov
On Mon, Mar 1, 2010 at 11:23 PM, Andrea Righi ari...@develer.com wrote:
 Apply the cgroup dirty pages accounting and limiting infrastructure to
 the opportune kernel functions.

 Signed-off-by: Andrea Righi ari...@develer.com
 ---
  fs/fuse/file.c      |    5 +++
  fs/nfs/write.c      |    4 ++
  fs/nilfs2/segment.c |   10 +-
  mm/filemap.c        |    1 +
  mm/page-writeback.c |   84 --
  mm/rmap.c           |    4 +-
  mm/truncate.c       |    2 +
  7 files changed, 76 insertions(+), 34 deletions(-)

 diff --git a/fs/fuse/file.c b/fs/fuse/file.c
 index a9f5e13..dbbdd53 100644
 --- a/fs/fuse/file.c
 +++ b/fs/fuse/file.c
 @@ -11,6 +11,7 @@
  #include <linux/pagemap.h>
  #include <linux/slab.h>
  #include <linux/kernel.h>
 +#include <linux/memcontrol.h>
  #include <linux/sched.h>
  #include <linux/module.h>

 @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc,
 struct fuse_req *req)

        list_del(&req->writepages_entry);
        dec_bdi_stat(bdi, BDI_WRITEBACK);
 +       mem_cgroup_update_stat(req->pages[0],
 +                       MEM_CGROUP_STAT_WRITEBACK_TEMP, -1);
        dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
        bdi_writeout_inc(bdi);
        wake_up(&fi->page_waitq);
 @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page)
        req->inode = inode;

        inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
 +       mem_cgroup_update_stat(tmp_page,
 +                       MEM_CGROUP_STAT_WRITEBACK_TEMP, 1);
        inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
        end_page_writeback(page);

 diff --git a/fs/nfs/write.c b/fs/nfs/write.c
 index b753242..7316f7a 100644
 --- a/fs/nfs/write.c
 +++ b/fs/nfs/write.c
 @@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req)
                        req->wb_index,
                        NFS_PAGE_TAG_COMMIT);
        spin_unlock(&inode->i_lock);
 +       mem_cgroup_update_stat(req->wb_page, MEM_CGROUP_STAT_UNSTABLE_NFS, 1);
        inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
        inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE);
        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 @@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req)
        struct page *page = req->wb_page;

        if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
 +               mem_cgroup_update_stat(page, MEM_CGROUP_STAT_UNSTABLE_NFS,
 -1);
                dec_zone_page_state(page, NR_UNSTABLE_NFS);
                dec_bdi_stat(page->mapping->backing_dev_info, BDI_UNSTABLE);
                return 1;
 @@ -1273,6 +1275,8 @@ nfs_commit_list(struct inode *inode, struct list_head
 *head, int how)
                req = nfs_list_entry(head->next);
                nfs_list_remove_request(req);
                nfs_mark_request_commit(req);
 +               mem_cgroup_update_stat(req->wb_page,
 +                               MEM_CGROUP_STAT_UNSTABLE_NFS, -1);
                dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
                dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
                                BDI_UNSTABLE);
 diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
 index ada2f1b..aef6d13 100644
 --- a/fs/nilfs2/segment.c
 +++ b/fs/nilfs2/segment.c
 @@ -1660,8 +1660,11 @@ nilfs_copy_replace_page_buffers(struct page *page,
 struct list_head *out)
        } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
        kunmap_atomic(kaddr, KM_USER0);

 -       if (!TestSetPageWriteback(clone_page))
 +       if (!TestSetPageWriteback(clone_page)) {
 +               mem_cgroup_update_stat(clone_page,

s/clone_page/page/

And #include <linux/memcontrol.h> is missing.

 +                               MEM_CGROUP_STAT_WRITEBACK, 1);
                inc_zone_page_state(clone_page, NR_WRITEBACK);
 +       }
        unlock_page(clone_page);

        return 0;
 @@ -1783,8 +1786,11 @@ static void __nilfs_end_page_io(struct page *page, int 
 err)
        }

        if (buffer_nilfs_allocated(page_buffers(page))) {
 -               if (TestClearPageWriteback(page))
 +               if (TestClearPageWriteback(page)) {
 +                       mem_cgroup_update_stat(clone_page,
 +                                       MEM_CGROUP_STAT_WRITEBACK, -1);
                        dec_zone_page_state(page, NR_WRITEBACK);
 +               }
        } else
                end_page_writeback(page);
  }
 diff --git a/mm/filemap.c b/mm/filemap.c
 index fe09e51..f85acae 100644
 --- a/mm/filemap.c
 +++ b/mm/filemap.c
 @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
         * having removed the page entirely.
         */
        if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
 +               mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1);
                dec_zone_page_state(page, NR_FILE_DIRTY);
                dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
        }
 diff --git 

[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Kirill A. Shutemov
On Tue, Mar 2, 2010 at 1:02 PM, Andrea Righi ari...@develer.com wrote:
 On Tue, Mar 02, 2010 at 12:11:10PM +0200, Kirill A. Shutemov wrote:
 On Mon, Mar 1, 2010 at 11:23 PM, Andrea Righi ari...@develer.com wrote:
  Apply the cgroup dirty pages accounting and limiting infrastructure to
  the opportune kernel functions.
 
  Signed-off-by: Andrea Righi ari...@develer.com
  ---
   fs/fuse/file.c      |    5 +++
   fs/nfs/write.c      |    4 ++
   fs/nilfs2/segment.c |   10 +-
   mm/filemap.c        |    1 +
   mm/page-writeback.c |   84 
  --
   mm/rmap.c           |    4 +-
   mm/truncate.c       |    2 +
   7 files changed, 76 insertions(+), 34 deletions(-)
 
  (snip)
  diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
  index ada2f1b..aef6d13 100644
  --- a/fs/nilfs2/segment.c
  +++ b/fs/nilfs2/segment.c
  @@ -1660,8 +1660,11 @@ nilfs_copy_replace_page_buffers(struct page *page,
  struct list_head *out)
         } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
         kunmap_atomic(kaddr, KM_USER0);

  -       if (!TestSetPageWriteback(clone_page))
  +       if (!TestSetPageWriteback(clone_page)) {
  +               mem_cgroup_update_stat(clone_page,

 s/clone_page/page/

 mmh... shouldn't we use the same page used by TestSetPageWriteback() and
 inc_zone_page_state()?

Sorry, I commented on the wrong hunk. It's for the next one.


 And #include linux/memcontrol.h is missed.

 OK.

 I'll apply your fixes and post a new version.

 Thanks for reviewing,
 -Andrea



[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Balbir Singh
* Andrea Righi ari...@develer.com [2010-03-01 22:23:40]:

 Apply the cgroup dirty pages accounting and limiting infrastructure to
 the opportune kernel functions.
 
 Signed-off-by: Andrea Righi ari...@develer.com
 ---
  fs/fuse/file.c  |5 +++
  fs/nfs/write.c  |4 ++
  fs/nilfs2/segment.c |   10 +-
  mm/filemap.c|1 +
  mm/page-writeback.c |   84 --
  mm/rmap.c   |4 +-
  mm/truncate.c   |2 +
  7 files changed, 76 insertions(+), 34 deletions(-)
 
 (snip)
 
 diff --git a/fs/nfs/write.c b/fs/nfs/write.c
 index b753242..7316f7a 100644
 --- a/fs/nfs/write.c
 +++ b/fs/nfs/write.c

Don't we need memcontrol.h to be included here?

Looks OK to me overall, but there might be objections to using the
mem_cgroup_* naming convention; I don't mind it very much :)

-- 
Three Cheers,
Balbir


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Peter Zijlstra
On Mon, 2010-03-01 at 22:23 +0100, Andrea Righi wrote:
 Apply the cgroup dirty pages accounting and limiting infrastructure to
 the opportune kernel functions.
 
 Signed-off-by: Andrea Righi ari...@develer.com
 ---

 diff --git a/mm/page-writeback.c b/mm/page-writeback.c
 index 5a0f8f3..d83f41c 100644
 --- a/mm/page-writeback.c
 +++ b/mm/page-writeback.c
 @@ -137,13 +137,14 @@ static struct prop_descriptor vm_dirties;
   */
  static int calc_period_shift(void)
  {
 -	unsigned long dirty_total;
 +	unsigned long dirty_total, dirty_bytes;
 
 -	if (vm_dirty_bytes)
 -		dirty_total = vm_dirty_bytes / PAGE_SIZE;
 +	dirty_bytes = mem_cgroup_dirty_bytes();
 +	if (dirty_bytes)

So you don't think 0 is a valid max dirty amount?

 +		dirty_total = dirty_bytes / PAGE_SIZE;
 	else
 -		dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
 -				100;
 +		dirty_total = (mem_cgroup_dirty_ratio() *
 +				determine_dirtyable_memory()) / 100;
 	return 2 + ilog2(dirty_total - 1);
  }
 
 @@ -408,14 +409,16 @@ static unsigned long highmem_dirtyable_memory(unsigned
 long total)
   */
  unsigned long determine_dirtyable_memory(void)
  {
 -	unsigned long x;
 -
 -	x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
 +	unsigned long memory;
 +	s64 memcg_memory;
 
 +	memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
 	if (!vm_highmem_is_dirtyable)
 -		x -= highmem_dirtyable_memory(x);
 -
 -	return x + 1;   /* Ensure that we never return 0 */
 +		memory -= highmem_dirtyable_memory(memory);
 +	memcg_memory = mem_cgroup_page_stat(MEMCG_NR_DIRTYABLE_PAGES);
 +	if (memcg_memory < 0)

And here you somehow return negative?

 +		return memory + 1;
 +	return min((unsigned long)memcg_memory, memory + 1);
  }
 
  void
 @@ -423,26 +426,28 @@ get_dirty_limits(unsigned long *pbackground, unsigned
 long *pdirty,
 		 unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
  {
 	unsigned long background;
 -	unsigned long dirty;
 +	unsigned long dirty, dirty_bytes, dirty_background;
 	unsigned long available_memory = determine_dirtyable_memory();
 	struct task_struct *tsk;
 
 -	if (vm_dirty_bytes)
 -		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
 +	dirty_bytes = mem_cgroup_dirty_bytes();
 +	if (dirty_bytes)

zero not valid again

 +		dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
 	else {
 		int dirty_ratio;
 
 -		dirty_ratio = vm_dirty_ratio;
 +		dirty_ratio = mem_cgroup_dirty_ratio();
 		if (dirty_ratio < 5)
 			dirty_ratio = 5;
 		dirty = (dirty_ratio * available_memory) / 100;
 	}
 
 -	if (dirty_background_bytes)
 -		background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
 +	dirty_background = mem_cgroup_dirty_background_bytes();
 +	if (dirty_background)

idem

 +		background = DIV_ROUND_UP(dirty_background, PAGE_SIZE);
 	else
 -		background = (dirty_background_ratio * available_memory) / 100;
 -
 +		background = (mem_cgroup_dirty_background_ratio() *
 +				available_memory) / 100;
 	if (background >= dirty)
 		background = dirty / 2;
 	tsk = current;
 @@ -508,9 +513,13 @@ static void balance_dirty_pages(struct address_space
 *mapping,
 		get_dirty_limits(&background_thresh, &dirty_thresh,
 				&bdi_thresh, bdi);
 
 -		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 +		nr_reclaimable = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES);
 +		nr_writeback = mem_cgroup_page_stat(MEMCG_NR_WRITEBACK);
 +		if ((nr_reclaimable < 0) || (nr_writeback < 0)) {
 +			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 				global_page_state(NR_UNSTABLE_NFS);

??? why would a page_state be negative.. I see you return -ENOMEM on !
cgroup, but how can one specify no dirty limit with this compiled in?

 -		nr_writeback = global_page_state(NR_WRITEBACK);
 +			nr_writeback = global_page_state(NR_WRITEBACK);
 +		}
 
 		bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY);
 		if (bdi_cap_account_unstable(bdi)) {
 @@ -611,10 +620,12 @@ static void balance_dirty_pages(struct address_space
 *mapping,
 	 * In normal mode, we start background writeout at the lower
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 +		nr_reclaimable = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES);
 +		if (nr_reclaimable < 0)
 +			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 +			

[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Balbir Singh
* KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com [2010-03-02 17:23:16]:

 On Tue, 2 Mar 2010 09:01:58 +0100
 Andrea Righi ari...@develer.com wrote:
 
  On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote:
   On Mon,  1 Mar 2010 22:23:40 +0100
   Andrea Righi ari...@develer.com wrote:
   
Apply the cgroup dirty pages accounting and limiting infrastructure to
the opportune kernel functions.

Signed-off-by: Andrea Righi ari...@develer.com
   
   Seems nice.
   
   Hmm. the last problem is moving account between memcg.
   
   Right ?
  
  Correct. This was actually the last item of the TODO list. Anyway, I'm
  still considering if it's correct to move dirty pages when a task is
  migrated from a cgroup to another. Currently, dirty pages just remain in
  the original cgroup and are flushed depending on the original cgroup
  settings. That is not totally wrong... at least moving the dirty pages
  between memcgs should be optional (move_charge_at_immigrate?).
  
 
 My concern is
  - migration between memcg is already supported
 	- at task move
 	- at rmdir
 
 Then, if you leave DIRTY_PAGE accounting to the original cgroup,
 the new cgroup (migration target)'s dirty page accounting may
 go negative, or hold an incorrect value. Please check the FILE_MAPPED
 implementation in __mem_cgroup_move_account()
 
 As
 	if (page_mapped(page) && !PageAnon(page)) {
 		/* Update mapped_file data for mem_cgroup */
 		preempt_disable();
 		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		preempt_enable();
 	}
 then, FILE_MAPPED never goes negative.


Absolutely! I am not sure how complex dirty memory migration will be,
but one way of working around it would be to disable migration of
charges when the feature is enabled (dirty* is set in the memory
cgroup). We might need additional logic to allow that to happen. 

-- 
Three Cheers,
Balbir
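
A hypothetical shape for that workaround — the hook point and the
mem_cgroup_has_dirty_limit() helper (suggested elsewhere in this thread) are
assumptions for illustration, not posted code:

	/*
	 * Sketch: refuse charge migration while per-cgroup dirty limits
	 * are active, so the dirty counters of source and target cannot
	 * be skewed by a task move.
	 */
	static int mem_cgroup_can_move_charge(struct mem_cgroup *from,
					      struct mem_cgroup *to)
	{
		if (mem_cgroup_has_dirty_limit(from) ||
		    mem_cgroup_has_dirty_limit(to))
			return -EBUSY;	/* pages stay accounted to "from" */
		return 0;
	}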


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Kirill A. Shutemov
On Tue, Mar 2, 2010 at 3:47 PM, Balbir Singh bal...@linux.vnet.ibm.com wrote:
 * Andrea Righi ari...@develer.com [2010-03-01 22:23:40]:

 Apply the cgroup dirty pages accounting and limiting infrastructure to
 the opportune kernel functions.

 Signed-off-by: Andrea Righi ari...@develer.com
 ---
  fs/fuse/file.c      |    5 +++
  fs/nfs/write.c      |    4 ++
  fs/nilfs2/segment.c |   10 +-
  mm/filemap.c        |    1 +
  mm/page-writeback.c |   84 
 --
  mm/rmap.c           |    4 +-
  mm/truncate.c       |    2 +
  7 files changed, 76 insertions(+), 34 deletions(-)

 (snip)

 diff --git a/fs/nfs/write.c b/fs/nfs/write.c
 index b753242..7316f7a 100644
 --- a/fs/nfs/write.c
 +++ b/fs/nfs/write.c

 Don't we need memcontrol.h to be included here?

It's included in linux/swap.h

 Looks OK to me overall, but there might be objections to using the
 mem_cgroup_* naming convention; I don't mind it very much :)

 --
        Three Cheers,
        Balbir



[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Vivek Goyal
On Mon, Mar 01, 2010 at 11:18:31PM +0100, Andrea Righi wrote:
 On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote:
  @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 	 */
 	dirty_thresh += dirty_thresh / 10;  /* wh... */
 
  -	if (global_page_state(NR_UNSTABLE_NFS) +
  -		global_page_state(NR_WRITEBACK) <= dirty_thresh)
  -			break;
  -	congestion_wait(BLK_RW_ASYNC, HZ/10);
  +
  +	dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
  +	if (dirty < 0)
  +		dirty = global_page_state(NR_UNSTABLE_NFS) +
  +			global_page_state(NR_WRITEBACK);
  
  dirty is unsigned long. As mentioned last time, above will never be true?
  In general these patches look ok to me. I will do some testing with these.
 
 Re-introduced the same bug. My bad. :(
 
 The value returned from mem_cgroup_page_stat() can be negative, i.e.
 when memory cgroup is disabled. We could simply use a long for dirty,
 the unit is in # of pages so s64 should be enough. Or cast dirty to long
 only for the check (see below).
 
 Thanks!
 -Andrea
 
 Signed-off-by: Andrea Righi ari...@develer.com
 ---
  mm/page-writeback.c |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)
 
 diff --git a/mm/page-writeback.c b/mm/page-writeback.c
 index d83f41c..dbee976 100644
 --- a/mm/page-writeback.c
 +++ b/mm/page-writeback.c
 @@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
  
  
 	dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
 -	if (dirty < 0)
 +	if ((long)dirty < 0)

This will also be problematic as on 32bit systems, your upper limit of
dirty memory will be 2G?

I guess, I will prefer one of the two.

- return the error code from the function and pass a pointer to store stats
  in as a function argument.

- Or Peter's suggestion of checking mem_cgroup_has_dirty_limit() and, if
  per cgroup dirty control is enabled, then use per cgroup stats. In that
  case you don't have to return negative values.

  Only tricky part will be careful accounting so that none of the stats go
  negative in corner cases of migration etc. (a sketch of the first
  alternative follows after this message)

Thanks
Vivek
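
A minimal sketch of the first alternative above — the changed signature and
the enum type name are assumptions for illustration, not posted code:

	/*
	 * Return 0 on success and -ENOMEM when the memory cgroup is
	 * disabled; report the count through *stat so callers never have
	 * to interpret a negative page count (and 32bit is not an issue).
	 */
	int mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item, s64 *stat);

	static unsigned long dirty_writeback_pages(void)
	{
		s64 memcg_dirty;

		if (!mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES,
					  &memcg_dirty))
			return (unsigned long)memcg_dirty;

		/* memcg disabled: fall back to the global counters */
		return global_page_state(NR_UNSTABLE_NFS) +
			global_page_state(NR_WRITEBACK);
	}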

 	dirty = global_page_state(NR_UNSTABLE_NFS) +
 		global_page_state(NR_WRITEBACK);
 	if (dirty <= dirty_thresh)


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Balbir Singh
* Peter Zijlstra pet...@infradead.org [2010-03-02 14:48:56]:

 This is ugly and broken.. I thought you'd agreed to something like:
 
  if (mem_cgroup_has_dirty_limit(cgroup))
use mem_cgroup numbers
  else
use global numbers
 
 That allows for a 0 dirty limit (which should work and basically makes
 all io synchronous).
 
 Also, I'd put each of those in a separate function, like:
 
 unsigned long reclaimable_pages(cgroup)
 {
 	if (mem_cgroup_has_dirty_limit(cgroup))
 		return mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES);
 
 	return global_page_state(NR_FILE_DIRTY) +
 		global_page_state(NR_NFS_UNSTABLE);
 }


I agree, I should have been more specific about the naming convention,
this is what I meant - along these lines as we do with
zone_nr_lru_pages(), etc.
 
 Which raises another question, you should probably rebase on top of
 Trond's patches, which remove BDI_RECLAIMABLE, suggesting you also
 lose MEMCG_NR_RECLAIM_PAGES in favour of the DIRTY+UNSTABLE split.
 

-- 
Three Cheers,
Balbir
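
Following the same convention, the writeback side would get a sibling helper —
a sketch under the thread's assumed names (mem_cgroup_has_dirty_limit() and
MEMCG_NR_WRITEBACK come from the discussion, not from a posted patch):

	/*
	 * Companion to reclaimable_pages(): per-cgroup writeback count when
	 * a dirty limit is set, the global NR_WRITEBACK otherwise.
	 */
	unsigned long writeback_pages(struct mem_cgroup *memcg)
	{
		if (mem_cgroup_has_dirty_limit(memcg))
			return mem_cgroup_page_stat(MEMCG_NR_WRITEBACK);

		return global_page_state(NR_WRITEBACK);
	}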


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Trond Myklebust
On Tue, 2010-03-02 at 14:48 +0100, Peter Zijlstra wrote: 
 unsigned long reclaimable_pages(cgroup)
 {
 	if (mem_cgroup_has_dirty_limit(cgroup))
 		return mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES);
 
 	return global_page_state(NR_FILE_DIRTY) +
 		global_page_state(NR_NFS_UNSTABLE);
 }
 
 Which raises another question, you should probably rebase on top of
 Trond's patches, which remove BDI_RECLAIMABLE, suggesting you also
 lose MEMCG_NR_RECLAIM_PAGES in favour of the DIRTY+UNSTABLE split.
 

I'm dropping those patches for now. The main writeback change wasn't too
favourably received by the linux-mm community so I've implemented an
alternative that only changes the NFS layer, and doesn't depend on the
DIRTY+UNSTABLE split.

Cheers
  Trond



[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Daisuke Nishimura
On Tue, 2 Mar 2010 23:18:23 +0100, Andrea Righi ari...@develer.com wrote:
 On Tue, Mar 02, 2010 at 07:20:26PM +0530, Balbir Singh wrote:
  * KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com [2010-03-02 17:23:16]:
  
   On Tue, 2 Mar 2010 09:01:58 +0100
   Andrea Righi ari...@develer.com wrote:
   
On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote:
 On Mon,  1 Mar 2010 22:23:40 +0100
 Andrea Righi ari...@develer.com wrote:
 
  Apply the cgroup dirty pages accounting and limiting infrastructure 
  to
  the opportune kernel functions.
  
  Signed-off-by: Andrea Righi ari...@develer.com
 
 Seems nice.
 
 Hmm. the last problem is moving account between memcg.
 
 Right ?

Correct. This was actually the last item of the TODO list. Anyway, I'm
still considering if it's correct to move dirty pages when a task is
migrated from a cgroup to another. Currently, dirty pages just remain in
the original cgroup and are flushed depending on the original cgroup
settings. That is not totally wrong... at least moving the dirty pages
between memcgs should be optional (move_charge_at_immigrate?).

   
   My concern is
    - migration between memcg is already supported
   	- at task move
   	- at rmdir
   
   Then, if you leave DIRTY_PAGE accounting to the original cgroup,
   the new cgroup (migration target)'s dirty page accounting may
   go negative, or hold an incorrect value. Please check the FILE_MAPPED
   implementation in __mem_cgroup_move_account()
   
   As
   	if (page_mapped(page) && !PageAnon(page)) {
   		/* Update mapped_file data for mem_cgroup */
   		preempt_disable();
   		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
   		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
   		preempt_enable();
   	}
   then, FILE_MAPPED never goes negative.
  
  
  Absolutely! I am not sure how complex dirty memory migration will be,
  but one way of working around it would be to disable migration of
  charges when the feature is enabled (dirty* is set in the memory
  cgroup). We might need additional logic to allow that to happen. 
 
 I've started to look at dirty memory migration. The first attempt is to add
 DIRTY, WRITEBACK, etc. to the page_cgroup flags and handle them in
 __mem_cgroup_move_account(). Probably I'll have something ready for the
 next version of the patch. I still need to figure out whether this can work
 as expected...
 
I agree it's the right direction (in fact, I have been planning to post a patch
in that direction), so I leave it to you.
Can you add a PCG_FILE_MAPPED flag too? I think this flag can be handled in the
same way as the other flags you're trying to add, and we can change
if (page_mapped(page) && !PageAnon(page)) to if (PageCgroupFileMapped(pc))
in __mem_cgroup_move_account(). It would be cleaner than the current code, IMHO.


Thanks,
Daisuke Nishimura.
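
A sketch of that suggestion, assuming the flag is wired up with the existing
SETPCGFLAG()/CLEARPCGFLAG()/TESTPCGFLAG() helpers from page_cgroup.h
(illustration only, not a posted patch):

	/* include/linux/page_cgroup.h: accessors for a new PCG_FILE_MAPPED bit */
	SETPCGFLAG(FileMapped, FILE_MAPPED)
	CLEARPCGFLAG(FileMapped, FILE_MAPPED)
	TESTPCGFLAG(FileMapped, FILE_MAPPED)

	/*
	 * __mem_cgroup_move_account(): test the page_cgroup's own flag
	 * instead of re-deriving the state from the struct page.
	 */
	if (PageCgroupFileMapped(pc)) {
		preempt_disable();
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}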


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Vivek Goyal
On Tue, Mar 02, 2010 at 11:22:48PM +0100, Andrea Righi wrote:
 On Tue, Mar 02, 2010 at 10:05:29AM -0500, Vivek Goyal wrote:
  On Mon, Mar 01, 2010 at 11:18:31PM +0100, Andrea Righi wrote:
   On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote:
 @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask)
   */
  dirty_thresh += dirty_thresh / 10;  /* wh... 
 */
  
  -	if (global_page_state(NR_UNSTABLE_NFS) +
  -		global_page_state(NR_WRITEBACK) <= dirty_thresh)
  -			break;
  -	congestion_wait(BLK_RW_ASYNC, HZ/10);
  +
  +	dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
  +	if (dirty < 0)
  +		dirty = global_page_state(NR_UNSTABLE_NFS) +
  +			global_page_state(NR_WRITEBACK);

dirty is unsigned long. As mentioned last time, above will never be 
true?
In general these patches look ok to me. I will do some testing with 
these.
   
   Re-introduced the same bug. My bad. :(
   
   The value returned from mem_cgroup_page_stat() can be negative, i.e.
   when memory cgroup is disabled. We could simply use a long for dirty,
   the unit is in # of pages so s64 should be enough. Or cast dirty to long
   only for the check (see below).
   
   Thanks!
   -Andrea
   
   Signed-off-by: Andrea Righi ari...@develer.com
   ---
mm/page-writeback.c |2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
   
   diff --git a/mm/page-writeback.c b/mm/page-writeback.c
   index d83f41c..dbee976 100644
   --- a/mm/page-writeback.c
   +++ b/mm/page-writeback.c
   @@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)

	dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
   -	if (dirty < 0)
   +	if ((long)dirty < 0)
  
  This will also be problematic as on 32bit systems, your upper limit of
  dirty memory will be 2G?
  
  I guess, I will prefer one of the two.
  
  - return the error code from the function and pass a pointer to store stats
    in as a function argument.
  
  - Or Peter's suggestion of checking mem_cgroup_has_dirty_limit() and, if
    per cgroup dirty control is enabled, then use per cgroup stats. In that
    case you don't have to return negative values.
  
  Only tricky part will be careful accounting so that none of the stats go
  negative in corner cases of migration etc.
 
 What do you think about Peter's suggestion + the locking stuff? (see the
 previous email). Otherwise, I'll choose the other solution; passing a
 pointer and always returning the error code is not bad.
 

Ok, so you are worried that by the time we finish the mem_cgroup_has_dirty_limit()
call, the task might change cgroup, and later we might call
mem_cgroup_get_page_stat() on a different cgroup altogether, which might or
might not have dirty limits specified?

But in what cases do you not want to use the memory-cgroup-specified limit? I
thought cgroup being disabled was the only case where we need to use global
limits. Otherwise a memory cgroup will have either dirty_bytes specified,
or by default it inherits the global dirty_ratio, which is a valid number. If
that's the case, then you don't have to take rcu_lock() outside
get_page_stat()?

IOW, apart from cgroup being disabled, what are the other cases where you
expect to not use the cgroup's page stats and use global stats?

Thanks
Vivek
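
For reference, the structure being debated — a hypothetical sketch in which a
single RCU read section covers both the limit check and the stat read, so the
task cannot migrate between the two (the helper names follow the thread's
naming; none of this is posted code):

	static unsigned long dirty_writeback_pages(void)
	{
		unsigned long ret;

		rcu_read_lock();
		if (mem_cgroup_has_dirty_limit())
			ret = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
		else
			ret = global_page_state(NR_UNSTABLE_NFS) +
				global_page_state(NR_WRITEBACK);
		rcu_read_unlock();

		return ret;
	}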

 Thanks,
 -Andrea


[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Daisuke Nishimura
 diff --git a/mm/filemap.c b/mm/filemap.c
 index fe09e51..f85acae 100644
 --- a/mm/filemap.c
 +++ b/mm/filemap.c
 @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
 	 * having removed the page entirely.
 	 */
 	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
 +		mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1);
 		dec_zone_page_state(page, NR_FILE_DIRTY);
 		dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
 	}
(snip)
 @@ -1096,6 +1113,7 @@ int __set_page_dirty_no_writeback(struct page *page)
  void account_page_dirtied(struct page *page, struct address_space *mapping)
  {
 	if (mapping_cap_account_dirty(mapping)) {
 +		mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, 1);
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
 		task_dirty_inc(current);
As far as I can see, those two functions (at least) call mem_cgroup_update_stat(),
which acquires the page cgroup lock, under mapping->tree_lock.
But as I fixed before in commit e767e056, the page cgroup lock must not be
acquired under
mapping->tree_lock.
hmm, we should call those mem_cgroup_update_stat() outside mapping->tree_lock,
or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid
deadlock.


Thanks,
Daisuke Nishimura.
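
To make the nesting concrete — a comment-only sketch of the call chain in
question (the tree_lock caller is inferred from remove_from_page_cache(), not
quoted from the patch):

	/*
	 * remove_from_page_cache()
	 *   spin_lock_irq(&mapping->tree_lock);
	 *   __remove_from_page_cache(page)
	 *     mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1)
	 *       lock_page_cgroup(pc);  <-- page cgroup lock taken while
	 *                                  mapping->tree_lock is held
	 */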

On Mon,  1 Mar 2010 22:23:40 +0100, Andrea Righi ari...@develer.com wrote:
 Apply the cgroup dirty pages accounting and limiting infrastructure to
 the opportune kernel functions.
 
 Signed-off-by: Andrea Righi ari...@develer.com
 (snip)

[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread KAMEZAWA Hiroyuki
On Wed, 3 Mar 2010 11:12:38 +0900
Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:

  (snip)
 As far as I can see, those two functions (at least) call
 mem_cgroup_update_stat(),
 which acquires the page cgroup lock, under mapping->tree_lock.
 But as I fixed before in commit e767e056, the page cgroup lock must not be
 acquired under
 mapping->tree_lock.
 hmm, we should call those mem_cgroup_update_stat() outside mapping->tree_lock,
 or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid
 deadlock.
 
 
Ah, good catch! But hmm...
This account_page_dirtied() seems to be called with IRQs disabled.
As for __remove_from_page_cache(), I think page_cgroup should have its own
DIRTY flag,
then mem_cgroup_uncharge_page() can handle it automatically.

But there is no guarantee that the following never happens:
	lock_page_cgroup()
	<=== interrupt.
		-> mapping->tree_lock()
Even if mapping->tree_lock is held with IRQs disabled.
Then, if we add local_irq_save(), we have to add it to all lock_page_cgroup().

Then, hm... some kind of new trick? as..
(Following patch is not tested!!)

==
---
 include/linux/page_cgroup.h |   14 ++
 mm/memcontrol.c |   27 +--
 2 files changed, 31 insertions(+), 10 deletions(-)

Index: mmotm-2.6.33-Feb11/include/linux/page_cgroup.h
===
--- mmotm-2.6.33-Feb11.orig/include/linux/page_cgroup.h
+++ mmotm-2.6.33-Feb11/include/linux/page_cgroup.h
@@ -39,6 +39,7 @@ enum {
PCG_CACHE, /* charged as cache */
PCG_USED, /* this object is in use. */
PCG_ACCT_LRU, /* page has been accounted for */
+   PCG_MIGRATE, /* page cgroup is under memcg account migration */
 };
 
 #define TESTPCGFLAG(uname, lname)  \
@@ -73,6 +74,8 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
 TESTPCGFLAG(AcctLRU, ACCT_LRU)
 TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
 
+TESTPCGFLAG(Migrate, MIGRATE)
+
 static inline int page_cgroup_nid(struct page_cgroup *pc)
 {
	return page_to_nid(pc->page);
@@ -93,6 +96,17 @@ static inline void unlock_page_cgroup(st
	bit_spin_unlock(PCG_LOCK, &pc->flags);
 }
 
+static inline unsigned long page_cgroup_migration_lock(struct page_cgroup *pc)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	bit_spin_lock(PCG_MIGRATE, &pc->flags);
+	return flags;
+}
+static inline void
+page_cgroup_migration_unlock(struct page_cgroup *pc, unsigned long flags)
+{
+	bit_spin_unlock(PCG_MIGRATE, &pc->flags);
+	local_irq_restore(flags);
+}
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct page_cgroup;
 
Index: mmotm-2.6.33-Feb11/mm/memcontrol.c
===
--- mmotm-2.6.33-Feb11.orig/mm/memcontrol.c
+++ mmotm-2.6.33-Feb11/mm/memcontrol.c
@@ -1321,7 +1321,7 @@ bool mem_cgroup_handle_oom(struct mem_cg
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
  */
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+void mem_cgroup_update_file_mapped(struct page *page, int val, int locked)
 {
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
@@ -1329,22 +1329,27 @@ void mem_cgroup_update_file_mapped(struc
	pc = lookup_page_cgroup(page);
	if (unlikely(!pc))
		return;
-
-	lock_page_cgroup(pc);
+	/*
+	 * if locked==1, mapping->tree_lock is held. We don't have to take
+	 * care of charge/uncharge. just think about migration.
+	 */
+	if (!locked)
+		lock_page_cgroup(pc);
+	else
+		page_cgroup_migration_lock(pc);
	mem = pc->mem_cgroup;
-	if (!mem)
+	if (!mem || !PageCgroupUsed(pc))
		goto done;
-
-	if (!PageCgroupUsed(pc))
-		goto done;
-
	/*
	 * Preemption is already disabled. We can use __this_cpu_xxx
	 */
	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
 
 done:
-	

[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread Daisuke Nishimura
On Wed, 3 Mar 2010 12:29:06 +0900, KAMEZAWA Hiroyuki 
kamezawa.hir...@jp.fujitsu.com wrote:
 On Wed, 3 Mar 2010 11:12:38 +0900
 Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:
 
   (snip)
  As far as I can see, those two functions (at least) call
  mem_cgroup_update_stat(),
  which acquires the page cgroup lock, under mapping->tree_lock.
  But as I fixed before in commit e767e056, the page cgroup lock must not be
  acquired under
  mapping->tree_lock.
  hmm, we should call those mem_cgroup_update_stat() outside
  mapping->tree_lock,
  or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid
  deadlock.
  
 Ah, good catch! But hmm...
 This account_page_dirtied() seems to be called with IRQs disabled.
 As for __remove_from_page_cache(), I think page_cgroup should have its own
 DIRTY flag,
 then mem_cgroup_uncharge_page() can handle it automatically.
 
 But there is no guarantee that the following never happens:
 	lock_page_cgroup()
 	<=== interrupt.
 		-> mapping->tree_lock()
 Even if mapping->tree_lock is held with IRQs disabled.
 Then, if we add local_irq_save(), we have to add it to all lock_page_cgroup().
 
 Then, hm... some kind of new trick? as..
 (Following patch is not tested!!)
 
If we can verify that all callers of mem_cgroup_update_stat() have always either
acquired or not acquired tree_lock, this direction will work fine.
But if we can't, we have to add local_irq_save() to lock_page_cgroup() like
below.

===
 include/linux/page_cgroup.h |8 ++--
 mm/memcontrol.c |   43 +--
 2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 30b0813..51da916 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -83,15 +83,19 @@ static inline enum zone_type page_cgroup_zid(struct
page_cgroup *pc)
	return page_zonenum(pc->page);
 }
 
-static inline void lock_page_cgroup(struct page_cgroup *pc)
+static inline void __lock_page_cgroup(struct page_cgroup *pc)
 {
	bit_spin_lock(PCG_LOCK, &pc->flags);
 }
+#define lock_page_cgroup(pc, flags) \
+	do { local_irq_save(flags); __lock_page_cgroup(pc); } while (0)
 
-static inline void unlock_page_cgroup(struct page_cgroup *pc)
+static inline void __unlock_page_cgroup(struct page_cgroup *pc)
 {
	bit_spin_unlock(PCG_LOCK, &pc->flags);
 }
+#define unlock_page_cgroup(pc, flags) \
+	do { __unlock_page_cgroup(pc); local_irq_restore(flags); } while (0)
 
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct page_cgroup;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 00ed4b1..40b9be4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1327,12 +1327,13 @@ void mem_cgroup_update_file_mapped(struct page *page,
int val)
 {
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
+	unsigned long flags;
 
	pc = lookup_page_cgroup(page);
	if (unlikely(!pc))
		return;
 
-	lock_page_cgroup(pc);
+	lock_page_cgroup(pc, flags);
	mem = pc->mem_cgroup;
	if (!mem)
		goto done;
@@ -1346,7 +1347,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int
val)
	__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
 
 done:
-	unlock_page_cgroup(pc);
+	unlock_page_cgroup(pc, flags);
 }
 /*
@@ -1680,11 +1681,12 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct
page *page)
	struct page_cgroup *pc;
	unsigned short id;
	swp_entry_t ent;
+	unsigned long flags;
 
	VM_BUG_ON(!PageLocked(page));
 
	pc = lookup_page_cgroup(page);
-	lock_page_cgroup(pc);
+	lock_page_cgroup(pc, flags);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		if (mem && !css_tryget(&mem->css))
@@ -1698,7 +1700,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct
page *page)
		mem = NULL;
		rcu_read_unlock();
	}
-	

[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-02 Thread KAMEZAWA Hiroyuki
On Wed, 3 Mar 2010 15:01:37 +0900
Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:

 On Wed, 3 Mar 2010 12:29:06 +0900, KAMEZAWA Hiroyuki 
 kamezawa.hir...@jp.fujitsu.com wrote:
  On Wed, 3 Mar 2010 11:12:38 +0900
  Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote:
  
    (snip)
   As far as I can see, those two functions (at least) call
   mem_cgroup_update_stat(),
   which acquires the page cgroup lock, under mapping->tree_lock.
   But as I fixed before in commit e767e056, the page cgroup lock must not be
   acquired under
   mapping->tree_lock.
   hmm, we should call those mem_cgroup_update_stat() outside
   mapping->tree_lock,
   or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid
   deadlock.
   
  Ah, good catch! But hmm...
  This account_page_dirtied() seems to be called with IRQs disabled.
  As for __remove_from_page_cache(), I think page_cgroup should have its own
  DIRTY flag,
  then mem_cgroup_uncharge_page() can handle it automatically.
  
  But there is no guarantee that the following never happens:
  	lock_page_cgroup()
  	<=== interrupt.
  		-> mapping->tree_lock()
  Even if mapping->tree_lock is held with IRQs disabled.
  Then, if we add local_irq_save(), we have to add it to all
  lock_page_cgroup().
  
  Then, hm... some kind of new trick? as..
  (Following patch is not tested!!)
  
 If we can verify that all callers of mem_cgroup_update_stat() have always
 either acquired
 or not acquired tree_lock, this direction will work fine.
 But if we can't, we have to add local_irq_save() to lock_page_cgroup() like
 below.
 

Agreed.
Let's see how we can write the code in a clean way. (we have time ;)
For now, to me, disabling IRQs around lock_page_cgroup() seems a little
like overkill. What I really want is lockless code... but that seems impossible
under the current implementation.

I wonder whether the fact that the page is never uncharged under us can give us
some chances
...Hmm.

Thanks,
-Kame



[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-01 Thread Vivek Goyal
On Mon, Mar 01, 2010 at 10:23:40PM +0100, Andrea Righi wrote:
 Apply the cgroup dirty pages accounting and limiting infrastructure to
 the opportune kernel functions.
 
 Signed-off-by: Andrea Righi ari...@develer.com
 (snip)

[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation

2010-03-01 Thread KAMEZAWA Hiroyuki
On Mon,  1 Mar 2010 22:23:40 +0100
Andrea Righi ari...@develer.com wrote:

 Apply the cgroup dirty pages accounting and limiting infrastructure to
 the opportune kernel functions.
 
 Signed-off-by: Andrea Righi ari...@develer.com

Seems nice.

Hmm. the last problem is moving account between memcg.

Right ?

Thanks,
-Kame


 (snip)