[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote: @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) */ dirty_thresh += dirty_thresh / 10; /* wh... */ -if (global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK) <= dirty_thresh) - break; -congestion_wait(BLK_RW_ASYNC, HZ/10); + + dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); + if (dirty < 0) + dirty = global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK); dirty is unsigned long. As mentioned last time, above will never be true? In general these patches look ok to me. I will do some testing with these. Re-introduced the same bug. My bad. :( The value returned from mem_cgroup_page_stat() can be negative, i.e. when memory cgroup is disabled. We could simply use a long for dirty, the unit is in # of pages so s64 should be enough. Or cast dirty to long only for the check (see below). Thanks! -Andrea Signed-off-by: Andrea Righi ari...@develer.com --- mm/page-writeback.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d83f41c..dbee976 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); - if (dirty < 0) + if ((long)dirty < 0) dirty = global_page_state(NR_UNSTABLE_NFS) + global_page_state(NR_WRITEBACK); if (dirty <= dirty_thresh) ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote: On Mon, 1 Mar 2010 22:23:40 +0100 Andrea Righi ari...@develer.com wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com Seems nice. Hmm. the last problem is moving account between memcg. Right ? Correct. This was actually the last item of the TODO list. Anyway, I'm still considering if it's correct to move dirty pages when a task is migrated from a cgroup to another. Currently, dirty pages just remain in the original cgroup and are flushed depending on the original cgroup settings. That is not totally wrong... at least moving the dirty pages between memcgs should be optional (move_charge_at_immigrate?). Thanks for your ack and the detailed review! -Andrea ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, Mar 02, 2010 at 12:11:10PM +0200, Kirill A. Shutemov wrote: On Mon, Mar 1, 2010 at 11:23 PM, Andrea Righi ari...@develer.com wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com --- fs/fuse/file.c | 5 +++ fs/nfs/write.c | 4 ++ fs/nilfs2/segment.c | 10 +- mm/filemap.c | 1 + mm/page-writeback.c | 84 -- mm/rmap.c | 4 +- mm/truncate.c | 2 + 7 files changed, 76 insertions(+), 34 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a9f5e13..dbbdd53 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -11,6 +11,7 @@ #include linux/pagemap.h #include linux/slab.h #include linux/kernel.h +#include linux/memcontrol.h #include linux/sched.h #include linux/module.h @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(req-writepages_entry); dec_bdi_stat(bdi, BDI_WRITEBACK); + mem_cgroup_update_stat(req-pages[0], + MEM_CGROUP_STAT_WRITEBACK_TEMP, -1); dec_zone_page_state(req-pages[0], NR_WRITEBACK_TEMP); bdi_writeout_inc(bdi); wake_up(fi-page_waitq); @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page) req-inode = inode; inc_bdi_stat(mapping-backing_dev_info, BDI_WRITEBACK); + mem_cgroup_update_stat(tmp_page, + MEM_CGROUP_STAT_WRITEBACK_TEMP, 1); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); end_page_writeback(page); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b753242..7316f7a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req) req-wb_index, NFS_PAGE_TAG_COMMIT); spin_unlock(inode-i_lock); + mem_cgroup_update_stat(req-wb_page, MEM_CGROUP_STAT_UNSTABLE_NFS, 1); inc_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req) struct page *page = 
req-wb_page; if (test_and_clear_bit(PG_CLEAN, (req)-wb_flags)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(page, NR_UNSTABLE_NFS); dec_bdi_stat(page-mapping-backing_dev_info, BDI_UNSTABLE); return 1; @@ -1273,6 +1275,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) req = nfs_list_entry(head-next); nfs_list_remove_request(req); nfs_mark_request_commit(req); + mem_cgroup_update_stat(req-wb_page, + MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ada2f1b..aef6d13 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1660,8 +1660,11 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) } while (bh = bh-b_this_page, bh2 = bh2-b_this_page, bh != head); kunmap_atomic(kaddr, KM_USER0); - if (!TestSetPageWriteback(clone_page)) + if (!TestSetPageWriteback(clone_page)) { + mem_cgroup_update_stat(clone_page, s/clone_page/page/ mmh... shouldn't we use the same page used by TestSetPageWriteback() and inc_zone_page_state()? And #include linux/memcontrol.h is missed. OK. I'll apply your fixes and post a new version. Thanks for reviewing, -Andrea ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, Mar 02, 2010 at 01:09:24PM +0200, Kirill A. Shutemov wrote: On Tue, Mar 2, 2010 at 1:02 PM, Andrea Righi ari...@develer.com wrote: On Tue, Mar 02, 2010 at 12:11:10PM +0200, Kirill A. Shutemov wrote: On Mon, Mar 1, 2010 at 11:23 PM, Andrea Righi ari...@develer.com wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com --- fs/fuse/file.c | 5 +++ fs/nfs/write.c | 4 ++ fs/nilfs2/segment.c | 10 +- mm/filemap.c | 1 + mm/page-writeback.c | 84 -- mm/rmap.c | 4 +- mm/truncate.c | 2 + 7 files changed, 76 insertions(+), 34 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a9f5e13..dbbdd53 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -11,6 +11,7 @@ #include linux/pagemap.h #include linux/slab.h #include linux/kernel.h +#include linux/memcontrol.h #include linux/sched.h #include linux/module.h @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(req-writepages_entry); dec_bdi_stat(bdi, BDI_WRITEBACK); + mem_cgroup_update_stat(req-pages[0], + MEM_CGROUP_STAT_WRITEBACK_TEMP, -1); dec_zone_page_state(req-pages[0], NR_WRITEBACK_TEMP); bdi_writeout_inc(bdi); wake_up(fi-page_waitq); @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page) req-inode = inode; inc_bdi_stat(mapping-backing_dev_info, BDI_WRITEBACK); + mem_cgroup_update_stat(tmp_page, + MEM_CGROUP_STAT_WRITEBACK_TEMP, 1); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); end_page_writeback(page); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b753242..7316f7a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req) req-wb_index, NFS_PAGE_TAG_COMMIT); spin_unlock(inode-i_lock); + mem_cgroup_update_stat(req-wb_page, MEM_CGROUP_STAT_UNSTABLE_NFS, 1); inc_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); 
__mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req) struct page *page = req-wb_page; if (test_and_clear_bit(PG_CLEAN, (req)-wb_flags)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(page, NR_UNSTABLE_NFS); dec_bdi_stat(page-mapping-backing_dev_info, BDI_UNSTABLE); return 1; @@ -1273,6 +1275,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) req = nfs_list_entry(head-next); nfs_list_remove_request(req); nfs_mark_request_commit(req); + mem_cgroup_update_stat(req-wb_page, + MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ada2f1b..aef6d13 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1660,8 +1660,11 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) } while (bh = bh-b_this_page, bh2 = bh2-b_this_page, bh != head); kunmap_atomic(kaddr, KM_USER0); - if (!TestSetPageWriteback(clone_page)) + if (!TestSetPageWriteback(clone_page)) { + mem_cgroup_update_stat(clone_page, s/clone_page/page/ mmh... shouldn't we use the same page used by TestSetPageWriteback() and inc_zone_page_state()? Sorry, I've commented wrong hunk. It's for the next one. Yes. Good catch! Will fix in the next version. Thanks, -Andrea ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, Mar 02, 2010 at 02:48:56PM +0100, Peter Zijlstra wrote: On Mon, 2010-03-01 at 22:23 +0100, Andrea Righi wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com --- diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5a0f8f3..d83f41c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -137,13 +137,14 @@ static struct prop_descriptor vm_dirties; */ static int calc_period_shift(void) { - unsigned long dirty_total; + unsigned long dirty_total, dirty_bytes; - if (vm_dirty_bytes) - dirty_total = vm_dirty_bytes / PAGE_SIZE; + dirty_bytes = mem_cgroup_dirty_bytes(); + if (dirty_bytes) So you don't think 0 is a valid max dirty amount? A value of 0 means disabled. It's used to select between dirty_ratio or dirty_bytes. It's the same for the gloabl vm_dirty_* parameters. + dirty_total = dirty_bytes / PAGE_SIZE; else - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / - 100; + dirty_total = (mem_cgroup_dirty_ratio() * + determine_dirtyable_memory()) / 100; return 2 + ilog2(dirty_total - 1); } @@ -408,14 +409,16 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) */ unsigned long determine_dirtyable_memory(void) { - unsigned long x; - - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); + unsigned long memory; + s64 memcg_memory; + memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); if (!vm_highmem_is_dirtyable) - x -= highmem_dirtyable_memory(x); - - return x + 1; /* Ensure that we never return 0 */ + memory -= highmem_dirtyable_memory(memory); + memcg_memory = mem_cgroup_page_stat(MEMCG_NR_DIRTYABLE_PAGES); + if (memcg_memory 0) And here you somehow return negative? 
+ return memory + 1; + return min((unsigned long)memcg_memory, memory + 1); } void @@ -423,26 +426,28 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, unsigned long *pbdi_dirty, struct backing_dev_info *bdi) { unsigned long background; - unsigned long dirty; + unsigned long dirty, dirty_bytes, dirty_background; unsigned long available_memory = determine_dirtyable_memory(); struct task_struct *tsk; - if (vm_dirty_bytes) - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + dirty_bytes = mem_cgroup_dirty_bytes(); + if (dirty_bytes) zero not valid again + dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE); else { int dirty_ratio; - dirty_ratio = vm_dirty_ratio; + dirty_ratio = mem_cgroup_dirty_ratio(); if (dirty_ratio 5) dirty_ratio = 5; dirty = (dirty_ratio * available_memory) / 100; } - if (dirty_background_bytes) - background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + dirty_background = mem_cgroup_dirty_background_bytes(); + if (dirty_background) idem + background = DIV_ROUND_UP(dirty_background, PAGE_SIZE); else - background = (dirty_background_ratio * available_memory) / 100; - + background = (mem_cgroup_dirty_background_ratio() * + available_memory) / 100; if (background = dirty) background = dirty / 2; tsk = current; @@ -508,9 +513,13 @@ static void balance_dirty_pages(struct address_space *mapping, get_dirty_limits(background_thresh, dirty_thresh, bdi_thresh, bdi); - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + + nr_reclaimable = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES); + nr_writeback = mem_cgroup_page_stat(MEMCG_NR_WRITEBACK); + if ((nr_reclaimable 0) || (nr_writeback 0)) { + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); ??? why would a page_state be negative.. I see you return -ENOMEM on ! cgroup, but how can one specify no dirty limit with this compiled in? 
- nr_writeback = global_page_state(NR_WRITEBACK); + nr_writeback = global_page_state(NR_WRITEBACK); + } bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); if (bdi_cap_account_unstable(bdi)) { @@ -611,10 +620,12 @@ static void balance_dirty_pages(struct address_space *mapping, * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ + nr_reclaimable =
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, Mar 02, 2010 at 07:20:26PM +0530, Balbir Singh wrote: * KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com [2010-03-02 17:23:16]: On Tue, 2 Mar 2010 09:01:58 +0100 Andrea Righi ari...@develer.com wrote: On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote: On Mon, 1 Mar 2010 22:23:40 +0100 Andrea Righi ari...@develer.com wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com Seems nice. Hmm. the last problem is moving account between memcg. Right ? Correct. This was actually the last item of the TODO list. Anyway, I'm still considering if it's correct to move dirty pages when a task is migrated from a cgroup to another. Currently, dirty pages just remain in the original cgroup and are flushed depending on the original cgroup settings. That is not totally wrong... at least moving the dirty pages between memcgs should be optional (move_charge_at_immigrate?). My concern is - migration between memcg is already suppoted - at task move - at rmdir Then, if you leave DIRTY_PAGE accounting to original cgroup, the new cgroup (migration target)'s Dirty page accounting may goes to be negative, or incorrect value. Please check FILE_MAPPED implementation in __mem_cgroup_move_account() As if (page_mapped(page) !PageAnon(page)) { /* Update mapped_file data for mem_cgroup */ preempt_disable(); __this_cpu_dec(from-stat-count[MEM_CGROUP_STAT_FILE_MAPPED]); __this_cpu_inc(to-stat-count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } then, FILE_MAPPED never goes negative. Absolutely! I am not sure how complex dirty memory migration will be, but one way of working around it would be to disable migration of charges when the feature is enabled (dirty* is set in the memory cgroup). We might need additional logic to allow that to happen. I've started to look at dirty memory migration. First attempt is to add DIRTY, WRITEBACK, etc. 
to page_cgroup flags and handle them in __mem_cgroup_move_account(). Probably I'll have something ready for the next version of the patch. I still need to figure if this can work as expected... -Andrea ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, Mar 02, 2010 at 10:05:29AM -0500, Vivek Goyal wrote: On Mon, Mar 01, 2010 at 11:18:31PM +0100, Andrea Righi wrote: On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote: @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) */ dirty_thresh += dirty_thresh / 10; /* wh... */ -if (global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK) = dirty_thresh) - break; -congestion_wait(BLK_RW_ASYNC, HZ/10); + + dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); + if (dirty 0) + dirty = global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK); dirty is unsigned long. As mentioned last time, above will never be true? In general these patches look ok to me. I will do some testing with these. Re-introduced the same bug. My bad. :( The value returned from mem_cgroup_page_stat() can be negative, i.e. when memory cgroup is disabled. We could simply use a long for dirty, the unit is in # of pages so s64 should be enough. Or cast dirty to long only for the check (see below). Thanks! -Andrea Signed-off-by: Andrea Righi ari...@develer.com --- mm/page-writeback.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d83f41c..dbee976 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); - if (dirty 0) + if ((long)dirty 0) This will also be problematic as on 32bit systems, your uppper limit of dirty memory will be 2G? I guess, I will prefer one of the two. - return the error code from function and pass a pointer to store stats in as function argument. - Or Peter's suggestion of checking mem_cgroup_has_dirty_limit() and if per cgroup dirty control is enabled, then use per cgroup stats. In that case you don't have to return negative values. 
Only tricky part will be careful accouting so that none of the stats go negative in corner cases of migration etc. What do you think about Peter's suggestion + the locking stuff? (see the previous email). Otherwise, I'll choose the other solution, passing a pointer and always return the error code is not bad. Thanks, -Andrea ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, Mar 02, 2010 at 06:59:32PM -0500, Vivek Goyal wrote: On Tue, Mar 02, 2010 at 11:22:48PM +0100, Andrea Righi wrote: On Tue, Mar 02, 2010 at 10:05:29AM -0500, Vivek Goyal wrote: On Mon, Mar 01, 2010 at 11:18:31PM +0100, Andrea Righi wrote: On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote: @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) */ dirty_thresh += dirty_thresh / 10; /* wh... */ -if (global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK) = dirty_thresh) - break; -congestion_wait(BLK_RW_ASYNC, HZ/10); + + dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); + if (dirty 0) + dirty = global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK); dirty is unsigned long. As mentioned last time, above will never be true? In general these patches look ok to me. I will do some testing with these. Re-introduced the same bug. My bad. :( The value returned from mem_cgroup_page_stat() can be negative, i.e. when memory cgroup is disabled. We could simply use a long for dirty, the unit is in # of pages so s64 should be enough. Or cast dirty to long only for the check (see below). Thanks! -Andrea Signed-off-by: Andrea Righi ari...@develer.com --- mm/page-writeback.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d83f41c..dbee976 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); - if (dirty 0) + if ((long)dirty 0) This will also be problematic as on 32bit systems, your uppper limit of dirty memory will be 2G? I guess, I will prefer one of the two. - return the error code from function and pass a pointer to store stats in as function argument. - Or Peter's suggestion of checking mem_cgroup_has_dirty_limit() and if per cgroup dirty control is enabled, then use per cgroup stats. 
In that case you don't have to return negative values. Only tricky part will be careful accouting so that none of the stats go negative in corner cases of migration etc. What do you think about Peter's suggestion + the locking stuff? (see the previous email). Otherwise, I'll choose the other solution, passing a pointer and always return the error code is not bad. Ok, so you are worried about that by the we finish mem_cgroup_has_dirty_limit() call, task might change cgroup and later we might call mem_cgroup_get_page_stat() on a different cgroup altogether which might or might not have dirty limits specified? Correct. But in what cases you don't want to use memory cgroup specified limit? I thought cgroup disabled what the only case where we need to use global limits. Otherwise a memory cgroup will have either dirty_bytes specified or by default inherit global dirty_ratio which is a valid number. If that's the case then you don't have to take rcu_lock() outside get_page_stat()? IOW, apart from cgroup being disabled, what are the other cases where you expect to not use cgroup's page stat and use global stats? At boot, when mem_cgroup_from_task() may return NULL. But this is not related to the RCU acquisition. Anyway, probably the RCU protection is not so critical for this particular case, and we can simply get rid of it. In this way we can easily implement the interface proposed by Peter. -Andrea ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Wed, Mar 03, 2010 at 08:21:07AM +0900, Daisuke Nishimura wrote: On Tue, 2 Mar 2010 23:18:23 +0100, Andrea Righi ari...@develer.com wrote: On Tue, Mar 02, 2010 at 07:20:26PM +0530, Balbir Singh wrote: * KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com [2010-03-02 17:23:16]: On Tue, 2 Mar 2010 09:01:58 +0100 Andrea Righi ari...@develer.com wrote: On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote: On Mon, 1 Mar 2010 22:23:40 +0100 Andrea Righi ari...@develer.com wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com Seems nice. Hmm. the last problem is moving account between memcg. Right ? Correct. This was actually the last item of the TODO list. Anyway, I'm still considering if it's correct to move dirty pages when a task is migrated from a cgroup to another. Currently, dirty pages just remain in the original cgroup and are flushed depending on the original cgroup settings. That is not totally wrong... at least moving the dirty pages between memcgs should be optional (move_charge_at_immigrate?). My concern is - migration between memcg is already suppoted - at task move - at rmdir Then, if you leave DIRTY_PAGE accounting to original cgroup, the new cgroup (migration target)'s Dirty page accounting may goes to be negative, or incorrect value. Please check FILE_MAPPED implementation in __mem_cgroup_move_account() As if (page_mapped(page) !PageAnon(page)) { /* Update mapped_file data for mem_cgroup */ preempt_disable(); __this_cpu_dec(from-stat-count[MEM_CGROUP_STAT_FILE_MAPPED]); __this_cpu_inc(to-stat-count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } then, FILE_MAPPED never goes negative. Absolutely! I am not sure how complex dirty memory migration will be, but one way of working around it would be to disable migration of charges when the feature is enabled (dirty* is set in the memory cgroup). 
We might need additional logic to allow that to happen. I've started to look at dirty memory migration. First attempt is to add DIRTY, WRITEBACK, etc. to page_cgroup flags and handle them in __mem_cgroup_move_account(). Probably I'll have something ready for the next version of the patch. I still need to figure if this can work as expected... I agree it's a right direction(in fact, I have been planning to post a patch in that direction), so I leave it to you. Can you add PCG_FILE_MAPPED flag too ? I think this flag can be handled in the same way as other flags you're trying to add, and we can change if (page_mapped(page) !PageAnon(page)) to if (PageCgroupFileMapped(pc) in __mem_cgroup_move_account(). It would be cleaner than current code, IMHO. OK, sounds good to me. I'll introduce PCG_FILE_MAPPED in the next version. Thanks, -Andrea ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Wed, Mar 03, 2010 at 05:21:32PM +0900, KAMEZAWA Hiroyuki wrote: On Wed, 3 Mar 2010 15:15:49 +0900 KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com wrote: Agreed. Let's try how we can write a code in clean way. (we have time ;) For now, to me, IRQ disabling while lock_page_cgroup() seems to be a little over killing. What I really want is lockless code...but it seems impossible under current implementation. I wonder the fact the page is never unchareged under us can give us some chances ...Hmm. How about this ? Basically, I don't like duplicating information...so, # of new pcg_flags may be able to be reduced. I'm glad this can be a hint for Andrea-san. Many thanks! I already wrote pretty the same code, but at this point I think I'll just apply and test this one. ;) -Andrea == --- include/linux/page_cgroup.h | 44 - mm/memcontrol.c | 91 +++- 2 files changed, 132 insertions(+), 3 deletions(-) Index: mmotm-2.6.33-Mar2/include/linux/page_cgroup.h === --- mmotm-2.6.33-Mar2.orig/include/linux/page_cgroup.h +++ mmotm-2.6.33-Mar2/include/linux/page_cgroup.h @@ -39,6 +39,11 @@ enum { PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. 
*/ PCG_ACCT_LRU, /* page has been accounted for */ + PCG_MIGRATE_LOCK, /* used for mutual execution of account migration */ + PCG_ACCT_DIRTY, + PCG_ACCT_WB, + PCG_ACCT_WB_TEMP, + PCG_ACCT_UNSTABLE, }; #define TESTPCGFLAG(uname, lname)\ @@ -73,6 +78,23 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU) TESTPCGFLAG(AcctLRU, ACCT_LRU) TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) +SETPCGFLAG(AcctDirty, ACCT_DIRTY); +CLEARPCGFLAG(AcctDirty, ACCT_DIRTY); +TESTPCGFLAG(AcctDirty, ACCT_DIRTY); + +SETPCGFLAG(AcctWB, ACCT_WB); +CLEARPCGFLAG(AcctWB, ACCT_WB); +TESTPCGFLAG(AcctWB, ACCT_WB); + +SETPCGFLAG(AcctWBTemp, ACCT_WB_TEMP); +CLEARPCGFLAG(AcctWBTemp, ACCT_WB_TEMP); +TESTPCGFLAG(AcctWBTemp, ACCT_WB_TEMP); + +SETPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE); +CLEARPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE); +TESTPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE); + + static inline int page_cgroup_nid(struct page_cgroup *pc) { return page_to_nid(pc-page); @@ -82,7 +104,9 @@ static inline enum zone_type page_cgroup { return page_zonenum(pc-page); } - +/* + * lock_page_cgroup() should not be held under mapping-tree_lock + */ static inline void lock_page_cgroup(struct page_cgroup *pc) { bit_spin_lock(PCG_LOCK, pc-flags); @@ -93,6 +117,24 @@ static inline void unlock_page_cgroup(st bit_spin_unlock(PCG_LOCK, pc-flags); } +/* + * Lock order is + * lock_page_cgroup() + * lock_page_cgroup_migrate() + * This lock is not be lock for charge/uncharge but for account moving. + * i.e. overwrite pc-mem_cgroup. The lock owner should guarantee by itself + * the page is uncharged while we hold this. 
+ */ +static inline void lock_page_cgroup_migrate(struct page_cgroup *pc) +{ + bit_spin_lock(PCG_MIGRATE_LOCK, pc-flags); +} + +static inline void unlock_page_cgroup_migrate(struct page_cgroup *pc) +{ + bit_spin_unlock(PCG_MIGRATE_LOCK, pc-flags); +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; Index: mmotm-2.6.33-Mar2/mm/memcontrol.c === --- mmotm-2.6.33-Mar2.orig/mm/memcontrol.c +++ mmotm-2.6.33-Mar2/mm/memcontrol.c @@ -87,6 +87,10 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ + MEM_CGROUP_STAT_DIRTY, + MEM_CGROUP_STAT_WBACK, + MEM_CGROUP_STAT_WBACK_TEMP, + MEM_CGROUP_STAT_UNSTABLE_NFS, MEM_CGROUP_STAT_NSTATS, }; @@ -1360,6 +1364,86 @@ done: } /* + * Update file cache's status for memcg. Before calling this, + * mapping-tree_lock should be held and preemption is disabled. + * Then, it's guarnteed that the page is not uncharged while we + * access page_cgroup. We can make use of that. + */ +void mem_cgroup_update_stat_locked(struct page *page, int idx, bool set) +{ + struct page_cgroup *pc; + struct mem_cgroup *mem; + + pc = lookup_page_cgroup(page); + /* Not accounted ? */ + if (!PageCgroupUsed(pc)) + return; + lock_page_cgroup_migrate(pc); + /* + * It's guarnteed that this page is never uncharged. + * The only racy problem is moving account among memcgs. + */ + switch (idx) { + case MEM_CGROUP_STAT_DIRTY: + if (set) + SetPageCgroupAcctDirty(pc); + else + ClearPageCgroupAcctDirty(pc); +
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Wed, Mar 03, 2010 at 12:47:03PM +0100, Andrea Righi wrote: On Tue, Mar 02, 2010 at 06:59:32PM -0500, Vivek Goyal wrote: On Tue, Mar 02, 2010 at 11:22:48PM +0100, Andrea Righi wrote: On Tue, Mar 02, 2010 at 10:05:29AM -0500, Vivek Goyal wrote: On Mon, Mar 01, 2010 at 11:18:31PM +0100, Andrea Righi wrote: On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote: @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) */ dirty_thresh += dirty_thresh / 10; /* wh... */ -if (global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK) = dirty_thresh) - break; -congestion_wait(BLK_RW_ASYNC, HZ/10); + + dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); + if (dirty 0) + dirty = global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK); dirty is unsigned long. As mentioned last time, above will never be true? In general these patches look ok to me. I will do some testing with these. Re-introduced the same bug. My bad. :( The value returned from mem_cgroup_page_stat() can be negative, i.e. when memory cgroup is disabled. We could simply use a long for dirty, the unit is in # of pages so s64 should be enough. Or cast dirty to long only for the check (see below). Thanks! -Andrea Signed-off-by: Andrea Righi ari...@develer.com --- mm/page-writeback.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d83f41c..dbee976 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); - if (dirty 0) + if ((long)dirty 0) This will also be problematic as on 32bit systems, your uppper limit of dirty memory will be 2G? I guess, I will prefer one of the two. - return the error code from function and pass a pointer to store stats in as function argument. 
- Or Peter's suggestion of checking mem_cgroup_has_dirty_limit() and if per cgroup dirty control is enabled, then use per cgroup stats. In that case you don't have to return negative values. Only tricky part will be careful accounting so that none of the stats go negative in corner cases of migration etc. What do you think about Peter's suggestion + the locking stuff? (see the previous email). Otherwise, I'll choose the other solution, passing a pointer and always return the error code is not bad. Ok, so you are worried that by the time we finish the mem_cgroup_has_dirty_limit() call, the task might change cgroup and later we might call mem_cgroup_get_page_stat() on a different cgroup altogether which might or might not have dirty limits specified? Correct. But in what cases you don't want to use memory cgroup specified limit? I thought cgroup disabled was the only case where we need to use global limits. Otherwise a memory cgroup will have either dirty_bytes specified or by default inherit global dirty_ratio which is a valid number. If that's the case then you don't have to take rcu_lock() outside get_page_stat()? IOW, apart from cgroup being disabled, what are the other cases where you expect to not use cgroup's page stat and use global stats? At boot, when mem_cgroup_from_task() may return NULL. But this is not related to the RCU acquisition. Nevermind. You're right. In any case even if a task is migrated to a different cgroup it will always have mem_cgroup_has_dirty_limit() == true. So RCU protection is not needed outside these functions. OK, I'll go with Peter's suggestion. Thanks! -Andrea ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Wed, Mar 03, 2010 at 11:07:35AM +0100, Peter Zijlstra wrote: On Tue, 2010-03-02 at 23:14 +0100, Andrea Righi wrote: I agree mem_cgroup_has_dirty_limit() is nicer. But we must do that under RCU, so something like: rcu_read_lock(); if (mem_cgroup_has_dirty_limit()) mem_cgroup_get_page_stat() else global_page_state() rcu_read_unlock(); That is bad when mem_cgroup_has_dirty_limit() always returns false (e.g., when memory cgroups are disabled). So I fallback to the old interface. Why is it that mem_cgroup_has_dirty_limit() needs RCU when mem_cgroup_get_page_stat() doesn't? That is, simply make mem_cgroup_has_dirty_limit() not require RCU in the same way *_get_page_stat() doesn't either. OK, I agree we can get rid of RCU protection here (see my previous email). BTW the point was that after mem_cgroup_has_dirty_limit() the task might be moved to another cgroup, but also in this case mem_cgroup_has_dirty_limit() will be always true, so mem_cgroup_get_page_stat() is always coherent. What do you think about: mem_cgroup_lock(); if (mem_cgroup_has_dirty_limit()) mem_cgroup_get_page_stat() else global_page_state() mem_cgroup_unlock(); Where mem_cgroup_read_lock/unlock() simply expand to nothing when memory cgroups are disabled. I think you're engineering the wrong way around. That allows for a 0 dirty limit (which should work and basically makes all io synchronous). IMHO it is better to reserve 0 for the special value disabled like the global settings. A synchronous IO can be also achieved using a dirty limit of 1. Why?! 0 clearly states no writeback cache, IOW sync writes, a 1 byte/page writeback cache effectively reduces to the same thing, but its not the same thing conceptually. If you want to put the size and enable into a single variable pick -1 for disable or so. I might agree, and actually I prefer this solution.. 
but in this way we would use a different interface with respect to the equivalent vm_dirty_ratio / vm_dirty_bytes global settings (as well as dirty_background_ratio / dirty_background_bytes). IMHO it's better to use the same interface to avoid user misunderstandings. Thanks, -Andrea ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Wed, Mar 03, 2010 at 05:21:32PM +0900, KAMEZAWA Hiroyuki wrote: On Wed, 3 Mar 2010 15:15:49 +0900 KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com wrote: Agreed. Let's try how we can write a code in clean way. (we have time ;) For now, to me, IRQ disabling while lock_page_cgroup() seems to be a little over killing. What I really want is lockless code...but it seems impossible under current implementation. I wonder the fact the page is never unchareged under us can give us some chances ...Hmm. How about this ? Basically, I don't like duplicating information...so, # of new pcg_flags may be able to be reduced. I'm glad this can be a hint for Andrea-san. == --- include/linux/page_cgroup.h | 44 - mm/memcontrol.c | 91 +++- 2 files changed, 132 insertions(+), 3 deletions(-) Index: mmotm-2.6.33-Mar2/include/linux/page_cgroup.h === --- mmotm-2.6.33-Mar2.orig/include/linux/page_cgroup.h +++ mmotm-2.6.33-Mar2/include/linux/page_cgroup.h @@ -39,6 +39,11 @@ enum { PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. 
*/ PCG_ACCT_LRU, /* page has been accounted for */ + PCG_MIGRATE_LOCK, /* used for mutual execution of account migration */ + PCG_ACCT_DIRTY, + PCG_ACCT_WB, + PCG_ACCT_WB_TEMP, + PCG_ACCT_UNSTABLE, }; #define TESTPCGFLAG(uname, lname)\ @@ -73,6 +78,23 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU) TESTPCGFLAG(AcctLRU, ACCT_LRU) TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) +SETPCGFLAG(AcctDirty, ACCT_DIRTY); +CLEARPCGFLAG(AcctDirty, ACCT_DIRTY); +TESTPCGFLAG(AcctDirty, ACCT_DIRTY); + +SETPCGFLAG(AcctWB, ACCT_WB); +CLEARPCGFLAG(AcctWB, ACCT_WB); +TESTPCGFLAG(AcctWB, ACCT_WB); + +SETPCGFLAG(AcctWBTemp, ACCT_WB_TEMP); +CLEARPCGFLAG(AcctWBTemp, ACCT_WB_TEMP); +TESTPCGFLAG(AcctWBTemp, ACCT_WB_TEMP); + +SETPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE); +CLEARPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE); +TESTPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE); + + static inline int page_cgroup_nid(struct page_cgroup *pc) { return page_to_nid(pc-page); @@ -82,7 +104,9 @@ static inline enum zone_type page_cgroup { return page_zonenum(pc-page); } - +/* + * lock_page_cgroup() should not be held under mapping-tree_lock + */ static inline void lock_page_cgroup(struct page_cgroup *pc) { bit_spin_lock(PCG_LOCK, pc-flags); @@ -93,6 +117,24 @@ static inline void unlock_page_cgroup(st bit_spin_unlock(PCG_LOCK, pc-flags); } +/* + * Lock order is + * lock_page_cgroup() + * lock_page_cgroup_migrate() + * This lock is not be lock for charge/uncharge but for account moving. + * i.e. overwrite pc-mem_cgroup. The lock owner should guarantee by itself + * the page is uncharged while we hold this. 
+ */ +static inline void lock_page_cgroup_migrate(struct page_cgroup *pc) +{ + bit_spin_lock(PCG_MIGRATE_LOCK, pc-flags); +} + +static inline void unlock_page_cgroup_migrate(struct page_cgroup *pc) +{ + bit_spin_unlock(PCG_MIGRATE_LOCK, pc-flags); +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; Index: mmotm-2.6.33-Mar2/mm/memcontrol.c === --- mmotm-2.6.33-Mar2.orig/mm/memcontrol.c +++ mmotm-2.6.33-Mar2/mm/memcontrol.c @@ -87,6 +87,10 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ + MEM_CGROUP_STAT_DIRTY, + MEM_CGROUP_STAT_WBACK, + MEM_CGROUP_STAT_WBACK_TEMP, + MEM_CGROUP_STAT_UNSTABLE_NFS, MEM_CGROUP_STAT_NSTATS, }; @@ -1360,6 +1364,86 @@ done: } /* + * Update file cache's status for memcg. Before calling this, + * mapping-tree_lock should be held and preemption is disabled. + * Then, it's guarnteed that the page is not uncharged while we + * access page_cgroup. We can make use of that. + */ +void mem_cgroup_update_stat_locked(struct page *page, int idx, bool set) +{ + struct page_cgroup *pc; + struct mem_cgroup *mem; + + pc = lookup_page_cgroup(page); + /* Not accounted ? */ + if (!PageCgroupUsed(pc)) + return; + lock_page_cgroup_migrate(pc); + /* + * It's guarnteed that this page is never uncharged. + * The only racy problem is moving account among memcgs. + */ + switch (idx) { + case MEM_CGROUP_STAT_DIRTY: + if (set) + SetPageCgroupAcctDirty(pc); + else + ClearPageCgroupAcctDirty(pc); + break; + case MEM_CGROUP_STAT_WBACK: + if (set) + SetPageCgroupAcctWB(pc); +
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Wed, 3 Mar 2010 15:15:49 +0900 KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com wrote: Agreed. Let's try how we can write a code in clean way. (we have time ;) For now, to me, IRQ disabling while lock_page_cgroup() seems to be a little over killing. What I really want is lockless code...but it seems impossible under current implementation. I wonder the fact the page is never unchareged under us can give us some chances ...Hmm. How about this ? Basically, I don't like duplicating information...so, # of new pcg_flags may be able to be reduced. I'm glad this can be a hint for Andrea-san. == --- include/linux/page_cgroup.h | 44 - mm/memcontrol.c | 91 +++- 2 files changed, 132 insertions(+), 3 deletions(-) Index: mmotm-2.6.33-Mar2/include/linux/page_cgroup.h === --- mmotm-2.6.33-Mar2.orig/include/linux/page_cgroup.h +++ mmotm-2.6.33-Mar2/include/linux/page_cgroup.h @@ -39,6 +39,11 @@ enum { PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. */ PCG_ACCT_LRU, /* page has been accounted for */ + PCG_MIGRATE_LOCK, /* used for mutual execution of account migration */ + PCG_ACCT_DIRTY, + PCG_ACCT_WB, + PCG_ACCT_WB_TEMP, + PCG_ACCT_UNSTABLE, }; #define TESTPCGFLAG(uname, lname) \ @@ -73,6 +78,23 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU) TESTPCGFLAG(AcctLRU, ACCT_LRU) TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) +SETPCGFLAG(AcctDirty, ACCT_DIRTY); +CLEARPCGFLAG(AcctDirty, ACCT_DIRTY); +TESTPCGFLAG(AcctDirty, ACCT_DIRTY); + +SETPCGFLAG(AcctWB, ACCT_WB); +CLEARPCGFLAG(AcctWB, ACCT_WB); +TESTPCGFLAG(AcctWB, ACCT_WB); + +SETPCGFLAG(AcctWBTemp, ACCT_WB_TEMP); +CLEARPCGFLAG(AcctWBTemp, ACCT_WB_TEMP); +TESTPCGFLAG(AcctWBTemp, ACCT_WB_TEMP); + +SETPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE); +CLEARPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE); +TESTPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE); + + static inline int page_cgroup_nid(struct page_cgroup *pc) { return page_to_nid(pc-page); @@ -82,7 +104,9 @@ static inline enum zone_type page_cgroup { return page_zonenum(pc-page); } - +/* + * 
lock_page_cgroup() should not be held under mapping-tree_lock + */ static inline void lock_page_cgroup(struct page_cgroup *pc) { bit_spin_lock(PCG_LOCK, pc-flags); @@ -93,6 +117,24 @@ static inline void unlock_page_cgroup(st bit_spin_unlock(PCG_LOCK, pc-flags); } +/* + * Lock order is + * lock_page_cgroup() + * lock_page_cgroup_migrate() + * This lock is not be lock for charge/uncharge but for account moving. + * i.e. overwrite pc-mem_cgroup. The lock owner should guarantee by itself + * the page is uncharged while we hold this. + */ +static inline void lock_page_cgroup_migrate(struct page_cgroup *pc) +{ + bit_spin_lock(PCG_MIGRATE_LOCK, pc-flags); +} + +static inline void unlock_page_cgroup_migrate(struct page_cgroup *pc) +{ + bit_spin_unlock(PCG_MIGRATE_LOCK, pc-flags); +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; Index: mmotm-2.6.33-Mar2/mm/memcontrol.c === --- mmotm-2.6.33-Mar2.orig/mm/memcontrol.c +++ mmotm-2.6.33-Mar2/mm/memcontrol.c @@ -87,6 +87,10 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ + MEM_CGROUP_STAT_DIRTY, + MEM_CGROUP_STAT_WBACK, + MEM_CGROUP_STAT_WBACK_TEMP, + MEM_CGROUP_STAT_UNSTABLE_NFS, MEM_CGROUP_STAT_NSTATS, }; @@ -1360,6 +1364,86 @@ done: } /* + * Update file cache's status for memcg. Before calling this, + * mapping-tree_lock should be held and preemption is disabled. + * Then, it's guarnteed that the page is not uncharged while we + * access page_cgroup. We can make use of that. + */ +void mem_cgroup_update_stat_locked(struct page *page, int idx, bool set) +{ + struct page_cgroup *pc; + struct mem_cgroup *mem; + + pc = lookup_page_cgroup(page); + /* Not accounted ? */ + if (!PageCgroupUsed(pc)) + return; + lock_page_cgroup_migrate(pc); + /* +* It's guarnteed that this page is never uncharged. +* The only racy problem is moving account among memcgs. 
+*/ + switch (idx) { + case MEM_CGROUP_STAT_DIRTY: + if (set) + SetPageCgroupAcctDirty(pc); + else + ClearPageCgroupAcctDirty(pc); + break; + case MEM_CGROUP_STAT_WBACK: + if (set) + SetPageCgroupAcctWB(pc); + else + ClearPageCgroupAcctWB(pc); + break; + case MEM_CGROUP_STAT_WBACK_TEMP: +
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Wed, 3 Mar 2010 23:03:19 +0100, Andrea Righi ari...@develer.com wrote: On Wed, Mar 03, 2010 at 05:21:32PM +0900, KAMEZAWA Hiroyuki wrote: On Wed, 3 Mar 2010 15:15:49 +0900 KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com wrote: Agreed. Let's try how we can write a code in clean way. (we have time ;) For now, to me, IRQ disabling while lock_page_cgroup() seems to be a little over killing. What I really want is lockless code...but it seems impossible under current implementation. I wonder the fact the page is never unchareged under us can give us some chances ...Hmm. How about this ? Basically, I don't like duplicating information...so, # of new pcg_flags may be able to be reduced. I'm glad this can be a hint for Andrea-san. == --- include/linux/page_cgroup.h | 44 - mm/memcontrol.c | 91 +++- 2 files changed, 132 insertions(+), 3 deletions(-) Index: mmotm-2.6.33-Mar2/include/linux/page_cgroup.h === --- mmotm-2.6.33-Mar2.orig/include/linux/page_cgroup.h +++ mmotm-2.6.33-Mar2/include/linux/page_cgroup.h @@ -39,6 +39,11 @@ enum { PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. 
*/ PCG_ACCT_LRU, /* page has been accounted for */ + PCG_MIGRATE_LOCK, /* used for mutual execution of account migration */ + PCG_ACCT_DIRTY, + PCG_ACCT_WB, + PCG_ACCT_WB_TEMP, + PCG_ACCT_UNSTABLE, }; #define TESTPCGFLAG(uname, lname) \ @@ -73,6 +78,23 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU) TESTPCGFLAG(AcctLRU, ACCT_LRU) TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) +SETPCGFLAG(AcctDirty, ACCT_DIRTY); +CLEARPCGFLAG(AcctDirty, ACCT_DIRTY); +TESTPCGFLAG(AcctDirty, ACCT_DIRTY); + +SETPCGFLAG(AcctWB, ACCT_WB); +CLEARPCGFLAG(AcctWB, ACCT_WB); +TESTPCGFLAG(AcctWB, ACCT_WB); + +SETPCGFLAG(AcctWBTemp, ACCT_WB_TEMP); +CLEARPCGFLAG(AcctWBTemp, ACCT_WB_TEMP); +TESTPCGFLAG(AcctWBTemp, ACCT_WB_TEMP); + +SETPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE); +CLEARPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE); +TESTPCGFLAG(AcctUnstableNFS, ACCT_UNSTABLE); + + static inline int page_cgroup_nid(struct page_cgroup *pc) { return page_to_nid(pc-page); @@ -82,7 +104,9 @@ static inline enum zone_type page_cgroup { return page_zonenum(pc-page); } - +/* + * lock_page_cgroup() should not be held under mapping-tree_lock + */ static inline void lock_page_cgroup(struct page_cgroup *pc) { bit_spin_lock(PCG_LOCK, pc-flags); @@ -93,6 +117,24 @@ static inline void unlock_page_cgroup(st bit_spin_unlock(PCG_LOCK, pc-flags); } +/* + * Lock order is + * lock_page_cgroup() + * lock_page_cgroup_migrate() + * This lock is not be lock for charge/uncharge but for account moving. + * i.e. overwrite pc-mem_cgroup. The lock owner should guarantee by itself + * the page is uncharged while we hold this. 
+ */ +static inline void lock_page_cgroup_migrate(struct page_cgroup *pc) +{ + bit_spin_lock(PCG_MIGRATE_LOCK, pc-flags); +} + +static inline void unlock_page_cgroup_migrate(struct page_cgroup *pc) +{ + bit_spin_unlock(PCG_MIGRATE_LOCK, pc-flags); +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; Index: mmotm-2.6.33-Mar2/mm/memcontrol.c === --- mmotm-2.6.33-Mar2.orig/mm/memcontrol.c +++ mmotm-2.6.33-Mar2/mm/memcontrol.c @@ -87,6 +87,10 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ + MEM_CGROUP_STAT_DIRTY, + MEM_CGROUP_STAT_WBACK, + MEM_CGROUP_STAT_WBACK_TEMP, + MEM_CGROUP_STAT_UNSTABLE_NFS, MEM_CGROUP_STAT_NSTATS, }; @@ -1360,6 +1364,86 @@ done: } /* + * Update file cache's status for memcg. Before calling this, + * mapping-tree_lock should be held and preemption is disabled. + * Then, it's guarnteed that the page is not uncharged while we + * access page_cgroup. We can make use of that. + */ +void mem_cgroup_update_stat_locked(struct page *page, int idx, bool set) +{ + struct page_cgroup *pc; + struct mem_cgroup *mem; + + pc = lookup_page_cgroup(page); + /* Not accounted ? */ + if (!PageCgroupUsed(pc)) + return; + lock_page_cgroup_migrate(pc); + /* +* It's guarnteed that this page is never uncharged. +* The only racy problem is moving account among memcgs. +*/ + switch (idx) { + case MEM_CGROUP_STAT_DIRTY: + if (set) + SetPageCgroupAcctDirty(pc); + else +
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Wed, 3 Mar 2010 23:03:19 +0100 Andrea Righi ari...@develer.com wrote: On Wed, Mar 03, 2010 at 05:21:32PM +0900, KAMEZAWA Hiroyuki wrote: On Wed, 3 Mar 2010 15:15:49 +0900 KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com wrote: + preempt_disable(); + lock_page_cgroup_migrate(pc); page = pc->page; if (page_mapped(page) && !PageAnon(page)) { /* Update mapped_file data for mem_cgroup */ - preempt_disable(); __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); - preempt_enable(); } mem_cgroup_charge_statistics(from, pc, false); + move_acct_information(from, to, pc); Kame-san, a question. According to is_target_pte_for_mc() it seems we don't move file pages across cgroups for now. yes. It's just in plan. If !PageAnon(page) we just return 0 and the page won't be selected for migration in mem_cgroup_move_charge_pte_range(). So, if I've understood well the code is correct in perspective, but right now it's unnecessary. File pages are not moved on task migration across cgroups and, at the moment, there's no way for file page accounted statistics to go negative. Or am I missing something? At rmdir(), remaining file caches in a cgroup are moved to its parent. Then, all file caches are moved to its parent at rmdir(). This behavior is to avoid losing too many file caches when removing a cgroup. Thanks, -Kame ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, 2 Mar 2010 09:01:58 +0100, Andrea Righi ari...@develer.com wrote: On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote: On Mon, 1 Mar 2010 22:23:40 +0100 Andrea Righi ari...@develer.com wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com Seems nice. Hmm. the last problem is moving account between memcg. Right ? Correct. This was actually the last item of the TODO list. Anyway, I'm still considering if it's correct to move dirty pages when a task is migrated from a cgroup to another. Currently, dirty pages just remain in the original cgroup and are flushed depending on the original cgroup settings. That is not totally wrong... at least moving the dirty pages between memcgs should be optional (move_charge_at_immigrate?). FYI, I'm planning to add file-cache and shmem/tmpfs support for move_charge feature for 2.6.35. But, hmm, it would be complicated if we try to move dirty account too. Thanks, Daisuke Nishimura. ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, 2 Mar 2010 09:01:58 +0100 Andrea Righi ari...@develer.com wrote: On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote: On Mon, 1 Mar 2010 22:23:40 +0100 Andrea Righi ari...@develer.com wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com Seems nice. Hmm. the last problem is moving account between memcg. Right ? Correct. This was actually the last item of the TODO list. Anyway, I'm still considering if it's correct to move dirty pages when a task is migrated from a cgroup to another. Currently, dirty pages just remain in the original cgroup and are flushed depending on the original cgroup settings. That is not totally wrong... at least moving the dirty pages between memcgs should be optional (move_charge_at_immigrate?). My concern is - migration between memcg is already suppoted - at task move - at rmdir Then, if you leave DIRTY_PAGE accounting to original cgroup, the new cgroup (migration target)'s Dirty page accounting may goes to be negative, or incorrect value. Please check FILE_MAPPED implementation in __mem_cgroup_move_account() As if (page_mapped(page) !PageAnon(page)) { /* Update mapped_file data for mem_cgroup */ preempt_disable(); __this_cpu_dec(from-stat-count[MEM_CGROUP_STAT_FILE_MAPPED]); __this_cpu_inc(to-stat-count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } then, FILE_MAPPED never goes negative. Thanks, -Kame Thanks for your ack and the detailed review! -Andrea -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majord...@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: a href=mailto:d...@kvack.org; em...@kvack.org /a ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Mon, Mar 1, 2010 at 11:23 PM, Andrea Righi ari...@develer.com wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com --- fs/fuse/file.c | 5 +++ fs/nfs/write.c | 4 ++ fs/nilfs2/segment.c | 10 +- mm/filemap.c | 1 + mm/page-writeback.c | 84 -- mm/rmap.c | 4 +- mm/truncate.c | 2 + 7 files changed, 76 insertions(+), 34 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a9f5e13..dbbdd53 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -11,6 +11,7 @@ #include linux/pagemap.h #include linux/slab.h #include linux/kernel.h +#include linux/memcontrol.h #include linux/sched.h #include linux/module.h @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(req-writepages_entry); dec_bdi_stat(bdi, BDI_WRITEBACK); + mem_cgroup_update_stat(req-pages[0], + MEM_CGROUP_STAT_WRITEBACK_TEMP, -1); dec_zone_page_state(req-pages[0], NR_WRITEBACK_TEMP); bdi_writeout_inc(bdi); wake_up(fi-page_waitq); @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page) req-inode = inode; inc_bdi_stat(mapping-backing_dev_info, BDI_WRITEBACK); + mem_cgroup_update_stat(tmp_page, + MEM_CGROUP_STAT_WRITEBACK_TEMP, 1); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); end_page_writeback(page); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b753242..7316f7a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req) req-wb_index, NFS_PAGE_TAG_COMMIT); spin_unlock(inode-i_lock); + mem_cgroup_update_stat(req-wb_page, MEM_CGROUP_STAT_UNSTABLE_NFS, 1); inc_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req) struct page *page = req-wb_page; if (test_and_clear_bit(PG_CLEAN, (req)-wb_flags)) { + 
mem_cgroup_update_stat(page, MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(page, NR_UNSTABLE_NFS); dec_bdi_stat(page-mapping-backing_dev_info, BDI_UNSTABLE); return 1; @@ -1273,6 +1275,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) req = nfs_list_entry(head-next); nfs_list_remove_request(req); nfs_mark_request_commit(req); + mem_cgroup_update_stat(req-wb_page, + MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ada2f1b..aef6d13 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1660,8 +1660,11 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) } while (bh = bh-b_this_page, bh2 = bh2-b_this_page, bh != head); kunmap_atomic(kaddr, KM_USER0); - if (!TestSetPageWriteback(clone_page)) + if (!TestSetPageWriteback(clone_page)) { + mem_cgroup_update_stat(clone_page, s/clone_page/page/ And #include linux/memcontrol.h is missed. + MEM_CGROUP_STAT_WRITEBACK, 1); inc_zone_page_state(clone_page, NR_WRITEBACK); + } unlock_page(clone_page); return 0; @@ -1783,8 +1786,11 @@ static void __nilfs_end_page_io(struct page *page, int err) } if (buffer_nilfs_allocated(page_buffers(page))) { - if (TestClearPageWriteback(page)) + if (TestClearPageWriteback(page)) { + mem_cgroup_update_stat(clone_page, + MEM_CGROUP_STAT_WRITEBACK, -1); dec_zone_page_state(page, NR_WRITEBACK); + } } else end_page_writeback(page); } diff --git a/mm/filemap.c b/mm/filemap.c index fe09e51..f85acae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page) * having removed the page entirely. */ if (PageDirty(page) mapping_cap_account_dirty(mapping)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1); dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping-backing_dev_info, BDI_DIRTY); } diff --git
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, Mar 2, 2010 at 1:02 PM, Andrea Righi ari...@develer.com wrote: On Tue, Mar 02, 2010 at 12:11:10PM +0200, Kirill A. Shutemov wrote: On Mon, Mar 1, 2010 at 11:23 PM, Andrea Righi ari...@develer.com wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com --- fs/fuse/file.c | 5 +++ fs/nfs/write.c | 4 ++ fs/nilfs2/segment.c | 10 +- mm/filemap.c | 1 + mm/page-writeback.c | 84 -- mm/rmap.c | 4 +- mm/truncate.c | 2 + 7 files changed, 76 insertions(+), 34 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a9f5e13..dbbdd53 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -11,6 +11,7 @@ #include linux/pagemap.h #include linux/slab.h #include linux/kernel.h +#include linux/memcontrol.h #include linux/sched.h #include linux/module.h @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(req-writepages_entry); dec_bdi_stat(bdi, BDI_WRITEBACK); + mem_cgroup_update_stat(req-pages[0], + MEM_CGROUP_STAT_WRITEBACK_TEMP, -1); dec_zone_page_state(req-pages[0], NR_WRITEBACK_TEMP); bdi_writeout_inc(bdi); wake_up(fi-page_waitq); @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page) req-inode = inode; inc_bdi_stat(mapping-backing_dev_info, BDI_WRITEBACK); + mem_cgroup_update_stat(tmp_page, + MEM_CGROUP_STAT_WRITEBACK_TEMP, 1); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); end_page_writeback(page); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b753242..7316f7a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req) req-wb_index, NFS_PAGE_TAG_COMMIT); spin_unlock(inode-i_lock); + mem_cgroup_update_stat(req-wb_page, MEM_CGROUP_STAT_UNSTABLE_NFS, 1); inc_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -450,6 +451,7 @@ 
nfs_clear_request_commit(struct nfs_page *req) struct page *page = req-wb_page; if (test_and_clear_bit(PG_CLEAN, (req)-wb_flags)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(page, NR_UNSTABLE_NFS); dec_bdi_stat(page-mapping-backing_dev_info, BDI_UNSTABLE); return 1; @@ -1273,6 +1275,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) req = nfs_list_entry(head-next); nfs_list_remove_request(req); nfs_mark_request_commit(req); + mem_cgroup_update_stat(req-wb_page, + MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ada2f1b..aef6d13 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1660,8 +1660,11 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) } while (bh = bh-b_this_page, bh2 = bh2-b_this_page, bh != head); kunmap_atomic(kaddr, KM_USER0); - if (!TestSetPageWriteback(clone_page)) + if (!TestSetPageWriteback(clone_page)) { + mem_cgroup_update_stat(clone_page, s/clone_page/page/ mmh... shouldn't we use the same page used by TestSetPageWriteback() and inc_zone_page_state()? Sorry, I've commented wrong hunk. It's for the next one. And #include linux/memcontrol.h is missed. OK. I'll apply your fixes and post a new version. Thanks for reviewing, -Andrea ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
* Andrea Righi ari...@develer.com [2010-03-01 22:23:40]: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com --- fs/fuse/file.c |5 +++ fs/nfs/write.c |4 ++ fs/nilfs2/segment.c | 10 +- mm/filemap.c|1 + mm/page-writeback.c | 84 -- mm/rmap.c |4 +- mm/truncate.c |2 + 7 files changed, 76 insertions(+), 34 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a9f5e13..dbbdd53 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -11,6 +11,7 @@ #include linux/pagemap.h #include linux/slab.h #include linux/kernel.h +#include linux/memcontrol.h #include linux/sched.h #include linux/module.h @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(req-writepages_entry); dec_bdi_stat(bdi, BDI_WRITEBACK); + mem_cgroup_update_stat(req-pages[0], + MEM_CGROUP_STAT_WRITEBACK_TEMP, -1); dec_zone_page_state(req-pages[0], NR_WRITEBACK_TEMP); bdi_writeout_inc(bdi); wake_up(fi-page_waitq); @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page) req-inode = inode; inc_bdi_stat(mapping-backing_dev_info, BDI_WRITEBACK); + mem_cgroup_update_stat(tmp_page, + MEM_CGROUP_STAT_WRITEBACK_TEMP, 1); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); end_page_writeback(page); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b753242..7316f7a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c Don't need memcontrol.h to be included here? Looks OK to me overall, but there might be objection using the mem_cgroup_* naming convention, but I don't mind it very much :) -- Three Cheers, Balbir ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Mon, 2010-03-01 at 22:23 +0100, Andrea Righi wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com --- diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5a0f8f3..d83f41c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -137,13 +137,14 @@ static struct prop_descriptor vm_dirties; */ static int calc_period_shift(void) { - unsigned long dirty_total; + unsigned long dirty_total, dirty_bytes; - if (vm_dirty_bytes) - dirty_total = vm_dirty_bytes / PAGE_SIZE; + dirty_bytes = mem_cgroup_dirty_bytes(); + if (dirty_bytes) So you don't think 0 is a valid max dirty amount? + dirty_total = dirty_bytes / PAGE_SIZE; else - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / - 100; + dirty_total = (mem_cgroup_dirty_ratio() * + determine_dirtyable_memory()) / 100; return 2 + ilog2(dirty_total - 1); } @@ -408,14 +409,16 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) */ unsigned long determine_dirtyable_memory(void) { - unsigned long x; - - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); + unsigned long memory; + s64 memcg_memory; + memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); if (!vm_highmem_is_dirtyable) - x -= highmem_dirtyable_memory(x); - - return x + 1; /* Ensure that we never return 0 */ + memory -= highmem_dirtyable_memory(memory); + memcg_memory = mem_cgroup_page_stat(MEMCG_NR_DIRTYABLE_PAGES); + if (memcg_memory 0) And here you somehow return negative? 
+ return memory + 1; + return min((unsigned long)memcg_memory, memory + 1); } void @@ -423,26 +426,28 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, unsigned long *pbdi_dirty, struct backing_dev_info *bdi) { unsigned long background; - unsigned long dirty; + unsigned long dirty, dirty_bytes, dirty_background; unsigned long available_memory = determine_dirtyable_memory(); struct task_struct *tsk; - if (vm_dirty_bytes) - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + dirty_bytes = mem_cgroup_dirty_bytes(); + if (dirty_bytes) zero not valid again + dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE); else { int dirty_ratio; - dirty_ratio = vm_dirty_ratio; + dirty_ratio = mem_cgroup_dirty_ratio(); if (dirty_ratio 5) dirty_ratio = 5; dirty = (dirty_ratio * available_memory) / 100; } - if (dirty_background_bytes) - background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + dirty_background = mem_cgroup_dirty_background_bytes(); + if (dirty_background) idem + background = DIV_ROUND_UP(dirty_background, PAGE_SIZE); else - background = (dirty_background_ratio * available_memory) / 100; - + background = (mem_cgroup_dirty_background_ratio() * + available_memory) / 100; if (background = dirty) background = dirty / 2; tsk = current; @@ -508,9 +513,13 @@ static void balance_dirty_pages(struct address_space *mapping, get_dirty_limits(background_thresh, dirty_thresh, bdi_thresh, bdi); - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + + nr_reclaimable = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES); + nr_writeback = mem_cgroup_page_stat(MEMCG_NR_WRITEBACK); + if ((nr_reclaimable 0) || (nr_writeback 0)) { + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); ??? why would a page_state be negative.. I see you return -ENOMEM on ! cgroup, but how can one specify no dirty limit with this compiled in? 
- nr_writeback = global_page_state(NR_WRITEBACK); + nr_writeback = global_page_state(NR_WRITEBACK); + } bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); if (bdi_cap_account_unstable(bdi)) { @@ -611,10 +620,12 @@ static void balance_dirty_pages(struct address_space *mapping, * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ + nr_reclaimable = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES); + if (nr_reclaimable < 0) + nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
* KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com [2010-03-02 17:23:16]: On Tue, 2 Mar 2010 09:01:58 +0100 Andrea Righi ari...@develer.com wrote: On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote: On Mon, 1 Mar 2010 22:23:40 +0100 Andrea Righi ari...@develer.com wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com Seems nice. Hmm. the last problem is moving account between memcg. Right ? Correct. This was actually the last item of the TODO list. Anyway, I'm still considering if it's correct to move dirty pages when a task is migrated from a cgroup to another. Currently, dirty pages just remain in the original cgroup and are flushed depending on the original cgroup settings. That is not totally wrong... at least moving the dirty pages between memcgs should be optional (move_charge_at_immigrate?). My concern is - migration between memcg is already suppoted - at task move - at rmdir Then, if you leave DIRTY_PAGE accounting to original cgroup, the new cgroup (migration target)'s Dirty page accounting may goes to be negative, or incorrect value. Please check FILE_MAPPED implementation in __mem_cgroup_move_account() As if (page_mapped(page) !PageAnon(page)) { /* Update mapped_file data for mem_cgroup */ preempt_disable(); __this_cpu_dec(from-stat-count[MEM_CGROUP_STAT_FILE_MAPPED]); __this_cpu_inc(to-stat-count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } then, FILE_MAPPED never goes negative. Absolutely! I am not sure how complex dirty memory migration will be, but one way of working around it would be to disable migration of charges when the feature is enabled (dirty* is set in the memory cgroup). We might need additional logic to allow that to happen. 
-- Three Cheers, Balbir ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, Mar 2, 2010 at 3:47 PM, Balbir Singh bal...@linux.vnet.ibm.com wrote: * Andrea Righi ari...@develer.com [2010-03-01 22:23:40]: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com --- fs/fuse/file.c | 5 +++ fs/nfs/write.c | 4 ++ fs/nilfs2/segment.c | 10 +- mm/filemap.c | 1 + mm/page-writeback.c | 84 -- mm/rmap.c | 4 +- mm/truncate.c | 2 + 7 files changed, 76 insertions(+), 34 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a9f5e13..dbbdd53 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -11,6 +11,7 @@ #include linux/pagemap.h #include linux/slab.h #include linux/kernel.h +#include linux/memcontrol.h #include linux/sched.h #include linux/module.h @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(req-writepages_entry); dec_bdi_stat(bdi, BDI_WRITEBACK); + mem_cgroup_update_stat(req-pages[0], + MEM_CGROUP_STAT_WRITEBACK_TEMP, -1); dec_zone_page_state(req-pages[0], NR_WRITEBACK_TEMP); bdi_writeout_inc(bdi); wake_up(fi-page_waitq); @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page) req-inode = inode; inc_bdi_stat(mapping-backing_dev_info, BDI_WRITEBACK); + mem_cgroup_update_stat(tmp_page, + MEM_CGROUP_STAT_WRITEBACK_TEMP, 1); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); end_page_writeback(page); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b753242..7316f7a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c Don't need memcontrol.h to be included here? It's included in linux/swap.h Looks OK to me overall, but there might be objection using the mem_cgroup_* naming convention, but I don't mind it very much :) -- Three Cheers, Balbir -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majord...@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href="mailto:d...@kvack.org">em...@kvack.org</a> ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Mon, Mar 01, 2010 at 11:18:31PM +0100, Andrea Righi wrote: On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote: @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) */ dirty_thresh += dirty_thresh / 10; /* wh... */ -if (global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK) = dirty_thresh) - break; -congestion_wait(BLK_RW_ASYNC, HZ/10); + + dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); + if (dirty 0) + dirty = global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK); dirty is unsigned long. As mentioned last time, above will never be true? In general these patches look ok to me. I will do some testing with these. Re-introduced the same bug. My bad. :( The value returned from mem_cgroup_page_stat() can be negative, i.e. when memory cgroup is disabled. We could simply use a long for dirty, the unit is in # of pages so s64 should be enough. Or cast dirty to long only for the check (see below). Thanks! -Andrea Signed-off-by: Andrea Righi ari...@develer.com --- mm/page-writeback.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d83f41c..dbee976 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); - if (dirty 0) + if ((long)dirty 0) This will also be problematic as on 32bit systems, your uppper limit of dirty memory will be 2G? I guess, I will prefer one of the two. - return the error code from function and pass a pointer to store stats in as function argument. - Or Peter's suggestion of checking mem_cgroup_has_dirty_limit() and if per cgroup dirty control is enabled, then use per cgroup stats. In that case you don't have to return negative values. Only tricky part will be careful accouting so that none of the stats go negative in corner cases of migration etc. 
Thanks Vivek dirty = global_page_state(NR_UNSTABLE_NFS) + global_page_state(NR_WRITEBACK); if (dirty >= dirty_thresh) ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
* Peter Zijlstra pet...@infradead.org [2010-03-02 14:48:56]: This is ugly and broken.. I thought you'd agreed to something like: if (mem_cgroup_has_dirty_limit(cgroup)) use mem_cgroup numbers else use global numbers That allows for a 0 dirty limit (which should work and basically makes all io synchronous). Also, I'd put each of those in a separate function, like: unsigned long reclaimable_pages(cgroup) { if (mem_cgroup_has_dirty_limit(cgroup)) return mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES); return global_page_state(NR_FILE_DIRTY) + global_page_state(NR_NFS_UNSTABLE); } I agree, I should have been more specific about the naming convention, this is what I meant - along these lines as we do with zone_nr_lru_pages(), etc. Which raises another question, you should probably rebase on top of Trond's patches, which removes BDI_RECLAIMABLE, suggesting you also loose MEMCG_NR_RECLAIM_PAGES in favour of the DIRTY+UNSTABLE split. -- Three Cheers, Balbir ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, 2010-03-02 at 14:48 +0100, Peter Zijlstra wrote: unsigned long reclaimable_pages(cgroup) { if (mem_cgroup_has_dirty_limit(cgroup)) return mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES); return global_page_state(NR_FILE_DIRTY) + global_page_state(NR_NFS_UNSTABLE); } Which raises another question, you should probably rebase on top of Trond's patches, which removes BDI_RECLAIMABLE, suggesting you also loose MEMCG_NR_RECLAIM_PAGES in favour of the DIRTY+UNSTABLE split. I'm dropping those patches for now. The main writeback change wasn't too favourably received by the linux-mm community so I've implemented an alternative that only changes the NFS layer, and doesn't depend on the DIRTY+UNSTABLE split. Cheers Trond ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, 2 Mar 2010 23:18:23 +0100, Andrea Righi ari...@develer.com wrote: On Tue, Mar 02, 2010 at 07:20:26PM +0530, Balbir Singh wrote: * KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com [2010-03-02 17:23:16]: On Tue, 2 Mar 2010 09:01:58 +0100 Andrea Righi ari...@develer.com wrote: On Tue, Mar 02, 2010 at 09:23:09AM +0900, KAMEZAWA Hiroyuki wrote: On Mon, 1 Mar 2010 22:23:40 +0100 Andrea Righi ari...@develer.com wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com Seems nice. Hmm. the last problem is moving account between memcg. Right ? Correct. This was actually the last item of the TODO list. Anyway, I'm still considering if it's correct to move dirty pages when a task is migrated from a cgroup to another. Currently, dirty pages just remain in the original cgroup and are flushed depending on the original cgroup settings. That is not totally wrong... at least moving the dirty pages between memcgs should be optional (move_charge_at_immigrate?). My concern is - migration between memcg is already suppoted - at task move - at rmdir Then, if you leave DIRTY_PAGE accounting to original cgroup, the new cgroup (migration target)'s Dirty page accounting may goes to be negative, or incorrect value. Please check FILE_MAPPED implementation in __mem_cgroup_move_account() As if (page_mapped(page) !PageAnon(page)) { /* Update mapped_file data for mem_cgroup */ preempt_disable(); __this_cpu_dec(from-stat-count[MEM_CGROUP_STAT_FILE_MAPPED]); __this_cpu_inc(to-stat-count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } then, FILE_MAPPED never goes negative. Absolutely! I am not sure how complex dirty memory migration will be, but one way of working around it would be to disable migration of charges when the feature is enabled (dirty* is set in the memory cgroup). We might need additional logic to allow that to happen. I've started to look at dirty memory migration. 
First attempt is to add DIRTY, WRITEBACK, etc. to page_cgroup flags and handle them in __mem_cgroup_move_account(). Probably I'll have something ready for the next version of the patch. I still need to figure if this can work as expected... I agree it's a right direction(in fact, I have been planning to post a patch in that direction), so I leave it to you. Can you add PCG_FILE_MAPPED flag too ? I think this flag can be handled in the same way as other flags you're trying to add, and we can change if (page_mapped(page) && !PageAnon(page)) to if (PageCgroupFileMapped(pc)) in __mem_cgroup_move_account(). It would be cleaner than current code, IMHO. Thanks, Daisuke Nishimura. ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Tue, Mar 02, 2010 at 11:22:48PM +0100, Andrea Righi wrote: On Tue, Mar 02, 2010 at 10:05:29AM -0500, Vivek Goyal wrote: On Mon, Mar 01, 2010 at 11:18:31PM +0100, Andrea Righi wrote: On Mon, Mar 01, 2010 at 05:02:08PM -0500, Vivek Goyal wrote: @@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) */ dirty_thresh += dirty_thresh / 10; /* wh... */ -if (global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK) = dirty_thresh) - break; -congestion_wait(BLK_RW_ASYNC, HZ/10); + + dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); + if (dirty 0) + dirty = global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK); dirty is unsigned long. As mentioned last time, above will never be true? In general these patches look ok to me. I will do some testing with these. Re-introduced the same bug. My bad. :( The value returned from mem_cgroup_page_stat() can be negative, i.e. when memory cgroup is disabled. We could simply use a long for dirty, the unit is in # of pages so s64 should be enough. Or cast dirty to long only for the check (see below). Thanks! -Andrea Signed-off-by: Andrea Righi ari...@develer.com --- mm/page-writeback.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d83f41c..dbee976 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -701,7 +701,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES); - if (dirty 0) + if ((long)dirty 0) This will also be problematic as on 32bit systems, your uppper limit of dirty memory will be 2G? I guess, I will prefer one of the two. - return the error code from function and pass a pointer to store stats in as function argument. - Or Peter's suggestion of checking mem_cgroup_has_dirty_limit() and if per cgroup dirty control is enabled, then use per cgroup stats. In that case you don't have to return negative values. 
Only tricky part will be careful accouting so that none of the stats go negative in corner cases of migration etc. What do you think about Peter's suggestion + the locking stuff? (see the previous email). Otherwise, I'll choose the other solution, passing a pointer and always return the error code is not bad. Ok, so you are worried about that by the we finish mem_cgroup_has_dirty_limit() call, task might change cgroup and later we might call mem_cgroup_get_page_stat() on a different cgroup altogether which might or might not have dirty limits specified? But in what cases you don't want to use memory cgroup specified limit? I thought cgroup disabled what the only case where we need to use global limits. Otherwise a memory cgroup will have either dirty_bytes specified or by default inherit global dirty_ratio which is a valid number. If that's the case then you don't have to take rcu_lock() outside get_page_stat()? IOW, apart from cgroup being disabled, what are the other cases where you expect to not use cgroup's page stat and use global stats? Thanks Vivek Thanks, -Andrea ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
diff --git a/mm/filemap.c b/mm/filemap.c index fe09e51..f85acae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page) * having removed the page entirely. */ if (PageDirty(page) mapping_cap_account_dirty(mapping)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1); dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping-backing_dev_info, BDI_DIRTY); } (snip) @@ -1096,6 +1113,7 @@ int __set_page_dirty_no_writeback(struct page *page) void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, 1); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping-backing_dev_info, BDI_DIRTY); task_dirty_inc(current); As long as I can see, those two functions(at least) calls mem_cgroup_update_state(), which acquires page cgroup lock, under mapping-tree_lock. But as I fixed before in commit e767e056, page cgroup lock must not acquired under mapping-tree_lock. hmm, we should call those mem_cgroup_update_state() outside mapping-tree_lock, or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid dead-lock. Thanks, Daisuke Nishimura. On Mon, 1 Mar 2010 22:23:40 +0100, Andrea Righi ari...@develer.com wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. 
Signed-off-by: Andrea Righi ari...@develer.com --- fs/fuse/file.c |5 +++ fs/nfs/write.c |4 ++ fs/nilfs2/segment.c | 10 +- mm/filemap.c|1 + mm/page-writeback.c | 84 -- mm/rmap.c |4 +- mm/truncate.c |2 + 7 files changed, 76 insertions(+), 34 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a9f5e13..dbbdd53 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -11,6 +11,7 @@ #include linux/pagemap.h #include linux/slab.h #include linux/kernel.h +#include linux/memcontrol.h #include linux/sched.h #include linux/module.h @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(req-writepages_entry); dec_bdi_stat(bdi, BDI_WRITEBACK); + mem_cgroup_update_stat(req-pages[0], + MEM_CGROUP_STAT_WRITEBACK_TEMP, -1); dec_zone_page_state(req-pages[0], NR_WRITEBACK_TEMP); bdi_writeout_inc(bdi); wake_up(fi-page_waitq); @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page) req-inode = inode; inc_bdi_stat(mapping-backing_dev_info, BDI_WRITEBACK); + mem_cgroup_update_stat(tmp_page, + MEM_CGROUP_STAT_WRITEBACK_TEMP, 1); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); end_page_writeback(page); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b753242..7316f7a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req) req-wb_index, NFS_PAGE_TAG_COMMIT); spin_unlock(inode-i_lock); + mem_cgroup_update_stat(req-wb_page, MEM_CGROUP_STAT_UNSTABLE_NFS, 1); inc_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req) struct page *page = req-wb_page; if (test_and_clear_bit(PG_CLEAN, (req)-wb_flags)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(page, NR_UNSTABLE_NFS); dec_bdi_stat(page-mapping-backing_dev_info, BDI_UNSTABLE); return 1; @@ -1273,6 
+1275,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) req = nfs_list_entry(head-next); nfs_list_remove_request(req); nfs_mark_request_commit(req); + mem_cgroup_update_stat(req-wb_page, + MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ada2f1b..aef6d13 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1660,8 +1660,11 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) } while (bh = bh-b_this_page, bh2 = bh2-b_this_page, bh != head); kunmap_atomic(kaddr, KM_USER0); - if (!TestSetPageWriteback(clone_page)) + if (!TestSetPageWriteback(clone_page)) {
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Wed, 3 Mar 2010 11:12:38 +0900 Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote: diff --git a/mm/filemap.c b/mm/filemap.c index fe09e51..f85acae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page) * having removed the page entirely. */ if (PageDirty(page) mapping_cap_account_dirty(mapping)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1); dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping-backing_dev_info, BDI_DIRTY); } (snip) @@ -1096,6 +1113,7 @@ int __set_page_dirty_no_writeback(struct page *page) void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, 1); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping-backing_dev_info, BDI_DIRTY); task_dirty_inc(current); As long as I can see, those two functions(at least) calls mem_cgroup_update_state(), which acquires page cgroup lock, under mapping-tree_lock. But as I fixed before in commit e767e056, page cgroup lock must not acquired under mapping-tree_lock. hmm, we should call those mem_cgroup_update_state() outside mapping-tree_lock, or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid dead-lock. Ah, good catch! But hmm... This account_page_dirtted() seems to be called under IRQ-disabled. About __remove_from_page_cache(), I think page_cgroup should have its own DIRTY flag, then, mem_cgroup_uncharge_page() can handle it automatically. But. there are no guarantee that following never happens. lock_page_cgroup() === interrupt. - mapping-tree_lock() Even if mapping-tree_lock is held with IRQ-disabled. Then, if we add local_irq_save(), we have to add it to all lock_page_cgroup(). Then, hm...some kind of new trick ? as.. (Follwoing patch is not tested!!) 
== --- include/linux/page_cgroup.h | 14 ++ mm/memcontrol.c | 27 +-- 2 files changed, 31 insertions(+), 10 deletions(-) Index: mmotm-2.6.33-Feb11/include/linux/page_cgroup.h === --- mmotm-2.6.33-Feb11.orig/include/linux/page_cgroup.h +++ mmotm-2.6.33-Feb11/include/linux/page_cgroup.h @@ -39,6 +39,7 @@ enum { PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. */ PCG_ACCT_LRU, /* page has been accounted for */ + PCG_MIGRATE, /* page cgroup is under memcg account migration */ }; #define TESTPCGFLAG(uname, lname) \ @@ -73,6 +74,8 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU) TESTPCGFLAG(AcctLRU, ACCT_LRU) TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) +TESTPCGFLAG(Migrate, MIGRATE) + static inline int page_cgroup_nid(struct page_cgroup *pc) { return page_to_nid(pc-page); @@ -93,6 +96,17 @@ static inline void unlock_page_cgroup(st bit_spin_unlock(PCG_LOCK, pc-flags); } +static inline unsigned long page_cgroup_migration_lock(struct page_cgroup *pc) +{ + local_irq_save(flags); + bit_spin_lock(PCG_MIGRATE, pc-flags); +} +static inline void +page_cgroup_migration_lock(struct page_cgroup *pc, unsigned long flags) +{ + bit_spin_lock(PCG_MIGRATE, pc-flags); + local_irq_restore(flags); +} #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; Index: mmotm-2.6.33-Feb11/mm/memcontrol.c === --- mmotm-2.6.33-Feb11.orig/mm/memcontrol.c +++ mmotm-2.6.33-Feb11/mm/memcontrol.c @@ -1321,7 +1321,7 @@ bool mem_cgroup_handle_oom(struct mem_cg * Currently used to update mapped file statistics, but the routine can be * generalized to update other statistics as well. */ -void mem_cgroup_update_file_mapped(struct page *page, int val) +void mem_cgroup_update_file_mapped(struct page *page, int val, int locked) { struct mem_cgroup *mem; struct page_cgroup *pc; @@ -1329,22 +1329,27 @@ void mem_cgroup_update_file_mapped(struc pc = lookup_page_cgroup(page); if (unlikely(!pc)) return; - - lock_page_cgroup(pc); + /* +* if locked==1, mapping-tree_lock is held. 
We don't have to take +* care of charge/uncharge. just think about migration. +*/ + if (!locked) + lock_page_cgroup(pc); + else + page_cgroup_migration_lock(pc); mem = pc-mem_cgroup; - if (!mem) + if (!mem || !PageCgroupUsed(pc)) goto done; - - if (!PageCgroupUsed(pc)) - goto done; - /* * Preemption is already disabled. We can use __this_cpu_xxx */ __this_cpu_add(mem-stat-count[MEM_CGROUP_STAT_FILE_MAPPED], val); done: -
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Wed, 3 Mar 2010 12:29:06 +0900, KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com wrote: On Wed, 3 Mar 2010 11:12:38 +0900 Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote: diff --git a/mm/filemap.c b/mm/filemap.c index fe09e51..f85acae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page) * having removed the page entirely. */ if (PageDirty(page) mapping_cap_account_dirty(mapping)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1); dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping-backing_dev_info, BDI_DIRTY); } (snip) @@ -1096,6 +1113,7 @@ int __set_page_dirty_no_writeback(struct page *page) void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, 1); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping-backing_dev_info, BDI_DIRTY); task_dirty_inc(current); As long as I can see, those two functions(at least) calls mem_cgroup_update_state(), which acquires page cgroup lock, under mapping-tree_lock. But as I fixed before in commit e767e056, page cgroup lock must not acquired under mapping-tree_lock. hmm, we should call those mem_cgroup_update_state() outside mapping-tree_lock, or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid dead-lock. Ah, good catch! But hmm... This account_page_dirtted() seems to be called under IRQ-disabled. About __remove_from_page_cache(), I think page_cgroup should have its own DIRTY flag, then, mem_cgroup_uncharge_page() can handle it automatically. But. there are no guarantee that following never happens. lock_page_cgroup() === interrupt. - mapping-tree_lock() Even if mapping-tree_lock is held with IRQ-disabled. Then, if we add local_irq_save(), we have to add it to all lock_page_cgroup(). Then, hm...some kind of new trick ? as.. (Follwoing patch is not tested!!) 
If we can verify that all callers of mem_cgroup_update_stat() have always either aquired or not aquired tree_lock, this direction will work fine. But if we can't, we have to add local_irq_save() to lock_page_cgroup() like below. === include/linux/page_cgroup.h |8 ++-- mm/memcontrol.c | 43 +-- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 30b0813..51da916 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -83,15 +83,19 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) return page_zonenum(pc-page); } -static inline void lock_page_cgroup(struct page_cgroup *pc) +static inline void __lock_page_cgroup(struct page_cgroup *pc) { bit_spin_lock(PCG_LOCK, pc-flags); } +#define lock_page_cgroup(pc, flags) \ + do { local_irq_save(flags); __lock_page_cgroup(pc); } while (0) -static inline void unlock_page_cgroup(struct page_cgroup *pc) +static inline void __unlock_page_cgroup(struct page_cgroup *pc) { bit_spin_unlock(PCG_LOCK, pc-flags); } +#define unlock_page_cgroup(pc, flags) \ + do { __unlock_page_cgroup(pc); local_irq_restore(flags); } while (0) #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00ed4b1..40b9be4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1327,12 +1327,13 @@ void mem_cgroup_update_file_mapped(struct page *page, int val) { struct mem_cgroup *mem; struct page_cgroup *pc; + unsigned long flags; pc = lookup_page_cgroup(page); if (unlikely(!pc)) return; - lock_page_cgroup(pc); + lock_page_cgroup(pc, flags); mem = pc-mem_cgroup; if (!mem) goto done; @@ -1346,7 +1347,7 @@ void mem_cgroup_update_file_mapped(struct page *page, int val) __this_cpu_add(mem-stat-count[MEM_CGROUP_STAT_FILE_MAPPED], val); done: - unlock_page_cgroup(pc); + unlock_page_cgroup(pc, flags); } /* @@ -1680,11 +1681,12 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) struct 
page_cgroup *pc; unsigned short id; swp_entry_t ent; + unsigned long flags; VM_BUG_ON(!PageLocked(page)); pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); + lock_page_cgroup(pc, flags); if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; if (mem && !css_tryget(mem->css)) @@ -1698,7 +1700,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) mem = NULL; rcu_read_unlock(); } -
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Wed, 3 Mar 2010 15:01:37 +0900 Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote: On Wed, 3 Mar 2010 12:29:06 +0900, KAMEZAWA Hiroyuki kamezawa.hir...@jp.fujitsu.com wrote: On Wed, 3 Mar 2010 11:12:38 +0900 Daisuke Nishimura nishim...@mxp.nes.nec.co.jp wrote: diff --git a/mm/filemap.c b/mm/filemap.c index fe09e51..f85acae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page) * having removed the page entirely. */ if (PageDirty(page) mapping_cap_account_dirty(mapping)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1); dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping-backing_dev_info, BDI_DIRTY); } (snip) @@ -1096,6 +1113,7 @@ int __set_page_dirty_no_writeback(struct page *page) void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, 1); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping-backing_dev_info, BDI_DIRTY); task_dirty_inc(current); As long as I can see, those two functions(at least) calls mem_cgroup_update_state(), which acquires page cgroup lock, under mapping-tree_lock. But as I fixed before in commit e767e056, page cgroup lock must not acquired under mapping-tree_lock. hmm, we should call those mem_cgroup_update_state() outside mapping-tree_lock, or add local_irq_save/restore() around lock/unlock_page_cgroup() to avoid dead-lock. Ah, good catch! But hmm... This account_page_dirtted() seems to be called under IRQ-disabled. About __remove_from_page_cache(), I think page_cgroup should have its own DIRTY flag, then, mem_cgroup_uncharge_page() can handle it automatically. But. there are no guarantee that following never happens. lock_page_cgroup() === interrupt. - mapping-tree_lock() Even if mapping-tree_lock is held with IRQ-disabled. Then, if we add local_irq_save(), we have to add it to all lock_page_cgroup(). 
Then, hm...some kind of new trick? as.. (Following patch is not tested!!) If we can verify that all callers of mem_cgroup_update_stat() have always either acquired or not acquired tree_lock, this direction will work fine. But if we can't, we have to add local_irq_save() to lock_page_cgroup() like below. Agreed. Let's try to see how we can write the code in a clean way. (we have time ;) For now, to me, IRQ disabling while lock_page_cgroup() seems to be a little overkill. What I really want is lockless code...but it seems impossible under the current implementation. I wonder if the fact that the page is never uncharged under us can give us some chances ...Hmm. Thanks, -Kame ___ Containers mailing list contain...@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/containers ___ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Mon, Mar 01, 2010 at 10:23:40PM +0100, Andrea Righi wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com --- fs/fuse/file.c |5 +++ fs/nfs/write.c |4 ++ fs/nilfs2/segment.c | 10 +- mm/filemap.c|1 + mm/page-writeback.c | 84 -- mm/rmap.c |4 +- mm/truncate.c |2 + 7 files changed, 76 insertions(+), 34 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a9f5e13..dbbdd53 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -11,6 +11,7 @@ #include linux/pagemap.h #include linux/slab.h #include linux/kernel.h +#include linux/memcontrol.h #include linux/sched.h #include linux/module.h @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(req-writepages_entry); dec_bdi_stat(bdi, BDI_WRITEBACK); + mem_cgroup_update_stat(req-pages[0], + MEM_CGROUP_STAT_WRITEBACK_TEMP, -1); dec_zone_page_state(req-pages[0], NR_WRITEBACK_TEMP); bdi_writeout_inc(bdi); wake_up(fi-page_waitq); @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page) req-inode = inode; inc_bdi_stat(mapping-backing_dev_info, BDI_WRITEBACK); + mem_cgroup_update_stat(tmp_page, + MEM_CGROUP_STAT_WRITEBACK_TEMP, 1); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); end_page_writeback(page); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b753242..7316f7a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req) req-wb_index, NFS_PAGE_TAG_COMMIT); spin_unlock(inode-i_lock); + mem_cgroup_update_stat(req-wb_page, MEM_CGROUP_STAT_UNSTABLE_NFS, 1); inc_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req) struct page *page = req-wb_page; if (test_and_clear_bit(PG_CLEAN, (req)-wb_flags)) { + 
mem_cgroup_update_stat(page, MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(page, NR_UNSTABLE_NFS); dec_bdi_stat(page-mapping-backing_dev_info, BDI_UNSTABLE); return 1; @@ -1273,6 +1275,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) req = nfs_list_entry(head-next); nfs_list_remove_request(req); nfs_mark_request_commit(req); + mem_cgroup_update_stat(req-wb_page, + MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ada2f1b..aef6d13 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1660,8 +1660,11 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) } while (bh = bh-b_this_page, bh2 = bh2-b_this_page, bh != head); kunmap_atomic(kaddr, KM_USER0); - if (!TestSetPageWriteback(clone_page)) + if (!TestSetPageWriteback(clone_page)) { + mem_cgroup_update_stat(clone_page, + MEM_CGROUP_STAT_WRITEBACK, 1); inc_zone_page_state(clone_page, NR_WRITEBACK); + } unlock_page(clone_page); return 0; @@ -1783,8 +1786,11 @@ static void __nilfs_end_page_io(struct page *page, int err) } if (buffer_nilfs_allocated(page_buffers(page))) { - if (TestClearPageWriteback(page)) + if (TestClearPageWriteback(page)) { + mem_cgroup_update_stat(clone_page, + MEM_CGROUP_STAT_WRITEBACK, -1); dec_zone_page_state(page, NR_WRITEBACK); + } } else end_page_writeback(page); } diff --git a/mm/filemap.c b/mm/filemap.c index fe09e51..f85acae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page) * having removed the page entirely. 
*/ if (PageDirty(page) mapping_cap_account_dirty(mapping)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1); dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping-backing_dev_info, BDI_DIRTY); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5a0f8f3..d83f41c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -137,13
[Devel] Re: [PATCH -mmotm 3/3] memcg: dirty pages instrumentation
On Mon, 1 Mar 2010 22:23:40 +0100 Andrea Righi ari...@develer.com wrote: Apply the cgroup dirty pages accounting and limiting infrastructure to the opportune kernel functions. Signed-off-by: Andrea Righi ari...@develer.com Seems nice. Hmm. the last problem is moving account between memcg. Right ? Thanks, -Kame --- fs/fuse/file.c |5 +++ fs/nfs/write.c |4 ++ fs/nilfs2/segment.c | 10 +- mm/filemap.c|1 + mm/page-writeback.c | 84 -- mm/rmap.c |4 +- mm/truncate.c |2 + 7 files changed, 76 insertions(+), 34 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a9f5e13..dbbdd53 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -11,6 +11,7 @@ #include linux/pagemap.h #include linux/slab.h #include linux/kernel.h +#include linux/memcontrol.h #include linux/sched.h #include linux/module.h @@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(req-writepages_entry); dec_bdi_stat(bdi, BDI_WRITEBACK); + mem_cgroup_update_stat(req-pages[0], + MEM_CGROUP_STAT_WRITEBACK_TEMP, -1); dec_zone_page_state(req-pages[0], NR_WRITEBACK_TEMP); bdi_writeout_inc(bdi); wake_up(fi-page_waitq); @@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page) req-inode = inode; inc_bdi_stat(mapping-backing_dev_info, BDI_WRITEBACK); + mem_cgroup_update_stat(tmp_page, + MEM_CGROUP_STAT_WRITEBACK_TEMP, 1); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); end_page_writeback(page); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b753242..7316f7a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req) req-wb_index, NFS_PAGE_TAG_COMMIT); spin_unlock(inode-i_lock); + mem_cgroup_update_stat(req-wb_page, MEM_CGROUP_STAT_UNSTABLE_NFS, 1); inc_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req) struct 
page *page = req-wb_page; if (test_and_clear_bit(PG_CLEAN, (req)-wb_flags)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(page, NR_UNSTABLE_NFS); dec_bdi_stat(page-mapping-backing_dev_info, BDI_UNSTABLE); return 1; @@ -1273,6 +1275,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) req = nfs_list_entry(head-next); nfs_list_remove_request(req); nfs_mark_request_commit(req); + mem_cgroup_update_stat(req-wb_page, + MEM_CGROUP_STAT_UNSTABLE_NFS, -1); dec_zone_page_state(req-wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req-wb_page-mapping-backing_dev_info, BDI_UNSTABLE); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ada2f1b..aef6d13 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1660,8 +1660,11 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) } while (bh = bh-b_this_page, bh2 = bh2-b_this_page, bh != head); kunmap_atomic(kaddr, KM_USER0); - if (!TestSetPageWriteback(clone_page)) + if (!TestSetPageWriteback(clone_page)) { + mem_cgroup_update_stat(clone_page, + MEM_CGROUP_STAT_WRITEBACK, 1); inc_zone_page_state(clone_page, NR_WRITEBACK); + } unlock_page(clone_page); return 0; @@ -1783,8 +1786,11 @@ static void __nilfs_end_page_io(struct page *page, int err) } if (buffer_nilfs_allocated(page_buffers(page))) { - if (TestClearPageWriteback(page)) + if (TestClearPageWriteback(page)) { + mem_cgroup_update_stat(clone_page, + MEM_CGROUP_STAT_WRITEBACK, -1); dec_zone_page_state(page, NR_WRITEBACK); + } } else end_page_writeback(page); } diff --git a/mm/filemap.c b/mm/filemap.c index fe09e51..f85acae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page) * having removed the page entirely. 
*/ if (PageDirty(page) mapping_cap_account_dirty(mapping)) { + mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1); dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping-backing_dev_info, BDI_DIRTY); } diff --git a/mm/page-writeback.c