Account dirty file pages and writeback pages per memory cgroup, so that the memory cgroup's dirty_ratio can be used to evaluate per-cgroup dirty limits.
Moreover, the following entries are added to the file memory.stat of each cgroup to export these statistics to userspace: - filedirty (number of dirty file pages) - writeback (number of pages under writeback) [ Note: currently only non-anonymous pages are accounted in writeback pages; for swapped-out pages with swap_writepage() it's not possible to retrieve the memory cgroup they belong to, so there would be a leak in writeback statistics if we also try to account these pages without changing too much stuff. ] Signed-off-by: Andrea Righi <[EMAIL PROTECTED]> --- fs/buffer.c | 2 + fs/nfs/write.c | 4 + fs/nilfs2/page.h | 9 ++- fs/reiser4/as_ops.c | 5 +- fs/reiser4/page_cache.c | 5 +- include/linux/memcontrol.h | 76 ++++++++++++++++++ mm/filemap.c | 2 + mm/memcontrol.c | 187 ++++++++++++++++++++++++++++++++++++++++---- mm/page-writeback.c | 13 +++- mm/truncate.c | 2 + 10 files changed, 284 insertions(+), 21 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 8274f5e..fc45593 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -31,6 +31,7 @@ #include <linux/highmem.h> #include <linux/module.h> #include <linux/writeback.h> +#include <linux/memcontrol.h> #include <linux/hash.h> #include <linux/suspend.h> #include <linux/buffer_head.h> @@ -718,6 +719,7 @@ static int __set_page_dirty(struct page *page, WARN_ON_ONCE(warn && !PageUptodate(page)); if (mapping_cap_account_dirty(mapping)) { + mem_cgroup_charge_file_dirty(page, 1); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3229e21..cd95d3f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -12,6 +12,7 @@ #include <linux/pagemap.h> #include <linux/file.h> #include <linux/writeback.h> +#include <linux/memcontrol.h> #include <linux/swap.h> #include <linux/sunrpc/clnt.h> @@ -410,6 +411,7 @@ nfs_mark_request_commit(struct nfs_page *req) req->wb_index, NFS_PAGE_TAG_COMMIT); spin_unlock(&inode->i_lock); + 
mem_cgroup_charge_file_dirty(req->wb_page, 1); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -421,6 +423,7 @@ nfs_clear_request_commit(struct nfs_page *req) struct page *page = req->wb_page; if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { + mem_cgroup_charge_file_dirty(page, -1); dec_zone_page_state(page, NR_UNSTABLE_NFS); dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); return 1; @@ -1263,6 +1266,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_mark_request_commit(req); + mem_cgroup_charge_file_dirty(page, -1); dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index b77c91c..a88f84c 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -24,6 +24,7 @@ #ifndef _NILFS_PAGE_H #define _NILFS_PAGE_H +#include <linux/memcontrol.h> #include "nilfs.h" extern struct buffer_head *nilfs_get_page_block(struct page *, unsigned long, @@ -68,8 +69,10 @@ nilfs_page_get_nth_block(struct page *page, unsigned int count) static inline void nilfs_set_page_writeback(struct page *page) { if (buffer_nilfs_allocated(page_buffers(page))) { - if (!TestSetPageWriteback(page)) + if (!TestSetPageWriteback(page)) { + mem_cgroup_charge_writeback(page, 1); inc_zone_page_state(page, NR_WRITEBACK); + } } else set_page_writeback(page); } @@ -77,8 +80,10 @@ static inline void nilfs_set_page_writeback(struct page *page) static inline void nilfs_end_page_writeback(struct page *page) { if (buffer_nilfs_allocated(page_buffers(page))) { - if (TestClearPageWriteback(page)) + if (TestClearPageWriteback(page)) { + mem_cgroup_charge_writeback(page, -1); dec_zone_page_state(page, NR_WRITEBACK); + } } else end_page_writeback(page); } diff --git 
a/fs/reiser4/as_ops.c b/fs/reiser4/as_ops.c index decb9eb..1e144ec 100644 --- a/fs/reiser4/as_ops.c +++ b/fs/reiser4/as_ops.c @@ -40,6 +40,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/writeback.h> +#include <linux/memcontrol.h> #include <linux/backing-dev.h> #include <linux/quotaops.h> #include <linux/security.h> @@ -82,9 +83,11 @@ int reiser4_set_page_dirty(struct page *page) /* check for race with truncate */ if (page->mapping) { assert("vs-1652", page->mapping == mapping); - if (mapping_cap_account_dirty(mapping)) + if (mapping_cap_account_dirty(mapping)) { + mem_cgroup_charge_file_dirty(page, 1); inc_zone_page_state(page, NR_FILE_DIRTY); + } radix_tree_tag_set(&mapping->page_tree, page->index, PAGECACHE_TAG_REISER4_MOVED); diff --git a/fs/reiser4/page_cache.c b/fs/reiser4/page_cache.c index 654e7ae..7dadb9b 100644 --- a/fs/reiser4/page_cache.c +++ b/fs/reiser4/page_cache.c @@ -202,6 +202,7 @@ #include <linux/pagemap.h> #include <linux/bio.h> #include <linux/writeback.h> +#include <linux/memcontrol.h> #include <linux/blkdev.h> static struct bio *page_bio(struct page *, jnode * , int rw, gfp_t gfp); @@ -467,8 +468,10 @@ int reiser4_set_page_dirty_internal(struct page *page) BUG_ON(mapping == NULL); if (!TestSetPageDirty(page)) { - if (mapping_cap_account_dirty(mapping)) + if (mapping_cap_account_dirty(mapping)) { + mem_cgroup_charge_file_dirty(page, 1); inc_zone_page_state(page, NR_FILE_DIRTY); + } __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ee1b2fc..c3a1f19 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -20,15 +20,48 @@ #ifndef _LINUX_MEMCONTROL_H #define _LINUX_MEMCONTROL_H +#include <linux/cgroup.h> + struct mem_cgroup; struct page_cgroup; struct page; struct mm_struct; +extern int vm_dirty_ratio; + +/* + * Statistics for memory cgroup. 
+ */ +enum mem_cgroup_stat_index { + /* + * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. + */ + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ + MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ + MEM_CGROUP_STAT_FILE_DIRTY, /* # of dirty pages in page cache */ + MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ + MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ + MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ + + MEM_CGROUP_STAT_NSTATS, +}; #ifdef CONFIG_CGROUP_MEM_RES_CTLR #define page_reset_bad_cgroup(page) ((page)->page_cgroup = 0) +struct mem_cgroup *get_current_mem_cgroup(void); +extern void put_mem_cgroup(struct mem_cgroup *mem); + +extern unsigned long mem_cgroup_global_lru_pages(struct mem_cgroup *mem); +extern unsigned long mem_cgroup_get_free_pages(struct mem_cgroup *mem); +extern long mem_cgroup_dirty_ratio(struct mem_cgroup *mem); + +extern void mem_cgroup_charge_file_dirty(struct page *page, int charge); +extern s64 mem_cgroup_nr_file_dirty(struct mem_cgroup *mem); + +extern void mem_cgroup_charge_writeback(struct page *page, int charge); +extern s64 mem_cgroup_nr_writeback(struct mem_cgroup *mem); + extern struct page_cgroup *page_get_page_cgroup(struct page *page); extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); @@ -132,6 +165,49 @@ static inline void mem_cgroup_end_migration(struct page *page) { } +static inline struct mem_cgroup *get_current_mem_cgroup(void) +{ + return NULL; +} + +static inline void put_mem_cgroup(struct mem_cgroup *mem) +{ +} + +static inline unsigned long mem_cgroup_global_lru_pages(struct mem_cgroup *mem) +{ + return 0; +} + +static inline unsigned long mem_cgroup_get_free_pages(struct mem_cgroup *mem) +{ + return 0; +} + +static inline long mem_cgroup_dirty_ratio(struct mem_cgroup *mem) +{ + return vm_dirty_ratio; +} + +static inline void mem_cgroup_charge_file_dirty(struct page *page, int charge) +{ +} + +static inline void 
mem_cgroup_charge_writeback(struct page *page, int charge) +{ +} + +static inline s64 mem_cgroup_nr_file_dirty(struct mem_cgroup *mem) +{ + return global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); +} + +static inline s64 mem_cgroup_nr_writeback(struct mem_cgroup *mem) +{ + return global_page_state(NR_WRITEBACK); +} + static inline int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) { return 0; diff --git a/mm/filemap.c b/mm/filemap.c index 0df6e1f..54f8689 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -25,6 +25,7 @@ #include <linux/uio.h> #include <linux/hash.h> #include <linux/writeback.h> +#include <linux/memcontrol.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/blkdev.h> @@ -131,6 +132,7 @@ void __remove_from_page_cache(struct page *page) * having removed the page entirely. */ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { + mem_cgroup_charge_file_dirty(page, -1); dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2979d22..6de911e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -40,21 +40,6 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; static struct kmem_cache *page_cgroup_cache __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 -/* - * Statistics for memory cgroup. - */ -enum mem_cgroup_stat_index { - /* - * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 
- */ - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ - MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ - MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ - MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ - - MEM_CGROUP_STAT_NSTATS, -}; - struct mem_cgroup_stat_cpu { s64 count[MEM_CGROUP_STAT_NSTATS]; } ____cacheline_aligned_in_smp; @@ -73,6 +58,14 @@ static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, stat->cpustat[cpu].count[idx] += val; } +static void __mem_cgroup_stat_add(struct mem_cgroup_stat *stat, + enum mem_cgroup_stat_index idx, int val) +{ + int cpu = get_cpu(); + stat->cpustat[cpu].count[idx] += val; + put_cpu(); +} + static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, enum mem_cgroup_stat_index idx) { @@ -133,6 +126,9 @@ struct mem_cgroup { * statistics. */ struct mem_cgroup_stat stat; + + /* per memory cgroup dirty_ratio */ + long dirty_ratio; }; static struct mem_cgroup init_mem_cgroup; @@ -358,6 +354,141 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) return ret; } +static struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) +{ + struct page_cgroup *pc; + struct mem_cgroup *mem = NULL; + + lock_page_cgroup(page); + pc = page_get_page_cgroup(page); + if (pc) { + mem = pc->mem_cgroup; + css_get(&mem->css); + } + unlock_page_cgroup(page); + return mem; +} + +struct mem_cgroup *get_current_mem_cgroup(void) +{ + struct mem_cgroup *mem; + + rcu_read_lock(); + mem = mem_cgroup_from_task(current); + if (likely(mem)) + css_get(&mem->css); + rcu_read_unlock(); + + return mem; +} + +void put_mem_cgroup(struct mem_cgroup *mem) +{ + css_put(&mem->css); +} + +static void mem_cgroup_charge_stat(struct page *page, + enum mem_cgroup_stat_index idx, int charge) +{ + struct mem_cgroup *mem; + + mem = get_mem_cgroup_from_page(page); + VM_BUG_ON(!mem); + __mem_cgroup_stat_add(&mem->stat, idx, charge); + css_put(&mem->css); +} + +void mem_cgroup_charge_file_dirty(struct 
page *page, int charge) +{ + mem_cgroup_charge_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, charge); +} + +void mem_cgroup_charge_writeback(struct page *page, int charge) +{ + mem_cgroup_charge_stat(page, MEM_CGROUP_STAT_WRITEBACK, charge); +} + +s64 mem_cgroup_nr_file_dirty(struct mem_cgroup *mem) +{ + s64 ret; + + if (mem == NULL) { + mem = get_current_mem_cgroup(); + if (unlikely(!mem)) + return global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + } else + css_get(&mem->css); + ret = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_DIRTY); + css_put(&mem->css); + return ret; +} + +s64 mem_cgroup_nr_writeback(struct mem_cgroup *mem) +{ + s64 ret; + + if (mem == NULL) { + mem = get_current_mem_cgroup(); + if (unlikely(!mem)) + return global_page_state(NR_WRITEBACK); + } else + css_get(&mem->css); + ret = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_WRITEBACK); + css_put(&mem->css); + return ret; +} + +unsigned long mem_cgroup_get_free_pages(struct mem_cgroup *mem) +{ + long ret; + + if (mem == NULL) { + mem = get_current_mem_cgroup(); + if (unlikely(!mem)) + return 0; + } else + css_get(&mem->css); + ret = ((res_counter_read_u64(&mem->res, RES_LIMIT) + - res_counter_read_u64(&mem->res, RES_USAGE)) + >> PAGE_SHIFT) + 1; /* Ensure that we never return 0 */ + css_put(&mem->css); + return ret; +} + +unsigned long mem_cgroup_global_lru_pages(struct mem_cgroup *mem) +{ + long ret; + + if (mem == NULL) { + mem = get_current_mem_cgroup(); + if (unlikely(!mem)) + return 0; + } else + css_get(&mem->css); + ret = mem_cgroup_get_all_zonestat(mem, LRU_ACTIVE_ANON) + + mem_cgroup_get_all_zonestat(mem, LRU_ACTIVE_FILE) + + mem_cgroup_get_all_zonestat(mem, LRU_INACTIVE_FILE) + + mem_cgroup_get_all_zonestat(mem, LRU_INACTIVE_FILE); + css_put(&mem->css); + return ret; +} + +long mem_cgroup_dirty_ratio(struct mem_cgroup *mem) +{ + long ret; + + if (mem == NULL) { + mem = get_current_mem_cgroup(); + if (unlikely(!mem)) + return vm_dirty_ratio; + } else + 
css_get(&mem->css); + ret = mem->dirty_ratio; + css_put(&mem->css); + return ret; +} + /* * This routine assumes that the appropriate zone's lru lock is already held */ @@ -953,12 +1084,32 @@ static int mem_force_empty_write(struct cgroup *cont, unsigned int event) return mem_cgroup_force_empty(mem_cgroup_from_cont(cont)); } +static s64 mem_cgroup_dirty_ratio_read(struct cgroup *cont, struct cftype *cft) +{ + return mem_cgroup_from_cont(cont)->dirty_ratio; +} + +static int mem_cgroup_dirty_ratio_write(struct cgroup *cont, struct cftype *cft, + const char *buffer) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + long val; + int ret; + + ret = strict_strtol(buffer, 10, &val); + if (!ret) + mem->dirty_ratio = val; + return ret; +} + static const struct mem_cgroup_stat_desc { const char *msg; u64 unit; } mem_cgroup_stat_desc[] = { [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, + [MEM_CGROUP_STAT_FILE_DIRTY] = { "filedirty", 1, }, + [MEM_CGROUP_STAT_WRITEBACK] = { "writeback", 1, }, [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, }; @@ -1023,6 +1174,11 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_read, }, { + .name = "dirty_ratio", + .write_string = mem_cgroup_dirty_ratio_write, + .read_s64 = mem_cgroup_dirty_ratio_read, + }, + { .name = "failcnt", .private = RES_FAILCNT, .trigger = mem_cgroup_reset, @@ -1114,6 +1270,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) } res_counter_init(&mem->res); + mem->dirty_ratio = vm_dirty_ratio; for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c6d6088..17c6141 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -25,6 +25,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/mpage.h> +#include <linux/memcontrol.h> #include 
<linux/rmap.h> #include <linux/percpu.h> #include <linux/notifier.h> @@ -1090,6 +1091,7 @@ int __set_page_dirty_nobuffers(struct page *page) BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); if (mapping_cap_account_dirty(mapping)) { + mem_cgroup_charge_file_dirty(page, 1); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); @@ -1234,6 +1236,7 @@ int clear_page_dirty_for_io(struct page *page) * for more comments. */ if (TestClearPageDirty(page)) { + mem_cgroup_charge_file_dirty(page, -1); dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); @@ -1269,8 +1272,11 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } - if (ret) + if (ret) { + if (!PageAnon(page)) + mem_cgroup_charge_writeback(page, -1); dec_zone_page_state(page, NR_WRITEBACK); + } return ret; } @@ -1300,8 +1306,11 @@ int test_set_page_writeback(struct page *page) } else { ret = TestSetPageWriteback(page); } - if (!ret) + if (!ret) { + if (!PageAnon(page)) + mem_cgroup_charge_writeback(page, 1); inc_zone_page_state(page, NR_WRITEBACK); + } return ret; } diff --git a/mm/truncate.c b/mm/truncate.c index e2bdd70..f47bd19 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -10,6 +10,7 @@ #include <linux/kernel.h> #include <linux/backing-dev.h> #include <linux/mm.h> +#include <linux/memcontrol.h> #include <linux/swap.h> #include <linux/module.h> #include <linux/pagemap.h> @@ -73,6 +74,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) if (TestClearPageDirty(page)) { struct address_space *mapping = page->mapping; if (mapping && mapping_cap_account_dirty(mapping)) { + mem_cgroup_charge_file_dirty(page, -1); dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); -- 1.5.4.3 _______________________________________________ Containers mailing list [EMAIL PROTECTED] 
https://lists.linux-foundation.org/mailman/listinfo/containers _______________________________________________ Devel mailing list Devel@openvz.org https://openvz.org/mailman/listinfo/devel