Add the accounting hooks. Accounting is carried out for RSS and page cache (unmapped) pages, and there is now a common limit and common accounting for both. RSS is accounted at page_add_*_rmap() and page_remove_rmap() time; page cache is accounted at add_to_page_cache() and __remove_from_page_cache(). Swap cache is also accounted for.

Each page's meta_page is protected by a bit in the page flags; this makes it easier to handle race conditions involving simultaneous mappings of a page. A reference count is kept in the meta_page to deal with the case where a page is unmapped from the RSS of all tasks but still lives in the page cache.
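To make the expected usage concrete, here is a minimal caller-side sketch (illustration only, not part of the patch; the example_* functions are made-up names, while mem_container_charge(), mem_container_uncharge() and page_get_meta_page() are the interfaces used by the hunks below). A hook site charges the page before making it visible and drops the charge when the last user of the page goes away:

/*
 * Sketch only -- mirrors the pattern used at add_to_page_cache() and
 * page_remove_rmap() in the patch below.
 */
static int example_add_page(struct page *page, struct mm_struct *mm)
{
        /*
         * Charge the page to mm's container (init_mm is used when mm is
         * NULL, e.g. for page cache).  Fails when the container is over
         * its limit.
         */
        if (mem_container_charge(page, mm))
                return -ENOMEM;

        /* ... make the page visible: insert into the page cache or map it ... */
        return 0;
}

static void example_remove_page(struct page *page)
{
        /*
         * Drop one reference on the page's meta_page; the container is
         * uncharged when the last user (mapping or page cache) goes away.
         */
        mem_container_uncharge(page_get_meta_page(page));
}

The same pairing appears below in add_to_page_cache()/__remove_from_page_cache() for the page cache, in the fault paths and page_remove_rmap() for RSS, and in the swap cache hooks.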
Signed-off-by: Balbir Singh <[EMAIL PROTECTED]>
---

 fs/exec.c                  |    1 
 include/linux/memcontrol.h |   11 +++
 include/linux/page-flags.h |    3 +
 mm/filemap.c               |    8 ++
 mm/memcontrol.c            |  132 ++++++++++++++++++++++++++++++++++++++++++++-
 mm/memory.c                |   22 +++++++
 mm/migrate.c               |    6 ++
 mm/page_alloc.c            |    3 +
 mm/rmap.c                  |    2 
 mm/swap_state.c            |    8 ++
 mm/swapfile.c              |   40 +++++++------
 11 files changed, 218 insertions(+), 18 deletions(-)

diff -puN fs/exec.c~mem-control-accounting fs/exec.c
--- linux-2.6.22-rc6/fs/exec.c~mem-control-accounting	2007-07-05 13:45:18.000000000 -0700
+++ linux-2.6.22-rc6-balbir/fs/exec.c	2007-07-05 13:45:18.000000000 -0700
@@ -51,6 +51,7 @@
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
 #include <linux/signalfd.h>
+#include <linux/memcontrol.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
diff -puN include/linux/memcontrol.h~mem-control-accounting include/linux/memcontrol.h
--- linux-2.6.22-rc6/include/linux/memcontrol.h~mem-control-accounting	2007-07-05 13:45:18.000000000 -0700
+++ linux-2.6.22-rc6-balbir/include/linux/memcontrol.h	2007-07-05 18:27:26.000000000 -0700
@@ -24,6 +24,8 @@ extern void mm_init_container(struct mm_
 extern void mm_free_container(struct mm_struct *mm);
 extern void page_assign_meta_page(struct page *page, struct meta_page *mp);
 extern struct meta_page *page_get_meta_page(struct page *page);
+extern int mem_container_charge(struct page *page, struct mm_struct *mm);
+extern void mem_container_uncharge(struct meta_page *mp);
 
 #else /* CONFIG_CONTAINER_MEM_CONT */
 static inline void mm_init_container(struct mm_struct *mm,
@@ -45,6 +47,15 @@ static inline struct meta_page *page_get
         return NULL;
 }
 
+static inline int mem_container_charge(struct page *page, struct mm_struct *mm)
+{
+        return 0;
+}
+
+static inline void mem_container_uncharge(struct meta_page *mp)
+{
+}
+
 #endif /* CONFIG_CONTAINER_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff -puN include/linux/page-flags.h~mem-control-accounting include/linux/page-flags.h
--- linux-2.6.22-rc6/include/linux/page-flags.h~mem-control-accounting	2007-07-05 13:45:18.000000000 -0700
+++ linux-2.6.22-rc6-balbir/include/linux/page-flags.h	2007-07-05 13:45:18.000000000 -0700
@@ -98,6 +98,9 @@
 #define PG_checked		PG_owner_priv_1 /* Used by some filesystems */
 #define PG_pinned		PG_owner_priv_1 /* Xen pinned pagetable */
 
+#define PG_metapage		21	/* Used for checking if a meta_page */
+					/* is associated with a page */
+
 #if (BITS_PER_LONG > 32)
 /*
  * 64-bit-only flags build down from bit 31
diff -puN mm/filemap.c~mem-control-accounting mm/filemap.c
--- linux-2.6.22-rc6/mm/filemap.c~mem-control-accounting	2007-07-05 13:45:18.000000000 -0700
+++ linux-2.6.22-rc6-balbir/mm/filemap.c	2007-07-05 18:26:29.000000000 -0700
@@ -31,6 +31,7 @@
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -116,6 +117,7 @@ void __remove_from_page_cache(struct pag
 {
         struct address_space *mapping = page->mapping;
 
+        mem_container_uncharge(page_get_meta_page(page));
         radix_tree_delete(&mapping->page_tree, page->index);
         page->mapping = NULL;
         mapping->nrpages--;
@@ -442,6 +444,11 @@ int add_to_page_cache(struct page *page,
         int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
         if (error == 0) {
+
+                error = mem_container_charge(page, current->mm);
+                if (error)
+                        goto out;
+
                 write_lock_irq(&mapping->tree_lock);
                 error = radix_tree_insert(&mapping->page_tree, offset, page);
                 if (!error) {
@@ -455,6 +462,7 @@ int add_to_page_cache(struct page *page,
                 write_unlock_irq(&mapping->tree_lock);
                 radix_tree_preload_end();
         }
+out:
         return error;
 }
 EXPORT_SYMBOL(add_to_page_cache);
diff -puN mm/memcontrol.c~mem-control-accounting mm/memcontrol.c
--- linux-2.6.22-rc6/mm/memcontrol.c~mem-control-accounting	2007-07-05 13:45:18.000000000 -0700
+++ linux-2.6.22-rc6-balbir/mm/memcontrol.c	2007-07-05 18:27:29.000000000 -0700
@@ -16,6 +16,9 @@
 #include <linux/memcontrol.h>
 #include <linux/container.h>
 #include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
 
 struct container_subsys mem_container_subsys;
 
@@ -26,7 +29,9 @@ struct container_subsys mem_container_su
  * to help the administrator determine what knobs to tune.
  *
  * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark.
+ * we hit the water mark. Maybe even add a low water mark, such that
+ * no reclaim occurs from a container at its low water mark; this is
+ * a feature that will be implemented much later.
  */
 struct mem_container {
         struct container_subsys_state css;
@@ -51,6 +56,7 @@ struct meta_page {
         struct list_head list;		/* per container LRU list */
         struct page *page;
         struct mem_container *mem_container;
+        atomic_t ref_cnt;
 };
 
 
@@ -87,6 +93,128 @@ struct meta_page *page_get_meta_page(str
         return page->meta_page;
 }
 
+void __always_inline lock_meta_page(struct page *page)
+{
+        bit_spin_lock(PG_metapage, &page->flags);
+}
+
+void __always_inline unlock_meta_page(struct page *page)
+{
+        bit_spin_unlock(PG_metapage, &page->flags);
+}
+
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the container is over its limit
+ */
+int mem_container_charge(struct page *page, struct mm_struct *mm)
+{
+        struct mem_container *mem;
+        struct meta_page *mp;
+
+        /*
+         * Should meta_pages go to their own slab?
+         * One could optimize the performance of the charging routine
+         * by saving a bit in the page_flags and using it as a lock
+         * to see if the container page already has a meta_page associated
+         * with it.
+         */
+        lock_meta_page(page);
+        mp = page_get_meta_page(page);
+        /*
+         * The meta_page exists and the page has already been accounted.
+         */
+        if (mp) {
+                atomic_inc(&mp->ref_cnt);
+                goto done;
+        }
+
+        unlock_meta_page(page);
+
+        mp = kzalloc(sizeof(struct meta_page), GFP_KERNEL);
+        if (mp == NULL)
+                goto err;
+
+        rcu_read_lock();
+        /*
+         * We always charge the container the mm_struct belongs to;
+         * the mm_struct's mem_container changes on task migration if the
+         * thread group leader migrates. It's possible that mm is not
+         * set, if so charge the init_mm (happens for pagecache usage).
+         */
+        if (!mm)
+                mm = &init_mm;
+
+        mem = rcu_dereference(mm->mem_container);
+        /*
+         * For every charge from the container, increment the reference
+         * count.
+         */
+        css_get(&mem->css);
+        rcu_read_unlock();
+
+        /*
+         * If we created the meta_page, we should free it on exceeding
+         * the container limit.
+         */
+        if (res_counter_charge(&mem->res, 1))
+                goto free_mp;
+
+        lock_meta_page(page);
+        /*
+         * Check if somebody else beat us to allocating the meta_page.
+         */
+        if (page_get_meta_page(page)) {
+                atomic_inc(&mp->ref_cnt);
+                res_counter_uncharge(&mem->res, 1);
+                goto done;
+        }
+
+        atomic_set(&mp->ref_cnt, 1);
+        mp->mem_container = mem;
+        mp->page = page;
+        page_assign_meta_page(page, mp);
+
+done:
+        unlock_meta_page(page);
+        return 0;
+free_mp:
+        kfree(mp);
+        return -ENOMEM;
+err:
+        unlock_meta_page(page);
+        return -ENOMEM;
+}
+
+/*
+ * Uncharging is always a welcome operation; we never complain, we
+ * simply uncharge.
+ */
+void mem_container_uncharge(struct meta_page *mp)
+{
+        struct mem_container *mem;
+        struct page *page;
+
+        /*
+         * This can happen for PAGE_ZERO.
+         */
+        if (!mp)
+                return;
+
+        if (atomic_dec_and_test(&mp->ref_cnt)) {
+                page = mp->page;
+                lock_meta_page(page);
+                mem = mp->mem_container;
+                css_put(&mem->css);
+                page_assign_meta_page(page, NULL);
+                unlock_meta_page(page);
+                res_counter_uncharge(&mem->res, 1);
+                kfree(mp);
+        }
+}
+
 static ssize_t mem_container_read(struct container *cont, struct cftype *cft,
                         struct file *file, char __user *userbuf, size_t nbytes,
                         loff_t *ppos)
@@ -142,6 +270,8 @@ static int mem_container_create(struct c
         res_counter_init(&mem->res);
         cont->subsys[mem_container_subsys_id] = &mem->css;
         mem->css.container = cont;
+        INIT_LIST_HEAD(&mem->active_list);
+        INIT_LIST_HEAD(&mem->inactive_list);
         return 0;
 }
 
diff -puN mm/memory.c~mem-control-accounting mm/memory.c
--- linux-2.6.22-rc6/mm/memory.c~mem-control-accounting	2007-07-05 13:45:18.000000000 -0700
+++ linux-2.6.22-rc6-balbir/mm/memory.c	2007-07-05 13:45:18.000000000 -0700
@@ -50,6 +50,7 @@
 #include <linux/delayacct.h>
 #include <linux/init.h>
 #include <linux/writeback.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -1226,6 +1227,10 @@ static int insert_page(struct mm_struct
         pte_t *pte;
         spinlock_t *ptl;
 
+        retval = mem_container_charge(page, mm);
+        if (retval)
+                goto out;
+
         retval = -EINVAL;
         if (PageAnon(page))
                 goto out;
@@ -1731,6 +1736,9 @@ gotten:
                 cow_user_page(new_page, old_page, address, vma);
         }
 
+        if (mem_container_charge(new_page, mm))
+                goto oom;
+
         /*
          * Re-check the pte - we dropped the lock
          */
@@ -2188,6 +2196,11 @@ static int do_swap_page(struct mm_struct
         }
 
         delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+        if (mem_container_charge(page, mm)) {
+                ret = VM_FAULT_OOM;
+                goto out;
+        }
+
         mark_page_accessed(page);
         lock_page(page);
 
@@ -2255,6 +2268,7 @@ static int do_anonymous_page(struct mm_s
         pte_t entry;
 
         if (write_access) {
+
                 /* Allocate our own private page. */
                 pte_unmap(page_table);
 
@@ -2264,6 +2278,9 @@ static int do_anonymous_page(struct mm_s
                 if (!page)
                         goto oom;
 
+                if (mem_container_charge(page, mm))
+                        goto oom;
+
                 entry = mk_pte(page, vma->vm_page_prot);
                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
@@ -2397,6 +2414,11 @@ static int __do_fault(struct mm_struct *
         }
 
+        if (mem_container_charge(page, mm)) {
+                fdata.type = VM_FAULT_OOM;
+                goto out;
+        }
+
         page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
         /*
diff -puN mm/migrate.c~mem-control-accounting mm/migrate.c
--- linux-2.6.22-rc6/mm/migrate.c~mem-control-accounting	2007-07-05 13:45:18.000000000 -0700
+++ linux-2.6.22-rc6-balbir/mm/migrate.c	2007-07-05 13:45:18.000000000 -0700
@@ -28,6 +28,7 @@
 #include <linux/mempolicy.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
+#include <linux/memcontrol.h>
 
 #include "internal.h"
 
@@ -157,6 +158,11 @@ static void remove_migration_pte(struct
                 return;
         }
 
+        if (mem_container_charge(new, mm)) {
+                pte_unmap(ptep);
+                return;
+        }
+
         ptl = pte_lockptr(mm, pmd);
         spin_lock(ptl);
         pte = *ptep;
diff -puN mm/page_alloc.c~mem-control-accounting mm/page_alloc.c
--- linux-2.6.22-rc6/mm/page_alloc.c~mem-control-accounting	2007-07-05 13:45:18.000000000 -0700
+++ linux-2.6.22-rc6-balbir/mm/page_alloc.c	2007-07-05 13:45:18.000000000 -0700
@@ -41,6 +41,7 @@
 #include <linux/pfn.h>
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1015,6 +1016,7 @@ static void fastcall free_hot_cold_page(
         if (!PageHighMem(page))
                 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+        page_assign_meta_page(page, NULL);
         arch_free_page(page, 0);
         kernel_map_pages(page, 1, 0);
 
@@ -2576,6 +2578,7 @@ void __meminit memmap_init_zone(unsigned
                 set_page_links(page, zone, nid, pfn);
                 init_page_count(page);
                 reset_page_mapcount(page);
+                page_assign_meta_page(page, NULL);
                 SetPageReserved(page);
 
                 /*
diff -puN mm/rmap.c~mem-control-accounting mm/rmap.c
--- linux-2.6.22-rc6/mm/rmap.c~mem-control-accounting	2007-07-05 13:45:18.000000000 -0700
+++ linux-2.6.22-rc6-balbir/mm/rmap.c	2007-07-05 13:45:18.000000000 -0700
@@ -643,6 +643,8 @@ void page_remove_rmap(struct page *page,
                         page_clear_dirty(page);
                         set_page_dirty(page);
                 }
+
+                mem_container_uncharge(page_get_meta_page(page));
                 __dec_zone_page_state(page,
                                 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
         }
diff -puN mm/swapfile.c~mem-control-accounting mm/swapfile.c
--- linux-2.6.22-rc6/mm/swapfile.c~mem-control-accounting	2007-07-05 13:45:18.000000000 -0700
+++ linux-2.6.22-rc6-balbir/mm/swapfile.c	2007-07-05 13:45:18.000000000 -0700
@@ -506,9 +506,12 @@ unsigned int count_swap_pages(int type,
  * just let do_wp_page work it out if a write is requested later - to
  * force COW, vm_page_prot omits write permission from any private vma.
  */
-static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+static int unuse_pte(struct vm_area_struct *vma, pte_t *pte,
                 unsigned long addr, swp_entry_t entry, struct page *page)
 {
+        if (mem_container_charge(page, vma->vm_mm))
+                return -ENOMEM;
+
         inc_mm_counter(vma->vm_mm, anon_rss);
         get_page(page);
         set_pte_at(vma->vm_mm, addr, pte,
@@ -520,6 +523,7 @@ static void unuse_pte(struct vm_area_str
          * immediately swapped out again after swapon.
          */
         activate_page(page);
+        return 1;
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -529,7 +533,7 @@ static int unuse_pte_range(struct vm_are
         pte_t swp_pte = swp_entry_to_pte(entry);
         pte_t *pte;
         spinlock_t *ptl;
-        int found = 0;
+        int ret = 0;
 
         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
         do {
@@ -538,13 +542,12 @@ static int unuse_pte_range(struct vm_are
                  * Test inline before going to call unuse_pte.
                  */
                 if (unlikely(pte_same(*pte, swp_pte))) {
-                        unuse_pte(vma, pte++, addr, entry, page);
-                        found = 1;
+                        ret = unuse_pte(vma, pte++, addr, entry, page);
                         break;
                 }
         } while (pte++, addr += PAGE_SIZE, addr != end);
         pte_unmap_unlock(pte - 1, ptl);
-        return found;
+        return ret;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -553,14 +556,16 @@ static inline int unuse_pmd_range(struct
 {
         pmd_t *pmd;
         unsigned long next;
+        int ret;
 
         pmd = pmd_offset(pud, addr);
         do {
                 next = pmd_addr_end(addr, end);
                 if (pmd_none_or_clear_bad(pmd))
                         continue;
-                if (unuse_pte_range(vma, pmd, addr, next, entry, page))
-                        return 1;
+                ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+                if (ret)
+                        return ret;
         } while (pmd++, addr = next, addr != end);
         return 0;
 }
@@ -571,14 +576,16 @@ static inline int unuse_pud_range(struct
 {
         pud_t *pud;
         unsigned long next;
+        int ret;
 
         pud = pud_offset(pgd, addr);
         do {
                 next = pud_addr_end(addr, end);
                 if (pud_none_or_clear_bad(pud))
                         continue;
-                if (unuse_pmd_range(vma, pud, addr, next, entry, page))
-                        return 1;
+                ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+                if (ret)
+                        return ret;
         } while (pud++, addr = next, addr != end);
         return 0;
 }
@@ -588,6 +595,7 @@ static int unuse_vma(struct vm_area_stru
 {
         pgd_t *pgd;
         unsigned long addr, end, next;
+        int ret;
 
         if (page->mapping) {
                 addr = page_address_in_vma(page, vma);
@@ -605,8 +613,9 @@ static int unuse_vma(struct vm_area_stru
                 next = pgd_addr_end(addr, end);
                 if (pgd_none_or_clear_bad(pgd))
                         continue;
-                if (unuse_pud_range(vma, pgd, addr, next, entry, page))
-                        return 1;
+                ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+                if (ret)
+                        return ret;
         } while (pgd++, addr = next, addr != end);
         return 0;
 }
@@ -615,6 +624,7 @@ static int unuse_mm(struct mm_struct *mm
                 swp_entry_t entry, struct page *page)
 {
         struct vm_area_struct *vma;
+        int ret = 0;
 
         if (!down_read_trylock(&mm->mmap_sem)) {
                 /*
@@ -627,15 +637,11 @@ static int unuse_mm(struct mm_struct *mm
                 lock_page(page);
         }
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
-                if (vma->anon_vma && unuse_vma(vma, entry, page))
+                if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
                         break;
         }
         up_read(&mm->mmap_sem);
-        /*
-         * Currently unuse_mm cannot fail, but leave error handling
-         * at call sites for now, since we change it from time to time.
-         */
-        return 0;
+        return ret;
 }
 
 /*
diff -puN mm/swap_state.c~mem-control-accounting mm/swap_state.c
--- linux-2.6.22-rc6/mm/swap_state.c~mem-control-accounting	2007-07-05 13:45:18.000000000 -0700
+++ linux-2.6.22-rc6-balbir/mm/swap_state.c	2007-07-05 18:28:08.000000000 -0700
@@ -17,6 +17,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 
@@ -79,6 +80,11 @@ static int __add_to_swap_cache(struct pa
         BUG_ON(PagePrivate(page));
         error = radix_tree_preload(gfp_mask);
         if (!error) {
+
+                error = mem_container_charge(page, current->mm);
+                if (error)
+                        goto out;
+
                 write_lock_irq(&swapper_space.tree_lock);
                 error = radix_tree_insert(&swapper_space.page_tree,
                                                 entry.val, page);
@@ -93,6 +99,7 @@ static int __add_to_swap_cache(struct pa
                 write_unlock_irq(&swapper_space.tree_lock);
                 radix_tree_preload_end();
         }
+out:
         return error;
 }
 
@@ -129,6 +136,7 @@ void __delete_from_swap_cache(struct pag
         BUG_ON(PageWriteback(page));
         BUG_ON(PagePrivate(page));
 
+        mem_container_uncharge(page_get_meta_page(page));
         radix_tree_delete(&swapper_space.page_tree, page_private(page));
         set_page_private(page, 0);
         ClearPageSwapCache(page);
_

--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL