This patch implements the handler that is invoked when a container goes over its page limit.
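As a rough illustration of how this handler is expected to be driven (this sketch is not part of the patch; it assumes the kcontainerd worker and the container_struct fields such as page_limit, num_file_pages and num_anon_pages introduced elsewhere in this series, and the mutex field name is only a guess):

	/*
	 * Illustrative sketch only, not part of this patch.  All names other
	 * than container_over_pagelimit() and the page counters are
	 * assumptions based on the rest of the series.
	 */
	static void kcontainerd_check_limit(struct container_struct *ctn)
	{
		long used;

		mutex_lock(&ctn->mutex);	/* assumed field name */
		used = atomic_long_read(&ctn->num_file_pages) +
		       atomic_long_read(&ctn->num_anon_pages);
		if (used > ctn->page_limit)
			/* file pages are deactivated first, then anon pages */
			container_over_pagelimit(ctn);
		mutex_unlock(&ctn->mutex);
	}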
Signed-off-by: Rohit Seth <[EMAIL PROTECTED]>

 mm/container_mm.c |  512 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 512 insertions(+)

--- linux-2.6.18-rc6-mm2.org/mm/container_mm.c	1969-12-31 16:00:00.000000000 -0800
+++ linux-2.6.18-rc6-mm2.ctn/mm/container_mm.c	2006-09-19 18:23:11.000000000 -0700
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2006 Rohit Seth <[EMAIL PROTECTED]>
+ *
+ * Basic mm reclaimer for containers.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/container.h>
+#include <linux/rmap.h>
+#include <linux/mm_inline.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+
+#if 0
+#define Cprintk(x...) do { printk(x); } while (0)
+#else
+#define Cprintk(x...)
+#endif
+
+/*
+ * Count of task's vmas scanned for active pages in one shot.
+ */
+#define MAX_VMA_COUNT 5
+
+/*
+ * Max number of active pages that get deactivated in each iteration
+ * of a page_limit hit.
+ */
+#define DEACTIVATE_COUNT 1000
+
+/*
+ * Files smaller than this number of pages are skipped in the
+ * first scan for inactivating pages. This is also used for scanning
+ * tasks with an anon page count greater than this.
+ */
+#define PAGE_THRESHOLD_LIMIT 50
+
+/*
+ * Check if the container is under its threshold for memory. This threshold is
+ * currently set at about 12% below the limit. Used to find out if we need to
+ * deactivate more pages.
+ */
+
+static int container_under_threshold(struct container_struct *ctn, long nr)
+{
+	int ret = 0;
+	long count = ctn->page_limit - (ctn->page_limit >> 3);
+
+	if ((atomic_long_read(&ctn->num_file_pages) +
+		atomic_long_read(&ctn->num_anon_pages) - nr) < count)
+		ret = 1;
+
+	return ret;
+}
+
+/*
+ * Internal function that builds a list of vmas belonging to mm whose pages
+ * are going to be scanned to be deactivated. We only look for anonymous vmas.
+ * It returns the number of vmas populated in the vmas array.
+ */
+static int get_task_vmas(struct mm_struct *mm, struct vm_area_struct **vmas,
+		struct vm_area_struct *start_vma)
+{
+	struct vm_area_struct *tmp;
+	int i = 0;
+
+
+	if (start_vma == NULL)
+		tmp = mm->mmap;
+	else
+		tmp = start_vma->vm_next;
+
+	while ((i < MAX_VMA_COUNT) && (tmp != NULL)) {
+		vmas[i] = tmp;
+		if (!(tmp->vm_flags & (VM_IO | VM_RESERVED))) {
+			if (tmp->anon_vma != NULL)
+				i++;
+		}
+		tmp = tmp->vm_next;
+	}
+	return i;
+}
+
+/*
+ * This function iterates over the vma's pages. It reinitializes the container
+ * pointer for a page if it belongs to ctn. It decrements the anon_page and
+ * active_page count for the container ctn.
+ * We only check for anonymous pages as the file pages get their container
+ * inheritance from the mapping.
+ */
+static long anonpages_init(struct container_struct *ctn,
+		struct vm_area_struct *vma)
+{
+	struct page *page;
+	struct zone *z;
+	unsigned long addr, end;
+	long ret = 0, active = 0;
+
+	addr = vma->vm_start;
+	end = vma->vm_end;
+
+	while (addr < end) {
+		page = follow_page(vma, addr, 0);
+		if ((page) && (page->ctn == ctn)) {
+			z = page_zone(page);
+			spin_lock_irq(&z->lru_lock);
+			if (page->ctn == ctn) {
+				if (PageAnon(page)) {
+					page->ctn = NULL;
+					atomic_long_dec(&ctn->num_anon_pages);
+					ret++;
+					if (PageActive(page) && PageLRU(page))
+						active++;
+				}
+			}
+			spin_unlock_irq(&z->lru_lock);
+		}
+		addr += PAGE_SIZE;
+	}
+	atomic_long_sub(active, &ctn->num_active_pages);
+	return ret;
+}
+
+/*
+ * This function is called when a task is explicitly requested to be moved out
+ * of its container through the /configfs/containers/<container>/rmtask interface.
+ * We iterate over the task's vmas and reinitialize the container pointer in
+ * every anon page mapping of this task.
+ */
+long anonpages_sub(struct task_struct *task, struct container_struct *ctn)
+{
+	struct vm_area_struct *vmas[MAX_VMA_COUNT];
+	struct vm_area_struct *start_vma = NULL;
+	struct mm_struct *mm;
+	long ret = 0;
+	int count, i;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		return -1;
+	Cprintk("anonpages_sub scanning for task %d... ", task->pid);
+	/*
+	 * No more new page faults beyond this point till the time
+	 * we finish scanning.
+	 */
+	down_write(&mm->mmap_sem);
+
+	/*
+	 * Iterate over the vmas to reinitialize the container pointers.
+	 */
+	while ((count = get_task_vmas(mm, vmas, start_vma)) > 0) {
+		i = 0;
+		while (i < count)
+			ret += anonpages_init(ctn, vmas[i++]);
+
+		if (count == MAX_VMA_COUNT)
+			/*
+			 * Make get_task_vmas start from after the last
+			 * scanned vma.
+			 */
+			start_vma = vmas[MAX_VMA_COUNT-1];
+		else
+			break;
+	}
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	Cprintk("returning %ld \n", ret);
+	return ret;
+}
+
+/*
+ * Write back some of the pages belonging to this container. The container
+ * mutex is already held.
+ */
+void writeback_container_file_pages(struct container_struct *ctn)
+{
+	struct address_space *mapping;
+	long flushed = 0;
+	long limit = (ctn->page_limit >> 1);
+	long invalidated = 0;
+	long temp;
+	int ret;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+	};
+
+	list_for_each_entry(mapping, &ctn->mappings, ctn_mapping_list) {
+		if (mapping->nrpages > limit)
+			temp = limit;
+		else
+			temp = mapping->nrpages;
+		wbc.nr_to_write = temp;
+		ret = do_writepages(mapping, &wbc);
+		flushed += (temp - wbc.nr_to_write);
+		invalidated += invalidate_inode_pages(mapping);
+		if (flushed > DEACTIVATE_COUNT)
+			break;
+	}
+	Cprintk("Writeback %ld %ld\n", flushed, invalidated);
+}
+
+/*
+ * If the page is active then this function marks it inactive, clears the
+ * reference bit (so as to make it a better target for reclaim) and also
+ * puts it on the inactive list. This function also decrements the zone's
+ * active number of pages.
+ */
+static int deactivate_container_page(struct page *page)
+{
+	struct zone *z;
+	int ret = 0, tmp;
+
+	if (!PageActive(page))
+		goto out;
+	if (!PageLRU(page))
+		goto out;
+
+	z = page_zone(page);
+	spin_lock_irq(&z->lru_lock);
+	if (PageActive(page) && PageLRU(page)) {
+		/*
+		 * Not calling del_page_from_active_list so as to reduce
+		 * the number of atomic decs. We do one atomic_long_sub
+		 * after the required number of pages have been deactivated.
+		 */
+		list_del(&page->lru);
+		ClearPageActive(page);
+		z->nr_active--;
+		ret++;
+		add_page_to_inactive_list(z, page);
+	}
+	spin_unlock_irq(&z->lru_lock);
+out:
+	tmp = page_referenced(page, 0);
+	return ret;
+}
+
+#ifdef CONFIG_SWAP
+/*
+ * This function tries to deactivate any (anonymous) active pages belonging to the vma.
+ */
+static int deactivate_pages(struct container_struct *ctn, struct vm_area_struct *vma)
+{
+	struct page *page;
+	unsigned long addr, end;
+	int deactivated = 0;
+
+	addr = vma->vm_start;
+	end = vma->vm_end;
+
+	while (addr < end) {
+		page = follow_page(vma, addr, 0);
+		if (page && (page->ctn == ctn) && PageAnon(page)) {
+			deactivated += deactivate_container_page(page);
+			if (deactivated >= DEACTIVATE_COUNT)
+				break;	/* We have deactivated enough pages. */
+		}
+		addr += PAGE_SIZE;
+	}
+	atomic_long_sub(deactivated, &ctn->num_active_pages);
+	return deactivated;
+}
+
+/*
+ * This is the core of the container anonymous page reclaimer. It gets called
+ * whenever the page_limit of a container is hit and we are not able to
+ * deactivate enough file backed pages belonging to this container.
+ * This function traverses over the task list belonging to this container
+ * and collects vmas for each task till the time it can deactivate at least
+ * DEACTIVATE_COUNT pages.
+ * We use get_task_mm and mmput to avoid handling of a freed up mm.
+ */
+static int sync_container_anon_pages(struct container_struct *ctn)
+{
+	struct vm_area_struct *vmas[MAX_VMA_COUNT];
+	struct vm_area_struct *start_vma;
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	int i, nr = 0, count;
+	int vma_scanned = 0;
+	int small_task = 0;
+
+again:
+	list_for_each_entry(tsk, &ctn->tasks, ctn_task_list) {
+		start_vma = NULL;
+
+		if (tsk->flags & PF_EXITING)
+			continue;
+
+		mm = get_task_mm(tsk);
+		if (!mm)
+			continue;
+		if (!small_task)
+			if (get_mm_counter(mm, anon_rss) < PAGE_THRESHOLD_LIMIT)
+				goto next_task;
+		down_read(&mm->mmap_sem);
+		/*
+		 * Iterate over the anonymous vmas to deactivate
+		 * pages.
+		 */
+		while ((count = get_task_vmas(mm, vmas, start_vma)) > 0) {
+			i = 0;
+
+			vma_scanned += count;
+			while ((i < count) && (nr < DEACTIVATE_COUNT))
+				nr += deactivate_pages(ctn, vmas[i++]);
+
+			if (nr >= DEACTIVATE_COUNT)
+				break;
+
+			if (count == MAX_VMA_COUNT)
+				/*
+				 * Make get_task_vmas start from after the last
+				 * scanned vma.
+				 */
+				start_vma = vmas[MAX_VMA_COUNT-1];
+			else
+				break;
+		}
+		up_read(&mm->mmap_sem);
+next_task:
+		mmput(mm);
+
+		/*
+		 * Break out of the loop if we have deactivated enough pages.
+		 * This should get the currently used pages to somewhat below
+		 * the limit (currently set at about 12%).
+		 */
+		if (container_under_threshold(ctn, nr))
+			goto out;
+	}
+	if (!small_task) {
+		small_task = 1;
+		goto again;
+	}
+out:
+	Cprintk("vma scanned %d Anon returning: %d\n", vma_scanned, nr);
+	return nr;
+
+}
+
+#endif /* CONFIG_SWAP */
+
+/*
+ * This function deactivates any of the active pages present in the
+ * page_array array. count is the number of pages contained in the array.
+ */
+int deactivate_page_array(struct page **page_array, int count)
+{
+	int i = 0;
+	int ret = 0;
+	struct page *page;
+
+	while (i < count) {
+		page = page_array[i++];
+		ret += deactivate_container_page(page);
+		put_page(page);
+	}
+	return ret;
+}
+
+/*
+ * Find all the pages belonging to mapping and reinitialize
+ * their container pointer to point to ctn.
+ */
+long filepages_to_new_container(struct address_space *mapping,
+		struct container_struct *ctn)
+{
+	struct pagevec pvec;
+	long ret = 0;
+	int i;
+	pgoff_t next = 0;
+
+again:
+	pagevec_init(&pvec, 0);
+	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		ret += pagevec_count(&pvec);
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			pgoff_t page_index = page->index;
+
+			BUG_ON(page->ctn != ctn);
+			page->ctn = ctn;
+			if (page_index > next)
+				next = page_index;
+			next++;
+		}
+	}
+	if (ret < mapping->nrpages)
+		goto again;
+	return ret;
+}
+
+/*
+ * This is the core of the container file page reclaimer. It gets called
+ * whenever the page_limit of a container is hit.
+ * This function traverses over the list of files belonging to this container
+ * and collects pages for each file till the time it can deactivate at least
+ * DEACTIVATE_COUNT pages.
+ * It is a two pass algorithm. First it looks for files that have more than
+ * PAGE_THRESHOLD_LIMIT pages mapped. If we are not able to deactivate
+ * DEACTIVATE_COUNT number of pages then we do a second pass. This second
+ * pass scans smaller files only.
+ */
+static int sync_container_file_pages(struct container_struct *ctn)
+{
+	struct address_space *map;
+	struct pagevec pvec;
+	pgoff_t next = 0;
+	int small_files = 0;
+	int i;
+	int ret = 0;
+	int files_scanned = 0;
+	unsigned long invalidated = 0;
+
+again:
+	list_for_each_entry(map, &ctn->mappings, ctn_mapping_list) {
+		if (map->nrpages == 0)
+			continue;
+		if (!small_files) {	/* Look for big files */
+			if (map->nrpages < PAGE_THRESHOLD_LIMIT)
+				continue;
+		}
+		/*
+		 * If the number of page cache pages itself is close to the
+		 * limit then invalidate some of these pages here.
+		 */
+		if (atomic_long_read(&ctn->num_file_pages) > (ctn->page_limit >> 1))
+			invalidated += invalidate_inode_pages(map);
+
+		pagevec_init(&pvec, 0);
+		/*
+		 * In the second iteration, deactivate more pages from each
+		 * file.
+		 */
+		while ((ret < (DEACTIVATE_COUNT * (small_files + 1))) &&
+				pagevec_lookup(&pvec, map, next, PAGEVEC_SIZE)) {
+			for (i = 0; i < pagevec_count(&pvec); i++) {
+				struct page *page = pvec.pages[i];
+				pgoff_t page_index = page->index;
+
+				if (page_index > next)
+					next = page_index;
+				next++;
+			}
+			ret += deactivate_page_array(pvec.pages, pagevec_count(&pvec));
+		}
+		files_scanned++;
+		/*
+		 * Break out of the loop if we have deactivated enough pages.
+		 */
+		if (container_under_threshold(ctn, ret))
+			goto out;
+	}
+
+	if (!small_files) {
+		/* Iterate over smaller files also as we are not
+		 * able to deactivate enough pages.
+		 */
+		small_files = 1;
+		/*
+		 * This will block all resource management for this container. That is
+		 * okay as we are not finding anything that we can easily deactivate.
+		 */
+		goto again;
+	}
+out:
+	atomic_long_sub(ret, &ctn->num_active_pages);
+	Cprintk("File returning: %d, invalidated %ld\n", ret, invalidated);
+	return ret;
+}
+
+/*
+ * This function is called when the container's page limit is hit. In this case
+ * we move the pages (charged against this container) from the active list
+ * to the head of the inactive list, so that if there is any memory pressure
+ * these are the prime candidates for freeing. It first tries to inactivate
+ * file pages and then goes into inactivating anonymous pages.
+ * XXX: More smarts should be put here to get to the right pages
+ * faster and not start iterating over the same range again.
+ * The container's mutex is already held. This is in the context of kcontainerd.
+ */
+void container_over_pagelimit(struct container_struct *ctn)
+{
+	int ret = 0;
+	long anon_pages, file_pages;
+	long tmp;
+	/*
+	 * We have the mutex so the container can not be freed and no new
+	 * resource can be added while the over-the-limit handler is working
+	 * its way through.
+	 */
+	file_pages = atomic_long_read(&ctn->num_file_pages);
+	anon_pages = atomic_long_read(&ctn->num_anon_pages);
+	tmp = file_pages + anon_pages;
+	if (tmp > ctn->page_limit) {
+		if (file_pages > (ctn->page_limit >> 1))
+			ret += sync_container_file_pages(ctn);
+#ifdef CONFIG_SWAP
+		anon_pages = atomic_long_read(&ctn->num_anon_pages);
+		if ((total_swap_pages > 0) &&
+				(anon_pages > (ctn->page_limit >> 6)))
+			ret += sync_container_anon_pages(ctn);
+#endif
+	}
+}
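A note on the reclaim target used above (illustrative, not part of the patch): container_under_threshold() declares the container "under threshold" once

	/* names below are illustrative only */
	long target = ctn->page_limit - (ctn->page_limit >> 3);
	under = (current_usage - pages_just_deactivated) < target;

i.e. once usage, credited with the pages already deactivated in this pass, drops roughly 12.5% below the limit. For example, with page_limit = 25600 pages (100 MB of 4 KB pages), the file and anonymous scans keep going until usage falls below 22400 pages.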