This is just an extract from the latest CKRM/RG memory controller patch sent by Chandra last week; it contains only the parts necessary to connect a new controller to the existing CKRM/RG code.
Minor changes in this part. Signed-off-by: Patrick Le Dot <[EMAIL PROTECTED]> --- include/linux/mem_rc.h | 71 +++++++ include/linux/mem_rc_inline.h | 96 +++++++++ init/Kconfig | 9 kernel/res_group/Makefile | 1 kernel/res_group/memcore.c | 415 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 592 insertions(+) diff -Naurp a/include/linux/mem_rc.h b/include/linux/mem_rc.h --- a/include/linux/mem_rc.h 1970-01-01 01:00:00.000000000 +0100 +++ b/include/linux/mem_rc.h 2006-10-03 09:34:21.000000000 +0200 @@ -0,0 +1,71 @@ +/* include/linux/mem_rc.h : memory control for Resource Groups + * + * Copyright (C) Jiantao Kong, IBM Corp. 2003 + * (C) Shailabh Nagar, IBM Corp. 2003 + * (C) Chandra Seetharaman, IBM Corp. 2004 + * (C) Patrick Le Dot <[EMAIL PROTECTED]@bull.net> 2006 + * + * + * Memory control functions of the Resource Groups kernel API + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#ifndef _LINUX_MEM_RC_H +#define _LINUX_MEM_RC_H + +#ifdef CONFIG_RES_GROUPS_MEM_RC + +#include <linux/list.h> +#include <linux/kref.h> +#include <linux/mmzone.h> +#include <linux/res_group_rc.h> + +struct mem_res_group { + struct resource_group *rgroup; /* the resource group i am part of... 
*/ + struct res_shares shares; + struct list_head res_list; /* list of all res groups */ + unsigned long flags; + unsigned int bit_id; /* the group_id in bitmaps */ + struct kref nr_users; /* ref count */ + atomic_t pg_inuse; /* # of pages in use in the group */ + int max_pg_used; /* max of pages ever used */ + int max_shrink_atlimit; /* max of shrink_class ever called */ + spinlock_t cnt_lock; + int pg_max_shares; /* # of pages at the limit (max shares) */ + int pg_min_shares; /* # of pages under garantee (min shares */ + struct list_head shrink_list; /* list of classes that are near + * limit and need to be shrunk + */ + atomic_t shrink_count; + unsigned long last_shrink; +}; + +extern struct res_controller mem_ctlr; +extern struct mem_res_group *mem_root_res_group; +extern struct list_head mem_res_group_list; +extern spinlock_t mem_res_group_lock; +extern int nr_mem_res_groups; +extern unsigned int tot_lru_pages; +extern unsigned int rgroup_guarantee; +extern unsigned int rgroup_limit; +extern int num_shrinks; +extern int shrink_to; +extern int shrink_at; +extern int shrink_interval; + +extern void rg_mem_release(struct kref *); +extern void rg_mem_add_page(struct page *page, struct mem_res_group *res); +extern void rg_mem_remove_page(struct page *page, struct mem_res_group *res); +extern void rg_mem_migrate_mm(struct mm_struct* mm, struct mem_res_group *old, + struct mem_res_group *new); + +#endif /* CONFIG_RES_GROUPS_MEM_RC */ + +#endif /* _LINUX_MEM_RC_H */ diff -Naurp a/include/linux/mem_rc_inline.h b/include/linux/mem_rc_inline.h --- a/include/linux/mem_rc_inline.h 1970-01-01 01:00:00.000000000 +0100 +++ b/include/linux/mem_rc_inline.h 2006-10-03 09:34:21.000000000 +0200 @@ -0,0 +1,96 @@ +/* include/linux/mem_rc_inline.h : memory control for Resource Groups + * + * Copyright (C) Jiantao Kong, IBM Corp. 2003 + * (C) Shailabh Nagar, IBM Corp. 2003 + * (C) Chandra Seetharaman, IBM Corp. 
2004 + * (C) Patrick Le Dot <[EMAIL PROTECTED]@bull.net> 2006 + * + * + * Memory control functions of the Resource Groups kernel API + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#ifndef _LINUX_MEM_RC_INLINE_H_ +#define _LINUX_MEM_RC_INLINE_H_ + +#include <linux/rmap.h> +#include <linux/mmzone.h> +#include <linux/mem_rc.h> + +#ifdef CONFIG_RES_GROUPS_MEM_RC + +static inline struct mem_res_group *get_shares_mem_rgroup( + struct res_shares *shares) +{ + if (shares) + return container_of(shares, struct mem_res_group, shares); + return NULL; +} + +static inline struct mem_res_group *task_mem_rgroup(struct task_struct *tsk) +{ + return get_shares_mem_rgroup(get_controller_shares(tsk->res_group, + &mem_ctlr)); +} + +static inline struct mem_res_group *get_mem_rgroup(struct resource_group *res) +{ + return get_shares_mem_rgroup(get_controller_shares(res, + &mem_ctlr)); +} + +static inline void res_group_inc_active_list(struct page *page) +{ + struct mem_res_group *res = task_mem_rgroup(current) + ?: mem_root_res_group; + if (res == NULL) + return; + // rg_mem_add_page(page, res); +} + +static inline void res_group_dec_active_list(struct page *page) +{ + // rg_mem_remove_page(page, NULL); +} + +static inline void res_group_inc_inactive_list(struct page *page) +{ + struct mem_res_group *res = task_mem_rgroup(current) + ?: mem_root_res_group; + + if (res == NULL) + return; + // rg_mem_add_page(page, res); +} + +static inline void res_group_dec_inactive_list(struct page *page) +{ + // rg_mem_remove_page(page, NULL); +} + +static inline void res_group_page_init(struct page *page) +{ + +} + +#else + +static inline void *task_mem_rgroup(struct task_struct *tsk) +{ + return NULL; +} + +static inline 
void res_group_inc_active_list(struct page *p) {} +static inline void res_group_dec_active_list(struct page *p) {} +static inline void res_group_inc_inactive_list(struct page *p) {} +static inline void res_group_dec_inactive_list(struct page *p) {} + +#endif +#endif /* _LINUX_MEM_RC_INLINE_H_ */ diff -Naurp a/init/Kconfig b/init/Kconfig --- a/init/Kconfig 2006-10-03 09:35:37.000000000 +0200 +++ b/init/Kconfig 2006-10-03 09:34:21.000000000 +0200 @@ -307,6 +307,15 @@ config RES_GROUPS_NUMTASKS Say N if unsure, Y to use the feature. +config RES_GROUPS_MEM_RC + bool "Memory Resource Controller" + depends on RES_GROUPS + default y + help + Provide the basic support for collecting physical memory usage + information among resource groups. Say Y if you want to know the + memory usage of each resource group. + endmenu config SYSCTL bool "Sysctl support" if EMBEDDED diff -Naurp a/kernel/res_group/Makefile b/kernel/res_group/Makefile --- a/kernel/res_group/Makefile 2006-10-03 09:35:37.000000000 +0200 +++ b/kernel/res_group/Makefile 2006-10-03 09:34:21.000000000 +0200 @@ -1,3 +1,4 @@ obj-y = res_group.o shares.o task.o obj-$(CONFIG_RES_GROUPS_NUMTASKS) += numtasks.o +obj-$(CONFIG_RES_GROUPS_MEM_RC) += memcore.o obj-$(CONFIG_RGCS) += rgcs.o diff -Naurp a/kernel/res_group/memcore.c b/kernel/res_group/memcore.c --- a/kernel/res_group/memcore.c 1970-01-01 01:00:00.000000000 +0100 +++ b/kernel/res_group/memcore.c 2006-10-03 09:34:21.000000000 +0200 @@ -0,0 +1,415 @@ +/* memcore.c - Memory Resource Manager for Resource Groups + * + * Copyright (C) Jiantao Kong, IBM Corp. 2003 + * (C) Chandra Seetharaman, IBM Corp. 
2004 + * (C) Valerie Clement <[EMAIL PROTECTED]> 2004 + * (C) Patrick Le Dot <[EMAIL PROTECTED]@bull.net> 2006 + * + * Provides a Memory Resource controller for Resource Groups + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/cache.h> +#include <linux/percpu.h> +#include <linux/pagevec.h> +#include <linux/parser.h> +#include <linux/mem_rc_inline.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/errno.h> + +static const char res_ctlr_name[] = "mem"; + +unsigned int tot_lru_pages; /* # of pages in the system */ +unsigned int rgroup_guarantee = 0; /* group_bit=1 when usage < guarantee */ +unsigned int rgroup_limit = 0; /* group_bit=1 when usage = limit */ +unsigned int rg_bitmap_shift_index = 0; +int nr_mem_res_groups = 0; +struct mem_res_group *mem_root_res_group = NULL; + +LIST_HEAD(mem_res_group_list); +spinlock_t mem_res_group_lock; /* protects list above */ + +#define DEF_SHRINK_AT 90 +#define DEF_SHRINK_TO 80 +#define DEF_SHRINK_COUNT 10 +#define DEF_SHRINK_INTERVAL 10 + +int shrink_at __read_mostly = DEF_SHRINK_AT; +int shrink_to __read_mostly = DEF_SHRINK_TO; +int num_shrinks __read_mostly = DEF_SHRINK_COUNT; +int shrink_interval __read_mostly = DEF_SHRINK_INTERVAL; + +void rg_mem_release(struct kref *kref) +{ + struct mem_res_group *res = container_of(kref, + struct mem_res_group, nr_users); + kfree(res); +} + +static void set_tot_pages(void) +{ + struct zone *zone; + int i = 0; + + for_each_zone(zone) { + if (!populated_zone(zone)) + 
continue; + i += zone->nr_active; + i += zone->nr_inactive; + i += zone->free_pages; + } + tot_lru_pages = i; +} + +static void mem_res_init_one(struct mem_res_group *mem_res) +{ + + mem_res->shares.min_shares = SHARE_UNSUPPORTED; + mem_res->shares.max_shares = SHARE_UNSUPPORTED; + mem_res->shares.child_shares_divisor = SHARE_DEFAULT_DIVISOR; + mem_res->shares.unused_min_shares = SHARE_DEFAULT_DIVISOR; + + mem_res->pg_max_shares = 0; + mem_res->pg_min_shares = 0; + mem_res->last_shrink = jiffies; + + mem_res->cnt_lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&mem_res->res_list); + INIT_LIST_HEAD(&mem_res->shrink_list); + + mem_res->bit_id = (1 << rg_bitmap_shift_index); + + kref_init(&mem_res->nr_users); +} + +static struct res_shares *mem_alloc_shares_struct(struct resource_group *rgroup) +{ + struct mem_res_group *res; + + res = kzalloc(sizeof(struct mem_res_group), GFP_ATOMIC); + if (!res) + return NULL; + + res->rgroup = rgroup; + mem_res_init_one(res); + rg_bitmap_shift_index++; + if (is_res_group_root(res->rgroup)) { + res->pg_max_shares = tot_lru_pages; + res->pg_min_shares = tot_lru_pages; + mem_root_res_group = res; + } + spin_lock_irq(&mem_res_group_lock); + list_add(&res->res_list, &mem_res_group_list); + spin_unlock_irq(&mem_res_group_lock); + nr_mem_res_groups++; + + return &res->shares; +} + +static int recalc_shares(int self_shares, int parent_shares, int parent_divisor) +{ + u64 numerator; + + if ((self_shares == SHARE_DONT_CARE) || + (parent_shares == SHARE_DONT_CARE)) + return SHARE_DONT_CARE; + if (parent_divisor == 0) + return 0; + numerator = (u64) self_shares * parent_shares; + do_div(numerator, parent_divisor); + return numerator; +} + +static void recalc_self(struct mem_res_group *res, + struct mem_res_group *parres) +{ + struct res_shares *par = &parres->shares; + struct res_shares *self = &res->shares; + + if (self->max_shares != SHARE_UNSUPPORTED) + res->pg_max_shares = recalc_shares(self->max_shares, + parres->pg_max_shares, + 
par->child_shares_divisor); + + if (self->min_shares != SHARE_UNSUPPORTED) + res->pg_min_shares = recalc_shares(self->min_shares, + parres->pg_min_shares, + par->child_shares_divisor); +} +/* + * Recalculate the min_shares and max_shares in # of pages... and propagate the + * same to children. + * Caller is responsible for protecting integrity of self_shares and + * parent_shares + */ +static void recalc_and_propagate(struct mem_res_group * res, + struct mem_res_group * parres) +{ + struct resource_group *child = NULL; + struct mem_res_group *childres; + + if (parres) + recalc_self(res, parres); + + /* propagate to children */ + spin_lock(&res->rgroup->group_lock); + for_each_child(child, res->rgroup) { + childres = get_mem_rgroup(child); + BUG_ON(!childres); + spin_lock(&childres->cnt_lock); + recalc_and_propagate(childres, res); + spin_unlock(&childres->cnt_lock); + } + spin_unlock(&res->rgroup->group_lock); + return; +} + +static void res_group_migrate_all_pages(struct mem_res_group* from, + struct mem_res_group* dest) +{ + // expensive walk : each page of the group should be updated... + // for each task of the group_from + // mem_move_task(task, from, dest); +} + +static void mem_free_shares_struct(struct res_shares *my_res) +{ + struct mem_res_group *res, *parres; + + res = get_shares_mem_rgroup(my_res); + if (!res) + return; + + if (!is_res_group_root(res->rgroup)) { + parres = get_mem_rgroup(res->rgroup->parent); + res_group_migrate_all_pages(res, parres); + } + + /* + * Making it all zero as freeing of data structure could + * happen later. 
+ */ + res->shares.min_shares = 0; + res->shares.max_shares = 0; + res->pg_max_shares = 0; + res->pg_min_shares = 0; + + spin_lock_irq(&mem_res_group_lock); + list_del_init(&res->res_list); + spin_unlock_irq(&mem_res_group_lock); + + res->rgroup = NULL; + kref_put(&res->nr_users, rg_mem_release); + nr_mem_res_groups--; + return; +} + +static void mem_shares_changed(struct res_shares *my_res) +{ + struct mem_res_group *res, *parres; + struct res_shares *par; + + res = get_shares_mem_rgroup(my_res); + if (!res) + return; + + if (!is_res_group_root(res->rgroup)) { + parres = get_mem_rgroup(res->rgroup->parent); + spin_lock(&parres->cnt_lock); + par = &parres->shares; + } else { + parres = NULL; + par = NULL; + } + spin_lock(&res->cnt_lock); + + recalc_and_propagate(res, parres); + spin_unlock(&res->cnt_lock); + if (!is_res_group_root(res->rgroup)) + spin_unlock(&parres->cnt_lock); +} + +static ssize_t mem_show_stats(struct res_shares *my_res, + char *buf, size_t buf_size) +{ + struct mem_res_group *res; + struct zone *zone; + int active = 0, inactive = 0, fr = 0; + ssize_t i, j = 0; + u64 temp; + + res = get_shares_mem_rgroup(my_res); + if (!res) + return -EINVAL; + + if (res == mem_root_res_group) { + for_each_zone(zone) { + if (!populated_zone(zone)) + continue; + active += zone->nr_active; + inactive += zone->nr_inactive; + fr += zone->free_pages; + } + i = snprintf(buf, buf_size,"%s: System: tot_pages=%d," + " active=%d, inactive=%d, free=%d\n", + res_ctlr_name, tot_lru_pages, active, + inactive, fr); + buf += i; j += i; buf_size -= i; + } + i = snprintf(buf, buf_size, "%s: Current number of pages in use %d\n", + res_ctlr_name, atomic_read(&res->pg_inuse)); + buf += i; j += i; buf_size -= i; + temp = (u64)(res->max_pg_used * res->shares.child_shares_divisor); + do_div(temp, tot_lru_pages); + i = snprintf(buf, buf_size, "%s: Maximum of pages ever used %d (%d%%)\n", + res_ctlr_name, res->max_pg_used, (int)temp); + buf += i; j += i; buf_size -= i; + i = snprintf(buf, 
buf_size, "%s: Maximum of pages with guarantee %d\n", + res_ctlr_name, res->pg_min_shares); + buf += i; j += i; buf_size -= i; + i = snprintf(buf, buf_size, "%s: Maximum of pages at limit %d\n", + res_ctlr_name, res->pg_max_shares); + buf += i; j += i; buf_size -= i; + i = snprintf(buf, buf_size, "%s: Maximum of shrink ever called %d\n", + res_ctlr_name, res->max_shrink_atlimit); + j += i; + + return j; +} + +static int mem_reset_stats(struct res_shares *my_res, const char *cfgstr) +{ + struct mem_res_group *res; + + res = get_shares_mem_rgroup(my_res); + if (!res) + return -EINVAL; + res->max_shrink_atlimit = 0; + res->max_pg_used = 0; + return 0; +} + +static void mem_move_task(struct task_struct *tsk, + struct res_shares *old, struct res_shares *new) +{ + struct mm_struct *mm; + struct task_struct *task = tsk; + struct mem_res_group *oldres, *newres; + + oldres = get_shares_mem_rgroup(old); + if (!oldres) + oldres = get_mem_rgroup(task->real_parent->res_group); + BUG_ON(!oldres); + newres = get_shares_mem_rgroup(new); + if (!task->mm || (newres == oldres)) + return; + + mm = task->active_mm; + // rg_mem_migrate_mm(mm, oldres, newres); + return; +} + +static int set_mem_config_val(int *var, int old_value, const char *val, + struct kernel_param *kp) +{ + int rc = param_set_int(val, kp); + + if (rc < 0) + return rc; + if (*var < 1) { + *var = old_value; + return -EINVAL; + } + return 0; +} + +static int set_shrink_at(const char *val, struct kernel_param *kp) +{ + int prev = shrink_at; + int rc = set_mem_config_val(&shrink_at, prev, val, kp); + if (rc < 0) + return rc; + return 0; +} +module_param_set_call(shrink_at, int, set_shrink_at, S_IRUGO | S_IWUSR); + +static int set_shrink_to(const char *val, struct kernel_param *kp) +{ + int prev = shrink_to; + int rc = set_mem_config_val(&shrink_to, prev, val, kp); + if (rc < 0) + return rc; + return 0; +} +module_param_set_call(shrink_to, int, set_shrink_to, S_IRUGO | S_IWUSR); + +static int set_num_shrinks(const char 
*val, struct kernel_param *kp) +{ + int prev = num_shrinks; + int rc = set_mem_config_val(&num_shrinks, prev, val, kp); + if (rc < 0) + return rc; + return 0; +} +module_param_set_call(num_shrinks, int, set_num_shrinks, S_IRUGO | S_IWUSR); + +static int set_shrink_interval(const char *val, struct kernel_param *kp) +{ + int prev = shrink_interval; + int rc = set_mem_config_val(&shrink_interval, prev, val, kp); + if (rc < 0) + return rc; + return 0; +} +module_param_set_call(shrink_interval, int, set_shrink_interval, + S_IRUGO | S_IWUSR); + + +struct res_controller mem_ctlr = { + .name = res_ctlr_name, + .depth_supported = 1, + .ctlr_id = NO_RES_ID, + .alloc_shares_struct = mem_alloc_shares_struct, + .free_shares_struct = mem_free_shares_struct, + .shares_changed = mem_shares_changed, + .show_stats = mem_show_stats, + .reset_stats = mem_reset_stats, + .move_task = mem_move_task, +}; + +int __init init_mem_res_group(void) +{ + if (mem_ctlr.ctlr_id != NO_RES_ID) + return -EBUSY; /* already registered */ + + set_tot_pages(); + + spin_lock_init(&mem_res_group_lock); + return register_controller(&mem_ctlr); +} + +void __exit exit_mem_res_group(void) +{ + unregister_controller(&mem_ctlr); +} + +module_init(init_mem_res_group) +module_exit(exit_mem_res_group) +MODULE_LICENSE("GPL"); +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+ Patrick Le Dot mailto: [EMAIL PROTECTED]@bull.net Centre UNIX de BULL SAS Phone : +33 4 76 29 73 20 1, Rue de Provence BP 208 Fax : +33 4 76 29 76 00 38130 ECHIROLLES Cedex FRANCE Bull, Architect of an Open World TM www.bull.com ------------------------------------------------------------------------- Take Surveys. Earn Cash. 
Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys -- and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ ckrm-tech mailing list https://lists.sourceforge.net/lists/listinfo/ckrm-tech