В Чт, 21/05/2015 в 12:50 +0300, Vladimir Davydov пишет:
> OOM guarantee works exactly like low limit, but for OOM, i.e. tasks
> inside cgroups above the limit are killed first.
>
> Read/write via memory.oom_guarantee.
>
> Signed-off-by: Vladimir Davydov <[email protected]>
Reviewed-by: Kirill Tkhai <[email protected]> > --- > include/linux/memcontrol.h | 6 +++ > include/linux/oom.h | 2 +- > mm/memcontrol.c | 97 > +++++++++++++++++++++++++++++++++++++++++++- > mm/oom_kill.c | 14 ++++++- > 4 files changed, 114 insertions(+), 5 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 5507be5af34f..1bab6f0e2b38 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -120,6 +120,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct > mem_cgroup *memcg); > int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); > unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); > void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); > +extern bool mem_cgroup_below_oom_guarantee(struct task_struct *p); > extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, > struct task_struct *p); > extern void mem_cgroup_replace_page_cache(struct page *oldpage, > @@ -342,6 +343,11 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum > lru_list lru, > { > } > > +static inline bool mem_cgroup_below_oom_guarantee(struct task_struct *p) > +{ > + return false; > +} > + > static inline void > mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) > { > diff --git a/include/linux/oom.h b/include/linux/oom.h > index c13af3feba30..17100d02e8d3 100644 > --- a/include/linux/oom.h > +++ b/include/linux/oom.h > @@ -67,7 +67,7 @@ extern void check_panic_on_oom(enum oom_constraint > constraint, gfp_t gfp_mask, > > extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, > unsigned long totalpages, const nodemask_t *nodemask, > - bool force_kill); > + bool force_kill, bool ignore_memcg_guarantee); > > extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, > int order, nodemask_t *mask, bool force_kill); > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 75add1495418..8e4331340571 100644 > --- 
a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -284,6 +284,8 @@ struct mem_cgroup { > atomic_long_t mem_failcnt; > atomic_long_t swap_failcnt; > > + unsigned long long oom_guarantee; > + > /* > * Should the accounting and control be hierarchical, per subtree? > */ > @@ -1550,6 +1552,51 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct > mem_cgroup *memcg) > return true; > } > > +static bool __mem_cgroup_below_oom_guarantee(struct mem_cgroup *root, > + struct mem_cgroup *memcg) > +{ > + if (mem_cgroup_disabled()) > + return false; > + > + if (memcg == root_mem_cgroup) > + return false; > + > + if (res_counter_read_u64(&memcg->memsw, RES_USAGE) >= > + memcg->oom_guarantee) > + return false; > + > + while (memcg != root) { > + memcg = parent_mem_cgroup(memcg); > + if (!memcg) > + break; > + > + if (memcg == root_mem_cgroup) > + break; > + > + if (res_counter_read_u64(&memcg->memsw, RES_USAGE) >= > + memcg->oom_guarantee) > + return false; > + } > + return true; > +} > + > +bool mem_cgroup_below_oom_guarantee(struct task_struct *p) > +{ > + struct mem_cgroup *memcg = NULL; > + bool ret = false; > + > + p = find_lock_task_mm(p); > + if (p) { > + memcg = try_get_mem_cgroup_from_mm(p->mm); > + task_unlock(p); > + } > + if (memcg) { > + ret = __mem_cgroup_below_oom_guarantee(root_mem_cgroup, memcg); > + css_put(&memcg->css); > + } > + return ret; > +} > + > #define mem_cgroup_from_res_counter(counter, member) \ > container_of(counter, struct mem_cgroup, member) > > @@ -1838,6 +1885,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup > *memcg, gfp_t gfp_mask, > unsigned long totalpages; > unsigned int points = 0; > struct task_struct *chosen = NULL; > + bool ignore_memcg_guarantee = false; > > /* > * If current has a pending SIGKILL or is exiting, then automatically > @@ -1851,15 +1899,20 @@ static void mem_cgroup_out_of_memory(struct > mem_cgroup *memcg, gfp_t gfp_mask, > > check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); > totalpages = 
mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; > +retry: > for_each_mem_cgroup_tree(iter, memcg) { > struct cgroup *cgroup = iter->css.cgroup; > struct cgroup_iter it; > struct task_struct *task; > > + if (!ignore_memcg_guarantee && > + __mem_cgroup_below_oom_guarantee(memcg, iter)) > + continue; > + > cgroup_iter_start(cgroup, &it); > while ((task = cgroup_iter_next(cgroup, &it))) { > switch (oom_scan_process_thread(task, totalpages, NULL, > - false)) { > + false, true)) { > case OOM_SCAN_SELECT: > if (chosen) > put_task_struct(chosen); > @@ -1890,8 +1943,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup > *memcg, gfp_t gfp_mask, > cgroup_iter_end(cgroup, &it); > } > > - if (!chosen) > + if (!chosen) { > + if (!ignore_memcg_guarantee) { > + ignore_memcg_guarantee = true; > + goto retry; > + } > return; > + } > points = chosen_points * 1000 / totalpages; > oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, > NULL, "Memory cgroup out of memory"); > @@ -5054,6 +5112,36 @@ static int mem_cgroup_low_write(struct cgroup *cont, > struct cftype *cft, > return 0; > } > > +static ssize_t mem_cgroup_oom_guarantee_read(struct cgroup *cont, > + struct cftype *cft, struct file *file, char __user *buf, > + size_t nbytes, loff_t *ppos) > +{ > + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); > + char str[64]; > + int len; > + > + len = scnprintf(str, sizeof(str), "%llu\n", memcg->oom_guarantee); > + return simple_read_from_buffer(buf, nbytes, ppos, str, len); > +} > + > +static int mem_cgroup_oom_guarantee_write(struct cgroup *cont, > + struct cftype *cft, const char *buffer) > +{ > + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); > + unsigned long long val; > + int ret; > + > + if (mem_cgroup_is_root(memcg)) > + return -EINVAL; > + > + ret = res_counter_memparse_write_strategy(buffer, &val); > + if (ret) > + return ret; > + > + memcg->oom_guarantee = val; > + return 0; > +} > + > static void memcg_get_hierarchical_limit(struct 
mem_cgroup *memcg, > unsigned long long *mem_limit, unsigned long long *memsw_limit) > { > @@ -5956,6 +6044,11 @@ static struct cftype mem_cgroup_files[] = { > .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), > }, > { > + .name = "oom_guarantee", > + .write_string = mem_cgroup_oom_guarantee_write, > + .read = mem_cgroup_oom_guarantee_read, > + }, > + { > .name = "pressure_level", > .register_event = vmpressure_register_event, > .unregister_event = vmpressure_unregister_event, > diff --git a/mm/oom_kill.c b/mm/oom_kill.c > index 61c8693215da..a6928b4939cc 100644 > --- a/mm/oom_kill.c > +++ b/mm/oom_kill.c > @@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct > zonelist *zonelist, > > enum oom_scan_t oom_scan_process_thread(struct task_struct *task, > unsigned long totalpages, const nodemask_t *nodemask, > - bool force_kill) > + bool force_kill, bool ignore_memcg_guarantee) > { > if (task->exit_state) > return OOM_SCAN_CONTINUE; > @@ -291,6 +291,10 @@ enum oom_scan_t oom_scan_process_thread(struct > task_struct *task, > if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) > return OOM_SCAN_ABORT; > } > + > + if (!ignore_memcg_guarantee && mem_cgroup_below_oom_guarantee(task)) > + return OOM_SCAN_CONTINUE; > + > return OOM_SCAN_OK; > } > > @@ -307,13 +311,15 @@ static struct task_struct *select_bad_process(unsigned > int *ppoints, > struct task_struct *g, *p; > struct task_struct *chosen = NULL; > unsigned long chosen_points = 0; > + bool ignore_memcg_guarantee = false; > > rcu_read_lock(); > +retry: > for_each_process_thread(g, p) { > unsigned int points; > > switch (oom_scan_process_thread(p, totalpages, nodemask, > - force_kill)) { > + force_kill, ignore_memcg_guarantee)) { > case OOM_SCAN_SELECT: > chosen = p; > chosen_points = ULONG_MAX; > @@ -334,6 +340,10 @@ static struct task_struct *select_bad_process(unsigned > int *ppoints, > } > if (chosen) > get_task_struct(chosen); > + else if (!ignore_memcg_guarantee) { > + ignore_memcg_guarantee = 
true; > + goto retry; > + } > rcu_read_unlock(); > > *ppoints = chosen_points * 1000 / totalpages; _______________________________________________ Devel mailing list [email protected] https://lists.openvz.org/mailman/listinfo/devel
