On Thu, 21/05/2015 at 12:50 +0300, Vladimir Davydov wrote:
> OOM guarantee works exactly like the low limit, but for OOM, i.e. tasks
> in cgroups whose usage is above the guarantee are killed first.
> 
> Read/write via memory.oom_guarantee.
> 
> Signed-off-by: Vladimir Davydov <[email protected]>

Reviewed-by: Kirill Tkhai <[email protected]>
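
For anyone who wants to play with the knob, a minimal userspace sketch of
setting it and reading it back (the mount point and the group name "mygroup"
are assumptions, not part of the patch). Writes go through
res_counter_memparse_write_strategy(), so suffixed values like "512M" should
be accepted; reads come back as a plain byte count.

/* Hypothetical usage sketch: set a 512M guarantee and read it back. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/memory/mygroup/memory.oom_guarantee";
	char buf[64];
	ssize_t n;
	int fd;

	fd = open(path, O_WRONLY);
	if (fd < 0 || write(fd, "512M", 4) < 0) {
		perror(path);
		return 1;
	}
	close(fd);

	fd = open(path, O_RDONLY);
	if (fd < 0 || (n = read(fd, buf, sizeof(buf) - 1)) < 0) {
		perror(path);
		return 1;
	}
	buf[n] = '\0';
	printf("oom_guarantee: %s", buf);	/* value in bytes, newline included */
	close(fd);
	return 0;
}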

> ---
>  include/linux/memcontrol.h |    6 +++
>  include/linux/oom.h        |    2 +-
>  mm/memcontrol.c            |   97 +++++++++++++++++++++++++++++++++++++++++++-
>  mm/oom_kill.c              |   14 ++++++-
>  4 files changed, 114 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 5507be5af34f..1bab6f0e2b38 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -120,6 +120,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
>  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
>  unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
>  void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
> +extern bool mem_cgroup_below_oom_guarantee(struct task_struct *p);
>  extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
>                                       struct task_struct *p);
>  extern void mem_cgroup_replace_page_cache(struct page *oldpage,
> @@ -342,6 +343,11 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
>  {
>  }
>  
> +static inline bool mem_cgroup_below_oom_guarantee(struct task_struct *p)
> +{
> +     return false;
> +}
> +
>  static inline void
>  mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
>  {
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index c13af3feba30..17100d02e8d3 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -67,7 +67,7 @@ extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
>  
>  extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
>               unsigned long totalpages, const nodemask_t *nodemask,
> -             bool force_kill);
> +             bool force_kill, bool ignore_memcg_guarantee);
>  
>  extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
>               int order, nodemask_t *mask, bool force_kill);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 75add1495418..8e4331340571 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -284,6 +284,8 @@ struct mem_cgroup {
>       atomic_long_t mem_failcnt;
>       atomic_long_t swap_failcnt;
>  
> +     unsigned long long oom_guarantee;
> +
>       /*
>        * Should the accounting and control be hierarchical, per subtree?
>        */
> @@ -1550,6 +1552,51 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
>       return true;
>  }
>  
> +static bool __mem_cgroup_below_oom_guarantee(struct mem_cgroup *root,
> +                                          struct mem_cgroup *memcg)
> +{
> +     if (mem_cgroup_disabled())
> +             return false;
> +
> +     if (memcg == root_mem_cgroup)
> +             return false;
> +
> +     if (res_counter_read_u64(&memcg->memsw, RES_USAGE) >=
> +                                     memcg->oom_guarantee)
> +             return false;
> +
> +     while (memcg != root) {
> +             memcg = parent_mem_cgroup(memcg);
> +             if (!memcg)
> +                     break;
> +
> +             if (memcg == root_mem_cgroup)
> +                     break;
> +
> +             if (res_counter_read_u64(&memcg->memsw, RES_USAGE) >=
> +                                             memcg->oom_guarantee)
> +                     return false;
> +     }
> +     return true;
> +}
> +
> +bool mem_cgroup_below_oom_guarantee(struct task_struct *p)
> +{
> +     struct mem_cgroup *memcg = NULL;
> +     bool ret = false;
> +
> +     p = find_lock_task_mm(p);
> +     if (p) {
> +             memcg = try_get_mem_cgroup_from_mm(p->mm);
> +             task_unlock(p);
> +     }
> +     if (memcg) {
> +             ret = __mem_cgroup_below_oom_guarantee(root_mem_cgroup, memcg);
> +             css_put(&memcg->css);
> +     }
> +     return ret;
> +}
> +
>  #define mem_cgroup_from_res_counter(counter, member) \
>       container_of(counter, struct mem_cgroup, member)
>  
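
To spell out the hierarchical semantics for other readers: a task stays
protected only while its own memcg and every ancestor up to the OOM root are
below their guarantees, and the global root is never protected. A toy
userspace model of the check above (illustrative only, not the kernel code):

/* Toy model of __mem_cgroup_below_oom_guarantee(); illustrative only. */
#include <stdbool.h>
#include <stdio.h>

struct group {
	const char *name;
	unsigned long long usage;	/* memory+swap usage */
	unsigned long long guarantee;
	struct group *parent;		/* NULL for the global root */
};

static bool below_oom_guarantee(struct group *root, struct group *g)
{
	if (!g->parent)		/* tasks in the global root are fair game */
		return false;
	for (; g; g = g->parent) {
		if (!g->parent)	/* the global root has no guarantee */
			break;
		if (g->usage >= g->guarantee)
			return false;
		if (g == root)	/* don't look above the OOM root */
			break;
	}
	return true;
}

int main(void)
{
	struct group root = { "root", 0, 0, NULL };
	struct group parent = { "parent", 900, 1000, &root };
	struct group child = { "child", 300, 512, &parent };

	printf("%d\n", below_oom_guarantee(&root, &child));	/* 1: protected */
	parent.usage = 1100;	/* the parent overruns its guarantee */
	printf("%d\n", below_oom_guarantee(&root, &child));	/* 0: killable */
	return 0;
}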
> @@ -1838,6 +1885,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>       unsigned long totalpages;
>       unsigned int points = 0;
>       struct task_struct *chosen = NULL;
> +     bool ignore_memcg_guarantee = false;
>  
>       /*
>        * If current has a pending SIGKILL or is exiting, then automatically
> @@ -1851,15 +1899,20 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  
>       check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
>       totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
> +retry:
>       for_each_mem_cgroup_tree(iter, memcg) {
>               struct cgroup *cgroup = iter->css.cgroup;
>               struct cgroup_iter it;
>               struct task_struct *task;
>  
> +             if (!ignore_memcg_guarantee &&
> +                 __mem_cgroup_below_oom_guarantee(memcg, iter))
> +                     continue;
> +
>               cgroup_iter_start(cgroup, &it);
>               while ((task = cgroup_iter_next(cgroup, &it))) {
>                       switch (oom_scan_process_thread(task, totalpages, NULL,
> -                                                     false)) {
> +                                                     false, true)) {
>                       case OOM_SCAN_SELECT:
>                               if (chosen)
>                                       put_task_struct(chosen);
> @@ -1890,8 +1943,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>               cgroup_iter_end(cgroup, &it);
>       }
>  
> -     if (!chosen)
> +     if (!chosen) {
> +             if (!ignore_memcg_guarantee) {
> +                     ignore_memcg_guarantee = true;
> +                     goto retry;
> +             }
>               return;
> +     }
>       points = chosen_points * 1000 / totalpages;
>       oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
>                        NULL, "Memory cgroup out of memory");
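
The retry looks right: if every candidate sits under a guarantee, the second
pass drops the protection, so the OOM killer can never come away without a
victim. The same two-pass shape reappears in select_bad_process() below.
A stripped-down model of the pattern (illustrative only):

/* Two-pass victim selection: honour guarantees, then ignore them. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct task {
	const char *name;
	unsigned int badness;	/* higher means preferred victim */
	bool below_guarantee;	/* protected by memory.oom_guarantee */
};

static const struct task *pick_victim(const struct task *tasks, size_t n)
{
	const struct task *chosen = NULL;
	bool ignore_memcg_guarantee = false;
	size_t i;
retry:
	for (i = 0; i < n; i++) {
		if (!ignore_memcg_guarantee && tasks[i].below_guarantee)
			continue;
		if (!chosen || tasks[i].badness > chosen->badness)
			chosen = &tasks[i];
	}
	if (!chosen && !ignore_memcg_guarantee) {
		ignore_memcg_guarantee = true;	/* everyone was protected */
		goto retry;
	}
	return chosen;
}

int main(void)
{
	const struct task tasks[] = {
		{ "a", 10, true },
		{ "b",  5, true },
	};

	/* Both tasks are protected, so only the second pass picks "a". */
	printf("%s\n", pick_victim(tasks, 2)->name);
	return 0;
}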
> @@ -5054,6 +5112,36 @@ static int mem_cgroup_low_write(struct cgroup *cont, struct cftype *cft,
>       return 0;
>  }
>  
> +static ssize_t mem_cgroup_oom_guarantee_read(struct cgroup *cont,
> +             struct cftype *cft, struct file *file, char __user *buf,
> +             size_t nbytes, loff_t *ppos)
> +{
> +     struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
> +     char str[64];
> +     int len;
> +
> +     len = scnprintf(str, sizeof(str), "%llu\n", memcg->oom_guarantee);
> +     return simple_read_from_buffer(buf, nbytes, ppos, str, len);
> +}
> +
> +static int mem_cgroup_oom_guarantee_write(struct cgroup *cont,
> +             struct cftype *cft, const char *buffer)
> +{
> +     struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
> +     unsigned long long val;
> +     int ret;
> +
> +     if (mem_cgroup_is_root(memcg))
> +             return -EINVAL;
> +
> +     ret = res_counter_memparse_write_strategy(buffer, &val);
> +     if (ret)
> +             return ret;
> +
> +     memcg->oom_guarantee = val;
> +     return 0;
> +}
> +
>  static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
>               unsigned long long *mem_limit, unsigned long long *memsw_limit)
>  {
> @@ -5956,6 +6044,11 @@ static struct cftype mem_cgroup_files[] = {
>               .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
>       },
>       {
> +             .name = "oom_guarantee",
> +             .write_string = mem_cgroup_oom_guarantee_write,
> +             .read = mem_cgroup_oom_guarantee_read,
> +     },
> +     {
>               .name = "pressure_level",
>               .register_event = vmpressure_register_event,
>               .unregister_event = vmpressure_unregister_event,
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 61c8693215da..a6928b4939cc 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
>  
>  enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
>               unsigned long totalpages, const nodemask_t *nodemask,
> -             bool force_kill)
> +             bool force_kill, bool ignore_memcg_guarantee)
>  {
>       if (task->exit_state)
>               return OOM_SCAN_CONTINUE;
> @@ -291,6 +291,10 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
>               if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
>                       return OOM_SCAN_ABORT;
>       }
> +
> +     if (!ignore_memcg_guarantee && mem_cgroup_below_oom_guarantee(task))
> +             return OOM_SCAN_CONTINUE;
> +
>       return OOM_SCAN_OK;
>  }
>  
> @@ -307,13 +311,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
>       struct task_struct *g, *p;
>       struct task_struct *chosen = NULL;
>       unsigned long chosen_points = 0;
> +     bool ignore_memcg_guarantee = false;
>  
>       rcu_read_lock();
> +retry:
>       for_each_process_thread(g, p) {
>               unsigned int points;
>  
>               switch (oom_scan_process_thread(p, totalpages, nodemask,
> -                                             force_kill)) {
> +                                     force_kill, ignore_memcg_guarantee)) {
>               case OOM_SCAN_SELECT:
>                       chosen = p;
>                       chosen_points = ULONG_MAX;
> @@ -334,6 +340,10 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
>       }
>       if (chosen)
>               get_task_struct(chosen);
> +     else if (!ignore_memcg_guarantee) {
> +             ignore_memcg_guarantee = true;
> +             goto retry;
> +     }
>       rcu_read_unlock();
>  
>       *ppoints = chosen_points * 1000 / totalpages;

