The OOM guarantee works like the low limit, but for the OOM killer:
when a victim has to be chosen, tasks living in cgroups whose
memory+swap usage exceeds their guarantee are considered first; tasks
in cgroups still below their guarantee (and below the guarantee of
every ancestor) are only killed if no other suitable victim exists.

The guarantee is read and written via memory.oom_guarantee.
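
For illustration, a minimal userspace sketch of setting the knob; the
mount point /sys/fs/cgroup/memory and the cgroup name "example" are
assumptions made for the example, not something this patch creates:

  #include <stdio.h>

  int main(void)
  {
          /* Protect the first 512M of memory+swap usage of the
           * hypothetical "example" cgroup from the OOM killer. */
          const char *path =
                  "/sys/fs/cgroup/memory/example/memory.oom_guarantee";
          FILE *f = fopen(path, "w");

          if (!f) {
                  perror("fopen");
                  return 1;
          }
          /* Plain bytes work; suffixes like "512M" are also accepted,
           * since the value is parsed with
           * res_counter_memparse_write_strategy(). */
          fprintf(f, "%llu\n", 512ULL << 20);
          fclose(f);
          return 0;
  }

Reading the file back returns the current guarantee in bytes.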

Signed-off-by: Vladimir Davydov <[email protected]>
---
 include/linux/memcontrol.h |    6 +++
 include/linux/oom.h        |    2 +-
 mm/memcontrol.c            |   97 +++++++++++++++++++++++++++++++++++++++++++-
 mm/oom_kill.c              |   14 ++++++-
 4 files changed, 114 insertions(+), 5 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5507be5af34f..1bab6f0e2b38 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -120,6 +120,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
+extern bool mem_cgroup_below_oom_guarantee(struct task_struct *p);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
                                        struct task_struct *p);
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
@@ -342,6 +343,11 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 {
 }
 
+static inline bool mem_cgroup_below_oom_guarantee(struct task_struct *p)
+{
+       return false;
+}
+
 static inline void
 mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
diff --git a/include/linux/oom.h b/include/linux/oom.h
index c13af3feba30..17100d02e8d3 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -67,7 +67,7 @@ extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 
 extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
                unsigned long totalpages, const nodemask_t *nodemask,
-               bool force_kill);
+               bool force_kill, bool ignore_memcg_guarantee);
 
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
                int order, nodemask_t *mask, bool force_kill);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 75add1495418..8e4331340571 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -284,6 +284,8 @@ struct mem_cgroup {
        atomic_long_t mem_failcnt;
        atomic_long_t swap_failcnt;
 
+       unsigned long long oom_guarantee;
+
        /*
         * Should the accounting and control be hierarchical, per subtree?
         */
@@ -1550,6 +1552,51 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
        return true;
 }
 
+static bool __mem_cgroup_below_oom_guarantee(struct mem_cgroup *root,
+                                            struct mem_cgroup *memcg)
+{
+       if (mem_cgroup_disabled())
+               return false;
+
+       if (memcg == root_mem_cgroup)
+               return false;
+
+       if (res_counter_read_u64(&memcg->memsw, RES_USAGE) >=
+                                       memcg->oom_guarantee)
+               return false;
+
+       while (memcg != root) {
+               memcg = parent_mem_cgroup(memcg);
+               if (!memcg)
+                       break;
+
+               if (memcg == root_mem_cgroup)
+                       break;
+
+               if (res_counter_read_u64(&memcg->memsw, RES_USAGE) >=
+                                               memcg->oom_guarantee)
+                       return false;
+       }
+       return true;
+}
+
+bool mem_cgroup_below_oom_guarantee(struct task_struct *p)
+{
+       struct mem_cgroup *memcg = NULL;
+       bool ret = false;
+
+       p = find_lock_task_mm(p);
+       if (p) {
+               memcg = try_get_mem_cgroup_from_mm(p->mm);
+               task_unlock(p);
+       }
+       if (memcg) {
+               ret = __mem_cgroup_below_oom_guarantee(root_mem_cgroup, memcg);
+               css_put(&memcg->css);
+       }
+       return ret;
+}
+
 #define mem_cgroup_from_res_counter(counter, member)   \
        container_of(counter, struct mem_cgroup, member)
 
@@ -1838,6 +1885,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
        unsigned long totalpages;
        unsigned int points = 0;
        struct task_struct *chosen = NULL;
+       bool ignore_memcg_guarantee = false;
 
        /*
         * If current has a pending SIGKILL or is exiting, then automatically
@@ -1851,15 +1899,20 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
        check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
        totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
+retry:
        for_each_mem_cgroup_tree(iter, memcg) {
                struct cgroup *cgroup = iter->css.cgroup;
                struct cgroup_iter it;
                struct task_struct *task;
 
+               if (!ignore_memcg_guarantee &&
+                   __mem_cgroup_below_oom_guarantee(memcg, iter))
+                       continue;
+
                cgroup_iter_start(cgroup, &it);
                while ((task = cgroup_iter_next(cgroup, &it))) {
                        switch (oom_scan_process_thread(task, totalpages, NULL,
-                                                       false)) {
+                                                       false, true)) {
                        case OOM_SCAN_SELECT:
                                if (chosen)
                                        put_task_struct(chosen);
@@ -1890,8 +1943,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                cgroup_iter_end(cgroup, &it);
        }
 
-       if (!chosen)
+       if (!chosen) {
+               if (!ignore_memcg_guarantee) {
+                       ignore_memcg_guarantee = true;
+                       goto retry;
+               }
                return;
+       }
        points = chosen_points * 1000 / totalpages;
        oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
                         NULL, "Memory cgroup out of memory");
@@ -5054,6 +5112,36 @@ static int mem_cgroup_low_write(struct cgroup *cont, struct cftype *cft,
        return 0;
 }
 
+static ssize_t mem_cgroup_oom_guarantee_read(struct cgroup *cont,
+               struct cftype *cft, struct file *file, char __user *buf,
+               size_t nbytes, loff_t *ppos)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+       char str[64];
+       int len;
+
+       len = scnprintf(str, sizeof(str), "%llu\n", memcg->oom_guarantee);
+       return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int mem_cgroup_oom_guarantee_write(struct cgroup *cont,
+               struct cftype *cft, const char *buffer)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+       unsigned long long val;
+       int ret;
+
+       if (mem_cgroup_is_root(memcg))
+               return -EINVAL;
+
+       ret = res_counter_memparse_write_strategy(buffer, &val);
+       if (ret)
+               return ret;
+
+       memcg->oom_guarantee = val;
+       return 0;
+}
+
 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
                unsigned long long *mem_limit, unsigned long long *memsw_limit)
 {
@@ -5956,6 +6044,11 @@ static struct cftype mem_cgroup_files[] = {
                .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
        },
        {
+               .name = "oom_guarantee",
+               .write_string = mem_cgroup_oom_guarantee_write,
+               .read = mem_cgroup_oom_guarantee_read,
+       },
+       {
                .name = "pressure_level",
                .register_event = vmpressure_register_event,
                .unregister_event = vmpressure_unregister_event,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 61c8693215da..a6928b4939cc 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 
 enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
                unsigned long totalpages, const nodemask_t *nodemask,
-               bool force_kill)
+               bool force_kill, bool ignore_memcg_guarantee)
 {
        if (task->exit_state)
                return OOM_SCAN_CONTINUE;
@@ -291,6 +291,10 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
                if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
                        return OOM_SCAN_ABORT;
        }
+
+       if (!ignore_memcg_guarantee && mem_cgroup_below_oom_guarantee(task))
+               return OOM_SCAN_CONTINUE;
+
        return OOM_SCAN_OK;
 }
 
@@ -307,13 +311,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
        struct task_struct *g, *p;
        struct task_struct *chosen = NULL;
        unsigned long chosen_points = 0;
+       bool ignore_memcg_guarantee = false;
 
        rcu_read_lock();
+retry:
        for_each_process_thread(g, p) {
                unsigned int points;
 
                switch (oom_scan_process_thread(p, totalpages, nodemask,
-                                               force_kill)) {
+                                       force_kill, ignore_memcg_guarantee)) {
                case OOM_SCAN_SELECT:
                        chosen = p;
                        chosen_points = ULONG_MAX;
@@ -334,6 +340,10 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
        }
        if (chosen)
                get_task_struct(chosen);
+       else if (!ignore_memcg_guarantee) {
+               ignore_memcg_guarantee = true;
+               goto retry;
+       }
        rcu_read_unlock();
 
        *ppoints = chosen_points * 1000 / totalpages;
-- 
1.7.10.4
