Now that each mem cgroup on the system has a memory.oom_policy tunable to
specify oom kill selection behavior, remove the needless "groupoom" mount
option that requires (1) the entire system to be forced, perhaps
unnecessarily, perhaps unexpectedly, into a single oom policy that
differs from the traditional per process selection, and (2) a remount to
change.

Instead of enabling the cgroup aware oom killer with the "groupoom" mount
option, set the mem cgroup subtree's memory.oom_policy to "cgroup".

Signed-off-by: David Rientjes <rient...@google.com>
---
 Documentation/cgroup-v2.txt | 43 +++++++++++++++++++++----------------------
 include/linux/cgroup-defs.h |  5 -----
 include/linux/memcontrol.h  |  5 +++++
 kernel/cgroup/cgroup.c      | 13 +------------
 mm/memcontrol.c             | 17 ++++++++---------
 5 files changed, 35 insertions(+), 48 deletions(-)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1074,6 +1074,10 @@ PAGE_SIZE multiple when read back.
        victim; that is, it will choose the single process with the largest
        memory footprint.
 
+       If "cgroup", the OOM killer will compare mem cgroups as indivisible
+       memory consumers; that is, it will compare mem cgroup usage rather
+       than process memory footprint.  See the "OOM Killer" section.
+
   memory.events
        A read-only flat-keyed file which exists on non-root cgroups.
        The following entries are defined.  Unless specified
@@ -1280,37 +1284,32 @@ belonging to the affected files to ensure correct memory ownership.
 OOM Killer
 ~~~~~~~~~~
 
-Cgroup v2 memory controller implements a cgroup-aware OOM killer.
-It means that it treats cgroups as first class OOM entities.
+Cgroup v2 memory controller implements an optional cgroup-aware out of
+memory killer, which treats cgroups as indivisible OOM entities.
 
-Cgroup-aware OOM logic is turned off by default and requires
-passing the "groupoom" option on mounting cgroupfs. It can also
-by remounting cgroupfs with the following command::
+This policy is controlled by memory.oom_policy.  When a memory cgroup is
+out of memory, its memory.oom_policy will dictate how the OOM killer will
+select a process, or cgroup, to kill.  Likewise, when the system is OOM,
+the policy is dictated by the root mem cgroup.
 
-  # mount -o remount,groupoom $MOUNT_POINT
+There are currently two available oom policies:
 
-Under OOM conditions the memory controller tries to make the best
-choice of a victim, looking for a memory cgroup with the largest
-memory footprint, considering leaf cgroups and cgroups with the
-memory.oom_group option set, which are considered to be an indivisible
-memory consumers.
+ - "none": default, choose the largest single memory hogging process to
+   oom kill, as the OOM killer has traditionally done.
 
-By default, OOM killer will kill the biggest task in the selected
-memory cgroup. A user can change this behavior by enabling
-the per-cgroup memory.oom_group option. If set, it causes
-the OOM killer to kill all processes attached to the cgroup,
-except processes with oom_score_adj set to -1000.
+ - "cgroup": choose the cgroup with the largest memory footprint from the
+   subtree as an OOM victim and kill at least one process, depending on
+   memory.oom_group, from it.
 
-This affects both system- and cgroup-wide OOMs. For a cgroup-wide OOM
-the memory controller considers only cgroups belonging to the sub-tree
-of the OOM'ing cgroup.
+When a cgroup is selected as a victim, the OOM killer will, by default, kill
+the process with the largest memory footprint in that cgroup.  A user can
+change this by enabling the per-cgroup memory.oom_group option.  If set, it
+causes the OOM killer to kill all processes attached to the cgroup, except
+processes with /proc/pid/oom_score_adj set to -1000 (oom disabled).
 
 The root cgroup is treated as a leaf memory cgroup, so it's compared
 with other leaf memory cgroups and cgroups with oom_group option set.
 
-If there are no cgroups with the enabled memory controller,
-the OOM killer is using the "traditional" process-based approach.
-
 Please, note that memory charges are not migrating if tasks
 are moved between different memory cgroups. Moving tasks with
 significant memory footprint may affect OOM victim selection logic.
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -81,11 +81,6 @@ enum {
         * Enable cpuset controller in v1 cgroup to use v2 behavior.
         */
        CGRP_ROOT_CPUSET_V2_MODE = (1 << 4),
-
-       /*
-        * Enable cgroup-aware OOM killer.
-        */
-       CGRP_GROUP_OOM = (1 << 5),
 };
 
 /* cftype->flags */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -64,6 +64,11 @@ enum memcg_oom_policy {
         * oom_badness()
         */
        MEMCG_OOM_POLICY_NONE,
+       /*
+        * Local cgroup usage is used to select a target cgroup, treating each
+        * mem cgroup as an indivisible consumer
+        */
+       MEMCG_OOM_POLICY_CGROUP,
 };
 
 struct mem_cgroup_reclaim_cookie {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1732,9 +1732,6 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
                if (!strcmp(token, "nsdelegate")) {
                        *root_flags |= CGRP_ROOT_NS_DELEGATE;
                        continue;
-               } else if (!strcmp(token, "groupoom")) {
-                       *root_flags |= CGRP_GROUP_OOM;
-                       continue;
                }
 
                pr_err("cgroup2: unknown option \"%s\"\n", token);
@@ -1751,11 +1748,6 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
                        cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
                else
                        cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
-
-               if (root_flags & CGRP_GROUP_OOM)
-                       cgrp_dfl_root.flags |= CGRP_GROUP_OOM;
-               else
-                       cgrp_dfl_root.flags &= ~CGRP_GROUP_OOM;
        }
 }
 
@@ -1763,8 +1755,6 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
 {
        if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
                seq_puts(seq, ",nsdelegate");
-       if (cgrp_dfl_root.flags & CGRP_GROUP_OOM)
-               seq_puts(seq, ",groupoom");
        return 0;
 }
 
@@ -5922,8 +5912,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
 static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
                             char *buf)
 {
-       return snprintf(buf, PAGE_SIZE, "nsdelegate\n"
-                                       "groupoom\n");
+       return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
 }
 static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2798,14 +2798,14 @@ bool mem_cgroup_select_oom_victim(struct oom_control *oc)
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return false;
 
-       if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
-               return false;
-
        if (oc->memcg)
                root = oc->memcg;
        else
                root = root_mem_cgroup;
 
+       if (root->oom_policy != MEMCG_OOM_POLICY_CGROUP)
+               return false;
+
        select_victim_memcg(root, oc);
 
        return oc->chosen_memcg;
@@ -5412,9 +5412,6 @@ static int memory_oom_group_show(struct seq_file *m, void *v)
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
        bool oom_group = memcg->oom_group;
 
-       if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
-               return -ENOTSUPP;
-
        seq_printf(m, "%d\n", oom_group);
 
        return 0;
@@ -5428,9 +5425,6 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
        int oom_group;
        int err;
 
-       if (!(cgrp_dfl_root.flags & CGRP_GROUP_OOM))
-               return -ENOTSUPP;
-
        err = kstrtoint(strstrip(buf), 0, &oom_group);
        if (err)
                return err;
@@ -5541,6 +5535,9 @@ static int memory_oom_policy_show(struct seq_file *m, void *v)
        enum memcg_oom_policy policy = READ_ONCE(memcg->oom_policy);
 
        switch (policy) {
+       case MEMCG_OOM_POLICY_CGROUP:
+               seq_puts(m, "cgroup\n");
+               break;
        case MEMCG_OOM_POLICY_NONE:
        default:
                seq_puts(m, "none\n");
@@ -5557,6 +5554,8 @@ static ssize_t memory_oom_policy_write(struct kernfs_open_file *of,
        buf = strstrip(buf);
        if (!memcmp("none", buf, min(sizeof("none")-1, nbytes)))
                memcg->oom_policy = MEMCG_OOM_POLICY_NONE;
+       else if (!memcmp("cgroup", buf, min(sizeof("cgroup")-1, nbytes)))
+               memcg->oom_policy = MEMCG_OOM_POLICY_CGROUP;
        else
                ret = -EINVAL;
 
--
To unsubscribe from this list: send the line "unsubscribe linux-doc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to