The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after ark-5.14 ------> commit 5caeccd890d55834d65664760ba56cf3b66fe971 Author: Vladimir Davydov <vdavydov....@gmail.com> Date: Fri Sep 24 15:04:56 2021 +0300
oom: resurrect berserker mode Feature: oom: berserker mode The logic behind the OOM berserker is the same as in PCS6: if processes are killed by oom killer too often (< sysctl vm.oom_relaxation, 1 sec by default), we increase "rage" (min -10, max 20) and kill 1 << "rage" youngest worst processes if "rage" >= 0. https://jira.sw.ru/browse/PSBM-17930 Signed-off-by: Vladimir Davydov <vdavy...@parallels.com> [aryabinin: vz8 rebase] Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com> +++ oom: Restore vm.oom_relaxation sysctl 'Berserker mode' is used if the OOM killer has to act too often: if several tasks are killed within 'oom_relaxation' interval, additional tasks will be killed. In VZ7, 'vm.oom_relaxation' sysctl defined that value, but it is missing in VZ8. Restore it, because the default value (1000 jiffies, 1 sec) might be too small, for example, for slow or highly loaded machines. Done in the scope of https://jira.sw.ru/browse/PSBM-131983. Signed-off-by: Evgenii Shatokhin <eshatok...@virtuozzo.com> +++ oom: Initialize oom_rage_lock spinlock Lockdep complained about the uninitialized oom_rage_lock spinlock (splat omitted from this archive). Done in the scope of https://jira.sw.ru/browse/PSBM-131983. 
Signed-off-by: Evgenii Shatokhin <eshatok...@virtuozzo.com> (cherry picked from vz8 commit 300a06439b2754e3486a00b68c2753e6c27a8f16) Signed-off-by: Andrey Zhadchenko <andrey.zhadche...@virtuozzo.com> --- include/linux/memcontrol.h | 12 +++++ include/linux/oom.h | 6 +++ kernel/sysctl.c | 7 +++ mm/memcontrol.c | 27 ++++++++++++ mm/oom_kill.c | 107 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 159 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 84bacc521142..b1feb6a36da0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -278,6 +278,11 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; + int oom_rage; + spinlock_t oom_rage_lock; + unsigned long prev_oom_time; + unsigned long oom_time; + /* memory.events and memory.events.local */ struct cgroup_file events_file; struct cgroup_file events_local_file; @@ -771,6 +776,7 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page) struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); +bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct lruvec *lock_page_lruvec(struct page *page); @@ -1267,6 +1273,12 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return true; } +static inline bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg) +{ + return true; +} + static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { return NULL; diff --git a/include/linux/oom.h b/include/linux/oom.h index 3f3f23a785fc..3b31f4256aab 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -22,6 +22,10 @@ enum oom_constraint { CONSTRAINT_MEMCG, }; + +#define OOM_BASE_RAGE -10 +#define OOM_MAX_RAGE 20 + /* * Details of the page allocation that triggered the oom killer that are used to * determine what should be killed. 
@@ -51,6 +55,7 @@ struct oom_control { unsigned long totalpages; struct task_struct *chosen; long chosen_points; + unsigned long overdraft; /* Used to print the constraint info. */ enum oom_constraint constraint; @@ -144,4 +149,5 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p); extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; extern int sysctl_panic_on_oom; +extern int sysctl_oom_relaxation; #endif /* _INCLUDE_LINUX_OOM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5824d5dd2e1d..081e42171745 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2740,6 +2740,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "oom_relaxation", + .data = &sysctl_oom_relaxation, + .maxlen = sizeof(sysctl_oom_relaxation), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, { .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3f56f33cf6df..6d882c660c21 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1294,6 +1294,32 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, *lru_size += nr_pages; } +bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) +{ + struct mem_cgroup *task_memcg; + struct task_struct *p; + bool ret; + + p = find_lock_task_mm(task); + if (p) { + task_memcg = get_mem_cgroup_from_mm(p->mm); + task_unlock(p); + } else { + /* + * All threads may have already detached their mm's, but the oom + * killer still needs to detect if they have already been oom + * killed to prevent needlessly killing additional tasks. 
+ */ + rcu_read_lock(); + task_memcg = mem_cgroup_from_task(task); + css_get(&task_memcg->css); + rcu_read_unlock(); + } + ret = mem_cgroup_is_descendant(task_memcg, memcg); + css_put(&task_memcg->css); + return ret; +} + #ifdef CONFIG_CLEANCACHE bool mem_cgroup_cleancache_disabled(struct page *page) { @@ -5326,6 +5352,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; INIT_WORK(&memcg->high_work, high_work_func); + spin_lock_init(&memcg->oom_rage_lock); INIT_LIST_HEAD(&memcg->oom_notify); mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0bff802b1887..60c371c655af 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -55,6 +55,7 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks; +int sysctl_oom_relaxation = HZ; /* * Serializes oom killer invocations (out_of_memory()) from all contexts to @@ -969,6 +970,111 @@ static int oom_kill_memcg_member(struct task_struct *task, void *message) return 0; } +/* + * Kill more processes if oom happens too often in this context. + */ +static void oom_berserker(struct oom_control *oc) +{ + static DEFINE_RATELIMIT_STATE(berserker_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct task_struct *p; + struct mem_cgroup *memcg; + unsigned long now = jiffies; + int rage; + int killed = 0; + + memcg = oc->memcg ?: root_mem_cgroup; + + spin_lock(&memcg->oom_rage_lock); + memcg->prev_oom_time = memcg->oom_time; + memcg->oom_time = now; + /* + * Increase rage if oom happened recently in this context, reset + * rage otherwise. 
+ * + * previous oom this oom (unfinished) + * +++++++++----------------------------++++++++ + * ^ ^ + * prev_oom_time <<oom_relaxation>> oom_time + */ + if (time_after(now, memcg->prev_oom_time + sysctl_oom_relaxation)) + memcg->oom_rage = OOM_BASE_RAGE; + else if (memcg->oom_rage < OOM_MAX_RAGE) + memcg->oom_rage++; + rage = memcg->oom_rage; + spin_unlock(&memcg->oom_rage_lock); + + if (rage < 0) + return; + + /* + * So, we are in rage. Kill (1 << rage) youngest tasks that are + * as bad as the victim. + */ + read_lock(&tasklist_lock); + list_for_each_entry_reverse(p, &init_task.tasks, tasks) { + unsigned long tsk_points; + unsigned long tsk_overdraft; + + if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) || + fatal_signal_pending(p) || p->flags & PF_EXITING || + oom_unkillable_task(p)) + continue; + + /* + * When mem_cgroup_out_of_memory() and + * p is not member of the group. + */ + if (oc->memcg && !task_in_mem_cgroup(p, oc->memcg)) + continue; + + /* p may not have freeable memory in nodemask */ + if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc)) + continue; + + tsk_points = oom_badness(p, oc->totalpages, &tsk_overdraft); + if (tsk_overdraft < oc->overdraft) + continue; + + /* + * oom_badness never returns a negative value, even if + * oom_score_adj would make badness so, instead it + * returns 1. So we do not kill task with badness 1 if + * the victim has badness > 1 so as not to risk killing + * protected tasks. + */ + if (tsk_points <= 1 && oc->chosen_points > 1) + continue; + + /* + * Consider tasks as equally bad if they have equal + * normalized scores. 
+ */ + if (tsk_points * 1000 / oc->totalpages < + oc->chosen_points * 1000 / oc->totalpages) + continue; + + if (__ratelimit(&berserker_rs)) { + task_lock(p); + pr_err("Rage kill process %d (%s)\n", + task_pid_nr(p), p->comm); + task_unlock(p); + } + + count_vm_event(OOM_KILL); + memcg_memory_event(memcg, MEMCG_OOM_KILL); + + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID); + + if (++killed >= 1 << rage) + break; + } + read_unlock(&tasklist_lock); + + pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed); +} + static void oom_kill_process(struct oom_control *oc, const char *message) { struct task_struct *victim = oc->chosen; @@ -1012,6 +1118,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) (void *)message); mem_cgroup_put(oom_group); } + oom_berserker(oc); } /* _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel