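The logic behind the OOM berserker is the same as in PCS6: if processes are killed by the OOM killer too often (i.e. a new kill happens within sysctl vm.oom_relaxation of the end of the previous OOM, 1 sec by default), we increase "rage" (min -10, max 20); otherwise "rage" is reset to its minimum. Whenever "rage" >= 0, in addition to the regular victim we kill up to 1 << "rage" of the youngest processes that are about as bad as that victim.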
https://jira.sw.ru/browse/PSBM-17930 Signed-off-by: Vladimir Davydov <[email protected]> --- include/linux/memcontrol.h | 20 ++++++ include/linux/oom.h | 3 + kernel/sysctl.c | 7 ++ mm/memcontrol.c | 11 ---- mm/oom_kill.c | 160 ++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 189 insertions(+), 12 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0c856425ab7a..76a7dc8192fb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -78,6 +78,19 @@ extern void mem_cgroup_uncharge_cache_page(struct page *page); bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, struct mem_cgroup *memcg); + +static inline +bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, + struct mem_cgroup *memcg) +{ + bool ret; + + rcu_read_lock(); + ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); + rcu_read_unlock(); + return ret; +} + int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); @@ -286,6 +299,13 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm return NULL; } +static inline +bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, + struct mem_cgroup *memcg) +{ + return true; +} + static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { diff --git a/include/linux/oom.h b/include/linux/oom.h index 4e12187663ed..445f6242ec9e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -35,7 +35,9 @@ struct oom_context { struct task_struct *owner; struct task_struct *victim; unsigned long oom_start; + unsigned long oom_end; unsigned long overdraft; + int rage; wait_queue_head_t waitq; }; @@ -125,4 +127,5 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p); extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; extern int sysctl_panic_on_oom; +extern int 
sysctl_oom_relaxation; #endif /* _INCLUDE_LINUX_OOM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 976f48c09748..9c081e3f350f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1184,6 +1184,13 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "oom_relaxation", + .data = &sysctl_oom_relaxation, + .maxlen = sizeof(sysctl_oom_relaxation), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 41fb41b16664..2b87dbc5c0cd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1474,17 +1474,6 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, return css_is_ancestor(&memcg->css, &root_memcg->css); } -static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, - struct mem_cgroup *memcg) -{ - bool ret; - - rcu_read_lock(); - ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); - rcu_read_unlock(); - return ret; -} - int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) { int ret; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4290d6665429..9c990571713e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -42,13 +42,18 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks; +int sysctl_oom_relaxation = HZ; static DEFINE_SPINLOCK(oom_context_lock); #define OOM_TIMEOUT (5 * HZ) +#define OOM_BASE_RAGE -10 +#define OOM_MAX_RAGE 20 + #ifndef CONFIG_MEMCG struct oom_context oom_ctx = { + .rage = OOM_BASE_RAGE, .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(oom_ctx.waitq), }; #endif @@ -58,6 +63,8 @@ void init_oom_context(struct oom_context *ctx) ctx->owner = NULL; ctx->victim = NULL; ctx->oom_start = 0; + ctx->oom_end = 0; + ctx->rage = OOM_BASE_RAGE; init_waitqueue_head(&ctx->waitq); } @@ -485,6 +492,7 @@ void mark_oom_victim(struct task_struct *tsk) */ void 
exit_oom_victim(void) { + unsigned long now = jiffies; struct mem_cgroup *iter; struct oom_context *ctx; @@ -499,13 +507,143 @@ void exit_oom_victim(void) ctx = mem_cgroup_oom_context(iter); if (ctx->victim == current) { ctx->victim = NULL; - if (!ctx->owner) + if (!ctx->owner) { + ctx->oom_end = now; wake_up_all(&ctx->waitq); + } } } while ((iter = mem_cgroup_iter(NULL, iter, NULL))); spin_unlock(&oom_context_lock); } +static void oom_berserker(struct task_struct *victim, unsigned long totalpages, + struct mem_cgroup *root_memcg, nodemask_t *nodemask) +{ + static DEFINE_RATELIMIT_STATE(berserker_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + unsigned long now = jiffies; + struct mem_cgroup *memcg, *memcg_to_put = NULL, *target_memcg = NULL; + struct oom_context *ctx; + struct task_struct *p; + unsigned long victim_overdraft; + unsigned long victim_points; + int rage = -1; + int killed = 0; + + victim_points = oom_badness(victim, root_memcg, nodemask, totalpages, + &victim_overdraft); + + /* + * Get the victim cgroup. + */ + p = find_lock_task_mm(victim); + if (p) { + memcg = try_get_mem_cgroup_from_mm(p->mm); + task_unlock(p); + /* + * The victim could have been moved to another cgroup + * since it was selected. If so, assume it belonged to + * root_memcg. + */ + if (root_memcg && + !mem_cgroup_same_or_subtree(root_memcg, memcg)) { + mem_cgroup_put(memcg); + memcg = root_memcg; + } else + memcg_to_put = memcg; + } else { + /* + * The victim has already freed its memory, so we can't + * get its score and hence should not start berserker, + * because the latter relies on it. + */ + return; + } + + /* + * Update berserker rage on each oom kill. Select oom context + * with the maximal positive rage if any. 
+ */ + spin_lock(&oom_context_lock); + while (1) { + ctx = mem_cgroup_oom_context(memcg); + if (time_after(now, ctx->oom_end + sysctl_oom_relaxation)) + ctx->rage = OOM_BASE_RAGE; + else if (ctx->rage < OOM_MAX_RAGE) + ctx->rage++; + if (ctx->rage >= rage) { + target_memcg = memcg; + rage = ctx->rage; + } + if (memcg == root_memcg) + break; + memcg = parent_mem_cgroup(memcg); + /* + * Break the loop if there is no parent (i.e. we've just + * done with the root cgroup). Needed for the system + * wide oom case (root_memcg equals NULL). + */ + if (!memcg) + break; + } + spin_unlock(&oom_context_lock); + + if (rage < 0) + goto out; + + /* + * So, we are in rage. Kill (1 << rage) youngest tasks that are + * as bad as the victim. + */ + read_lock(&tasklist_lock); + list_for_each_entry_reverse(p, &init_task.tasks, tasks) { + unsigned long overdraft; + unsigned long points; + + if (p == victim || !p->mm || + fatal_signal_pending(p) || p->flags & PF_EXITING || + oom_unkillable_task(p, target_memcg, nodemask)) + continue; + + points = oom_badness(p, target_memcg, nodemask, totalpages, + &overdraft); + if (overdraft < victim_overdraft) + continue; + + /* + * Consider tasks as equally bad if their score values + * (basically mem+swap usage in pages) differ by less + * than 1/4th. 
+ */ + if (overdraft == victim_overdraft && points < victim_points && + 4 * (victim_points - points) >= victim_points) + continue; + + /* Normalize score for reporting */ + points = points * 1000 / totalpages; + + if (__ratelimit(&berserker_rs)) { + task_lock(p); + pr_err("Rage kill process %d (%s) score %lu\n", + task_pid_nr(p), p->comm, points); + task_unlock(p); + } + + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); + mem_cgroup_note_oom_kill(target_memcg, p); + + if (++killed >= 1 << rage) + break; + } + read_unlock(&tasklist_lock); + + pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed); +out: + if (memcg_to_put) + mem_cgroup_put(memcg_to_put); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon @@ -617,6 +755,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, } rcu_read_unlock(); + oom_berserker(victim, totalpages, memcg, nodemask); + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); mem_cgroup_note_oom_kill(memcg, victim); put_task_struct(victim); @@ -699,6 +839,7 @@ bool oom_trylock(struct mem_cgroup *memcg) show_stack(p, NULL); ctx->owner = ctx->victim = NULL; + ctx->oom_end = now; wake_up_all(&ctx->waitq); } } while ((iter = mem_cgroup_iter(memcg, iter, NULL))); @@ -734,6 +875,7 @@ void oom_unlock(struct mem_cgroup *memcg) unsigned long now = jiffies; unsigned long timeout = 0; struct mem_cgroup *iter, *victim_memcg = NULL; + struct task_struct *victim = NULL; struct oom_context *ctx; DEFINE_WAIT(wait); @@ -752,6 +894,7 @@ void oom_unlock(struct mem_cgroup *memcg) * It's our responsibility to wake up blocked * processes then. */ + ctx->oom_end = now; wake_up_all(&ctx->waitq); continue; } @@ -766,6 +909,8 @@ void oom_unlock(struct mem_cgroup *memcg) timeout = ctx->oom_start + OOM_TIMEOUT - now; BUG_ON(timeout == 0); + victim = ctx->victim; + /* * Remember victim memcg so that we can wait for victim * to exit below. 
@@ -776,6 +921,19 @@ void oom_unlock(struct mem_cgroup *memcg) prepare_to_wait(&ctx->waitq, &wait, TASK_KILLABLE); } while ((iter = mem_cgroup_iter(memcg, iter, NULL))); + /* + * Propagate victim up to the context that initiated oom for + * oom_end to be updated in all relevant contexts when the + * victim exits (see exit_oom_victim). + */ + for (iter = victim_memcg; iter; iter = parent_mem_cgroup(iter)) { + ctx = mem_cgroup_oom_context(iter); + if (!ctx->victim) + ctx->victim = victim; + if (iter == memcg) + break; + } + spin_unlock(&oom_context_lock); if (timeout > 0) { -- 2.1.4 _______________________________________________ Devel mailing list [email protected] https://lists.openvz.org/mailman/listinfo/devel
