From: Vladimir Davydov <[email protected]>

Patchset description: oom enhancements - part 2

 - Patches 1-2 prepare memcg for upcoming changes in oom design.
 - Patch 3 reworks oom locking design so that the executioner waits for
   the victim to exit. This is necessary to increase the oom kill rate,
   which is essential for berserker mode.
 - Patch 4 introduces oom timeout.
   https://jira.sw.ru/browse/PSBM-38581
 - Patch 5 drops unused OOM_SCAN_ABORT
 - Patch 6 makes oom fairer when it comes to selecting a victim among
   different containers.
   https://jira.sw.ru/browse/PSBM-37915
 - Patch 7 prepares oom for introducing berserker mode
 - Patch 8 resurrects oom berserker mode, which is supposed to cope with
   actively forking processes.
   https://jira.sw.ru/browse/PSBM-17930

https://jira.sw.ru/browse/PSBM-26973

Changes in v3:
 - rework oom_trylock (patch 3)
 - select exiting process instead of aborting oom scan so as not to keep
   busy-waiting for an exiting process to exit (patches 3, 4)
 - cleanup oom timeout handling + fix stuck process trace dumped
   multiple times on timeout (patch 5)
 - set max_overdraft to ULONG_MAX on selected processes (patch 6)
 - rework oom berserker process selection logic (patches 7, 8)

Changes in v2:
 - s/time_after/time_after_eq to avoid BUG_ON in oom_trylock (patch 4)
 - propagate victim to the context that initiated oom in oom_unlock
   (patch 6)
 - always set oom_end on releasing oom context (patch 6)

Vladimir Davydov (8):
  memcg: add mem_cgroup_get/put helpers
  memcg: add lock for protecting memcg->oom_notify list
  oom: rework locking design
  oom: introduce oom timeout
  oom: drop OOM_SCAN_ABORT
  oom: rework logic behind memory.oom_guarantee
  oom: pass points and overdraft to oom_kill_process
  oom: resurrect berserker mode

Reviewed-by: Kirill Tkhai <[email protected]>

=========================================
This patch description:

Currently, we won't select a new oom victim until the previous one has
passed away. This might lead to a deadlock if an allocating task holds a
lock needed by the victim to complete. To cope with this problem, this
patch introduces an oom timeout, after which a new task will be selected
even if the previous victim hasn't died. The timeout is hard-coded to
5 seconds.

https://jira.sw.ru/browse/PSBM-38581

Signed-off-by: Vladimir Davydov <[email protected]>
---
 include/linux/oom.h |  2 ++
 mm/oom_kill.c       | 60 ++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index e19385dd29aa..f804551c0a5d 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -34,6 +34,8 @@ enum oom_scan_t {
 struct oom_context {
        struct task_struct *owner;
        struct task_struct *victim;
+       bool marked;
+       unsigned long oom_start;
        wait_queue_head_t waitq;
 };
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e50621b39f39..fd6defa7c6de 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -45,6 +45,8 @@ int sysctl_oom_dump_tasks;
 
 static DEFINE_SPINLOCK(oom_context_lock);
 
+#define OOM_TIMEOUT    (5 * HZ)
+
 #ifndef CONFIG_MEMCG
 struct oom_context oom_ctx = {
        .waitq          = __WAIT_QUEUE_HEAD_INITIALIZER(oom_ctx.waitq),
@@ -55,6 +57,8 @@ void init_oom_context(struct oom_context *ctx)
 {
        ctx->owner = NULL;
        ctx->victim = NULL;
+       ctx->marked = false;
+       ctx->oom_start = 0;
        init_waitqueue_head(&ctx->waitq);
 }
 
@@ -62,6 +66,7 @@ static void __release_oom_context(struct oom_context *ctx)
 {
        ctx->owner = NULL;
        ctx->victim = NULL;
+       ctx->marked = false;
        wake_up_all(&ctx->waitq);
 }
 
@@ -291,11 +296,14 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 
        /*
         * This task already has access to memory reserves and is being killed.
-        * Don't allow any other task to have access to the reserves.
+        * Try to select another one.
+        *
+        * This can only happen if oom_trylock timeout-ed, which most probably
+        * means that the victim had dead-locked.
         */
        if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
                if (!force_kill)
-                       return OOM_SCAN_ABORT;
+                       return OOM_SCAN_CONTINUE;
        }
        if (!task->mm)
                return OOM_SCAN_CONTINUE;
@@ -463,8 +471,10 @@ void mark_oom_victim(struct task_struct *tsk)
        memcg = try_get_mem_cgroup_from_mm(tsk->mm);
        ctx = mem_cgroup_oom_context(memcg);
        spin_lock(&oom_context_lock);
-       if (!ctx->victim)
+       if (!ctx->victim) {
                ctx->victim = tsk;
+               ctx->marked = true;
+       }
        spin_unlock(&oom_context_lock);
        mem_cgroup_put(memcg);
 }
@@ -499,21 +509,26 @@ void exit_oom_victim(void)
 
 static void __wait_oom_context(struct oom_context *ctx)
 {
+       unsigned long now = jiffies;
+       unsigned long timeout;
        DEFINE_WAIT(wait);
 
-       if (ctx->victim == current) {
+       if (ctx->victim == current ||
+           time_after_eq(now, ctx->oom_start + OOM_TIMEOUT)) {
                spin_unlock(&oom_context_lock);
                return;
        }
 
        prepare_to_wait(&ctx->waitq, &wait, TASK_KILLABLE);
+       timeout = ctx->oom_start + OOM_TIMEOUT - now;
        spin_unlock(&oom_context_lock);
-       schedule();
+       schedule_timeout(timeout);
        finish_wait(&ctx->waitq, &wait);
 }
 
 bool oom_trylock(struct mem_cgroup *memcg)
 {
+       unsigned long now = jiffies;
        struct mem_cgroup *iter;
        struct oom_context *ctx;
 
@@ -528,10 +543,32 @@ bool oom_trylock(struct mem_cgroup *memcg)
        iter = mem_cgroup_iter(memcg, NULL, NULL);
        do {
                ctx = mem_cgroup_oom_context(iter);
-               if (ctx->owner || ctx->victim) {
+               if ((ctx->owner || ctx->victim) &&
+                   time_before(now, ctx->oom_start + OOM_TIMEOUT)) {
                        __wait_oom_context(ctx);
                        mem_cgroup_iter_break(memcg, iter);
                        return false;
+               } else if (ctx->owner || ctx->victim) {
+                       /*
+                        * Timeout. Release the context and dump stack
+                        * trace of the stuck process.
+                        *
+                        * To avoid dumping stack trace of the same task
+                        * more than once, we mark the context that
+                        * contained the victim when it was killed (see
+                        * mark_oom_victim).
+                        */
+                       struct task_struct *p = ctx->victim;
+
+                       if (p && ctx->marked) {
+                               task_lock(p);
+                               pr_err("OOM kill timeout: %d (%s)\n",
+                                      task_pid_nr(p), p->comm);
+                               task_unlock(p);
+                               show_stack(p, NULL);
+                       }
+
+                       __release_oom_context(ctx);
                }
        } while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
 
@@ -544,6 +581,7 @@ bool oom_trylock(struct mem_cgroup *memcg)
                BUG_ON(ctx->owner);
                BUG_ON(ctx->victim);
                ctx->owner = current;
+               ctx->oom_start = now;
        } while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
 
        spin_unlock(&oom_context_lock);
@@ -565,7 +603,11 @@ void oom_unlock(struct mem_cgroup *memcg)
        iter = mem_cgroup_iter(memcg, NULL, NULL);
        do {
                ctx = mem_cgroup_oom_context(iter);
-               BUG_ON(ctx->owner != current);
+               if (ctx->owner != current) {
+                       /* Lost ownership on timeout */
+                       mem_cgroup_iter_break(memcg, iter);
+                       break;
+               }
                if (ctx->victim) {
                        victim = ctx->victim;
                        /*
@@ -598,7 +640,9 @@ void oom_unlock(struct mem_cgroup *memcg)
        iter = mem_cgroup_iter(memcg, NULL, NULL);
        do {
                ctx = mem_cgroup_oom_context(iter);
-               BUG_ON(ctx->owner != current);
+               if (ctx->owner != current)
+                       /* Lost ownership on timeout */
+                       continue;
                if (!ctx->victim)
                        /*
                         * Victim already exited or nobody was killed in
-- 
2.1.4

_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to