Currently, we won't select a new oom victim until the previous one has
passed away. This might lead to a deadlock if an allocating task holds a
lock needed by the victim to complete. To cope with this problem, this
patch introduced oom timeout, after which a new task will be selected
even if the previous victim hasn't died. The timeout is hard-coded,
equals 5 seconds.

https://jira.sw.ru/browse/PSBM-38581

Signed-off-by: Vladimir Davydov <[email protected]>
---
 include/linux/oom.h |  1 +
 mm/oom_kill.c       | 48 +++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index e19385dd29aa..caeda102e7ba 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -34,6 +34,7 @@ enum oom_scan_t {
 struct oom_context {
        struct task_struct *owner;
        struct task_struct *victim;
+       unsigned long oom_start;
        wait_queue_head_t waitq;
 };
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c9265092825d..bf101f9242ff 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -45,6 +45,8 @@ int sysctl_oom_dump_tasks;
 
 static DEFINE_SPINLOCK(oom_context_lock);
 
+#define OOM_TIMEOUT    (5 * HZ)
+
 #ifndef CONFIG_MEMCG
 struct oom_context oom_ctx = {
        .waitq          = __WAIT_QUEUE_HEAD_INITIALIZER(oom_ctx.waitq),
@@ -55,6 +57,7 @@ void init_oom_context(struct oom_context *ctx)
 {
        ctx->owner = NULL;
        ctx->victim = NULL;
+       ctx->oom_start = 0;
        init_waitqueue_head(&ctx->waitq);
 }
 
@@ -286,11 +289,14 @@ enum oom_scan_t oom_scan_process_thread(struct 
task_struct *task,
 
        /*
         * This task already has access to memory reserves and is being killed.
-        * Don't allow any other task to have access to the reserves.
+        * Try to select another one.
+        *
+        * This can only happen if oom_trylock timeout-ed, which most probably
+        * means that the victim had dead-locked.
         */
        if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
                if (!force_kill)
-                       return OOM_SCAN_ABORT;
+                       return OOM_SCAN_CONTINUE;
        }
        if (!task->mm)
                return OOM_SCAN_CONTINUE;
@@ -645,6 +651,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
 bool oom_trylock(struct mem_cgroup *memcg)
 {
+       unsigned long now = jiffies;
        struct mem_cgroup *iter;
        struct oom_context *ctx;
        DEFINE_WAIT(wait);
@@ -660,14 +667,29 @@ bool oom_trylock(struct mem_cgroup *memcg)
        iter = mem_cgroup_iter(memcg, NULL, NULL);
        do {
                ctx = mem_cgroup_oom_context(iter);
-               if (ctx->owner || ctx->victim) {
+               if ((ctx->owner || ctx->victim) &&
+                   time_before(now, ctx->oom_start + OOM_TIMEOUT)) {
                        prepare_to_wait(&ctx->waitq, &wait,
                                        TASK_KILLABLE);
                        spin_unlock(&oom_context_lock);
-                       schedule();
+                       schedule_timeout(ctx->oom_start + OOM_TIMEOUT - now);
                        finish_wait(&ctx->waitq, &wait);
                        mem_cgroup_iter_break(memcg, iter);
                        return false;
+               } else if (ctx->owner || ctx->victim) {
+                       /*
+                        * Timeout. Release the context.
+                        */
+                       struct task_struct *p = ctx->victim ?: ctx->owner;
+
+                       task_lock(p);
+                       pr_err("OOM kill timeout: %d (%s)\n",
+                              task_pid_nr(p), p->comm);
+                       task_unlock(p);
+                       show_stack(p, NULL);
+
+                       ctx->owner = ctx->victim = NULL;
+                       wake_up_all(&ctx->waitq);
                }
        } while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
 
@@ -680,6 +702,7 @@ bool oom_trylock(struct mem_cgroup *memcg)
                BUG_ON(ctx->owner);
                BUG_ON(ctx->victim);
                ctx->owner = current;
+               ctx->oom_start = now;
        } while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
 
        spin_unlock(&oom_context_lock);
@@ -689,9 +712,10 @@ bool oom_trylock(struct mem_cgroup *memcg)
 
 void oom_unlock(struct mem_cgroup *memcg)
 {
+       unsigned long now = jiffies;
+       unsigned long timeout = 0;
        struct mem_cgroup *iter, *victim_memcg = NULL;
        struct oom_context *ctx;
-       bool need_to_wait = false;
        DEFINE_WAIT(wait);
 
        spin_lock(&oom_context_lock);
@@ -714,8 +738,15 @@ void oom_unlock(struct mem_cgroup *memcg)
                }
 
                /* Already waiting? */
-               if (need_to_wait)
+               if (timeout > 0)
                        continue;
+               /* Expired? */
+               if (time_after(now, ctx->oom_start + OOM_TIMEOUT))
+                       continue;
+
+               timeout = ctx->oom_start + OOM_TIMEOUT - now;
+               BUG_ON(timeout == 0);
+
                /*
                 * Remember victim memcg so that we can wait for victim
                 * to exit below.
@@ -724,13 +755,12 @@ void oom_unlock(struct mem_cgroup *memcg)
                mem_cgroup_get(iter);
 
                prepare_to_wait(&ctx->waitq, &wait, TASK_KILLABLE);
-               need_to_wait = true;
        } while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
 
        spin_unlock(&oom_context_lock);
 
-       if (need_to_wait) {
-               schedule();
+       if (timeout > 0) {
+               schedule_timeout(timeout);
                ctx = mem_cgroup_oom_context(victim_memcg);
                finish_wait(&ctx->waitq, &wait);
                mem_cgroup_put(victim_memcg);
-- 
2.1.4

_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to