Michal Hocko wrote:
> OK, could you play with the patch/idea suggested in
> http://lkml.kernel.org/r/[email protected]?

I think we don't need to worry about mmap_sem dependency inside __mmput().
Since the OOM killer checks for !MMF_OOM_SKIP mm rather than TIF_MEMDIE thread,
we can keep the OOM killer disabled until we set MMF_OOM_SKIP to the victim's mm.
That is, elevating mm_users throughout the reaping procedure does not cause
premature victim selection, even after TIF_MEMDIE is cleared from the victim's
thread. Then, we don't need to use down_write()/up_write() for non OOM victim's mm
(nearly 100% of exit_mmap() calls), and can force partial reaping of OOM victim's mm
(nearly 0% of exit_mmap() calls) before __mmput() starts doing exit_aio() etc.
Patch is shown below. Only compile tested.

 include/linux/sched/coredump.h |  1 +
 mm/oom_kill.c                  | 80 ++++++++++++++++++++----------------------
 2 files changed, 40 insertions(+), 41 deletions(-)

diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 98ae0d0..6b6237b 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -62,6 +62,7 @@ static inline int get_dumpable(struct mm_struct *mm)
  * on NFS restore
  */
 //#define MMF_EXE_FILE_CHANGED 18      /* see prctl_set_mm_exe_file() */
+#define MMF_OOM_REAPING                18      /* mm is supposed to be reaped */
 
 #define MMF_HAS_UPROBES                19      /* has uprobes */
 #define MMF_RECALC_UPROBES     20      /* MMF_HAS_UPROBES can be wrong */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0e2c925..bdcf658 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -470,38 +470,9 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
        struct mmu_gather tlb;
        struct vm_area_struct *vma;
-       bool ret = true;
-
-       /*
-        * We have to make sure to not race with the victim exit path
-        * and cause premature new oom victim selection:
-        * __oom_reap_task_mm           exit_mm
-        *   mmget_not_zero
-        *                                mmput
-        *                                  atomic_dec_and_test
-        *                                exit_oom_victim
-        *                              [...]
-        *                              out_of_memory
-        *                                select_bad_process
-        *                                  # no TIF_MEMDIE task selects new victim
-        *  unmap_page_range # frees some memory
-        */
-       mutex_lock(&oom_lock);
 
-       if (!down_read_trylock(&mm->mmap_sem)) {
-               ret = false;
-               goto unlock_oom;
-       }
-
-       /*
-        * increase mm_users only after we know we will reap something so
-        * that the mmput_async is called only when we have reaped something
-        * and delayed __mmput doesn't matter that much
-        */
-       if (!mmget_not_zero(mm)) {
-               up_read(&mm->mmap_sem);
-               goto unlock_oom;
-       }
+       if (!down_read_trylock(&mm->mmap_sem))
+               return false;
 
        /*
         * Tell all users of get_user/copy_from_user etc... that the content
@@ -537,16 +508,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
                        K(get_mm_counter(mm, MM_FILEPAGES)),
                        K(get_mm_counter(mm, MM_SHMEMPAGES)));
        up_read(&mm->mmap_sem);
-
-       /*
-        * Drop our reference but make sure the mmput slow path is called from a
-        * different context because we shouldn't risk we get stuck there and
-        * put the oom_reaper out of the way.
-        */
-       mmput_async(mm);
-unlock_oom:
-       mutex_unlock(&oom_lock);
-       return ret;
+       return true;
 }
 
 #define MAX_OOM_REAP_RETRIES 10
@@ -573,9 +535,32 @@ static void oom_reap_task(struct task_struct *tsk)
        /*
         * Hide this mm from OOM killer because it has been either reaped or
         * somebody can't call up_write(mmap_sem).
+        *
+        * Serialize setting of MMF_OOM_SKIP using oom_lock in order to
+        * avoid race with select_bad_process() which causes premature
+        * new oom victim selection.
+        *
+        * The OOM reaper:           An allocating task:
+        *                             Failed get_page_from_freelist().
+        *                             Enters into out_of_memory().
+        *   Reaped memory enough to make get_page_from_freelist() succeed.
+        *   Sets MMF_OOM_SKIP to mm.
+        *                               Enters into select_bad_process().
+        *                                 # MMF_OOM_SKIP mm selects new victim.
         */
+       mutex_lock(&oom_lock);
        set_bit(MMF_OOM_SKIP, &mm->flags);
+       mutex_unlock(&oom_lock);
 
+       /*
+        * Drop our reference but make sure the mmput slow path is called from a
+        * different context because we shouldn't risk we get stuck there and
+        * put the oom_reaper out of the way.
+        */
+       if (test_bit(MMF_OOM_REAPING, &mm->flags)) {
+               clear_bit(MMF_OOM_REAPING, &mm->flags);
+               mmput_async(mm);
+       }
        /* Drop a reference taken by wake_oom_reaper */
        put_task_struct(tsk);
 }
@@ -658,6 +643,13 @@ static void mark_oom_victim(struct task_struct *tsk)
        if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
                mmgrab(tsk->signal->oom_mm);
 
+#ifdef CONFIG_MMU
+       if (!test_bit(MMF_OOM_REAPING, &mm->flags)) {
+               set_bit(MMF_OOM_REAPING, &mm->flags);
+               mmget(mm);
+       }
+#endif
+
        /*
         * Make sure that the task is woken up from uninterruptible sleep
         * if it is frozen because OOM killer wouldn't be able to free
@@ -913,6 +905,12 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
                if (is_global_init(p)) {
                        can_oom_reap = false;
                        set_bit(MMF_OOM_SKIP, &mm->flags);
+#ifdef CONFIG_MMU
+                       if (test_bit(MMF_OOM_REAPING, &mm->flags)) {
+                               clear_bit(MMF_OOM_REAPING, &mm->flags);
+                               mmput_async(mm);
+                       }
+#endif
                        pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
                                        task_pid_nr(victim), victim->comm,
                                        task_pid_nr(p), p->comm);

Reply via email to