This is a serious cause of mmap_sem contention.  MAP_POPULATE
and MCL_FUTURE, in particular, are disastrous in multithreaded programs.

Signed-off-by: Andy Lutomirski <l...@amacapital.net>
---

Changes from v1:

The non-unlocking versions of do_mmap_pgoff and mmap_region are still
available for aio_setup_ring's benefit.  In theory, aio_setup_ring
would do better with a lock-downgrading version, but that would be
somewhat ugly and doesn't help my workload.

 arch/tile/mm/elf.c |  9 +++---
 fs/aio.c           |  4 +++
 include/linux/mm.h | 19 ++++++++++--
 ipc/shm.c          |  6 ++--
 mm/fremap.c        | 10 ++++--
 mm/mmap.c          | 89 ++++++++++++++++++++++++++++++++++++++++++++++++------
 mm/util.c          |  3 +-
 7 files changed, 117 insertions(+), 23 deletions(-)

diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 3cfa98b..a0441f2 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -129,12 +129,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
         */
        if (!retval) {
                unsigned long addr = MEM_USER_INTRPT;
-               addr = mmap_region(NULL, addr, INTRPT_SIZE,
-                                  MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
-                                  VM_READ|VM_EXEC|
-                                  VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
+               addr = mmap_region_unlock(NULL, addr, INTRPT_SIZE,
+                                         MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
+                                         VM_READ|VM_EXEC|
+                                         VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
                if (addr > (unsigned long) -PAGE_SIZE)
                        retval = (int) addr;
+               return retval;  /* We already unlocked mmap_sem. */
        }
 #endif
 
diff --git a/fs/aio.c b/fs/aio.c
index 71f613c..253396c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -127,6 +127,10 @@ static int aio_setup_ring(struct kioctx *ctx)
        info->mmap_size = nr_pages * PAGE_SIZE;
        dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
        down_write(&ctx->mm->mmap_sem);
+       /*
+        * XXX: If MCL_FUTURE is set, this will hold mmap_sem for write for
+        *      longer than necessary.
+        */
        info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, 
                                        PROT_READ|PROT_WRITE,
                                        MAP_ANONYMOUS|MAP_PRIVATE, 0);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bcaab4e..139f636 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1441,14 +1441,27 @@ extern int install_special_mapping(struct mm_struct *mm,
 
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 
+/* These must be called with mmap_sem held for write. */
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long len, unsigned long flags,
        vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap_pgoff(struct file *, unsigned long,
-        unsigned long, unsigned long,
-        unsigned long, unsigned long);
+extern unsigned long do_mmap_pgoff(struct file *, unsigned long addr,
+       unsigned long len, unsigned long prot,
+       unsigned long flags, unsigned long pgoff);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
+/*
+ * These must be called with mmap_sem held for write, and they will release
+ * mmap_sem before they return.  They hold mmap_sem for a shorter time than
+ * the non-unlocking variants.
+ */
+extern unsigned long mmap_region_unlock(struct file *file, unsigned long addr,
+       unsigned long len, unsigned long flags,
+       vm_flags_t vm_flags, unsigned long pgoff);
+extern unsigned long do_mmap_pgoff_unlock(struct file *, unsigned long addr,
+       unsigned long len, unsigned long prot,
+       unsigned long flags, unsigned long pgoff);
+
 /* These take the mm semaphore themselves */
 extern unsigned long vm_brk(unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
diff --git a/ipc/shm.c b/ipc/shm.c
index dff40c9..d0001c8 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1068,12 +1068,14 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
                    addr > current->mm->start_stack - size - PAGE_SIZE * 5)
                        goto invalid;
        }
-               
-       user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0);
+
+       user_addr = do_mmap_pgoff_unlock(file, addr, size, prot, flags, 0);
        *raddr = user_addr;
        err = 0;
        if (IS_ERR_VALUE(user_addr))
                err = (long)user_addr;
+       goto out_fput;
+
 invalid:
        up_write(&current->mm->mmap_sem);
 
diff --git a/mm/fremap.c b/mm/fremap.c
index a0aaf0e..7ebe0a4 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -200,8 +200,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                        struct file *file = get_file(vma->vm_file);
 
                        flags &= MAP_NONBLOCK;
-                       addr = mmap_region(file, start, size,
-                                       flags, vma->vm_flags, pgoff);
+                       addr = mmap_region_unlock(file, start, size,
+                                                 flags, vma->vm_flags, pgoff);
                        fput(file);
                        if (IS_ERR_VALUE(addr)) {
                                err = addr;
@@ -209,7 +209,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                                BUG_ON(addr != start);
                                err = 0;
                        }
-                       goto out;
+                       return err;  /* We already unlocked. */
                }
                mutex_lock(&mapping->i_mmap_mutex);
                flush_dcache_mmap_lock(mapping);
@@ -237,6 +237,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                        /*
                         * might be mapping previously unmapped range of file
                         */
+                       if (unlikely(has_write_lock)) {
+                               downgrade_write(&mm->mmap_sem);
+                               has_write_lock = 0;
+                       }
                        mlock_vma_pages_range(vma, start, start + size);
                } else {
                        if (unlikely(has_write_lock)) {
diff --git a/mm/mmap.c b/mm/mmap.c
index 9a796c4..2dd6b2f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -995,13 +995,22 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
        return hint;
 }
 
+static unsigned long mmap_region_helper(struct file *file, unsigned long addr,
+                                       unsigned long len, unsigned long flags,
+                                       vm_flags_t vm_flags,
+                                        unsigned long pgoff, int *downgraded);
+
 /*
  * The caller must hold down_write(&current->mm->mmap_sem).
+ *
+ * If downgraded is non-null, do_mmap_pgoff_helper may downgrade mmap_sem
+ * and write 1 to *downgraded.
  */
-
-unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
-                       unsigned long len, unsigned long prot,
-                       unsigned long flags, unsigned long pgoff)
+static unsigned long do_mmap_pgoff_helper(
+       struct file *file, unsigned long addr,
+       unsigned long len, unsigned long prot,
+       unsigned long flags, unsigned long pgoff,
+       int *downgraded)
 {
        struct mm_struct * mm = current->mm;
        struct inode *inode;
@@ -1127,7 +1136,32 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                }
        }
 
-       return mmap_region(file, addr, len, flags, vm_flags, pgoff);
+       return mmap_region_helper(file, addr, len, flags, vm_flags, pgoff,
+                                 downgraded);
+}
+
+
+unsigned long do_mmap_pgoff_unlock(struct file *file, unsigned long addr,
+                                  unsigned long len, unsigned long prot,
+                                  unsigned long flags, unsigned long pgoff)
+{
+       int downgraded = 0;
+       unsigned long ret = do_mmap_pgoff_helper(file, addr, len,
+               prot, flags, pgoff, &downgraded);
+
+       if (downgraded)
+               up_read(&current->mm->mmap_sem);
+       else
+               up_write(&current->mm->mmap_sem);
+
+       return ret;
+}
+
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+                           unsigned long len, unsigned long prot,
+                           unsigned long flags, unsigned long pgoff)
+{
+       return do_mmap_pgoff_helper(file, addr, len, prot, flags, pgoff, 0);
 }
 
 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
@@ -1240,9 +1274,14 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
        return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
 }
 
-unsigned long mmap_region(struct file *file, unsigned long addr,
-                         unsigned long len, unsigned long flags,
-                         vm_flags_t vm_flags, unsigned long pgoff)
+/*
+ * If downgraded is null, then mmap_sem won't be touched.  Otherwise it
+ * may be downgraded, in which case *downgraded will be set to 1.
+ */
+static unsigned long mmap_region_helper(struct file *file, unsigned long addr,
+                                       unsigned long len, unsigned long flags,
+                                       vm_flags_t vm_flags,
+                                       unsigned long pgoff, int *downgraded)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
@@ -1373,10 +1412,19 @@ out:
 
        vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
        if (vm_flags & VM_LOCKED) {
+               if (downgraded) {
+                       downgrade_write(&mm->mmap_sem);
+                       *downgraded = 1;
+               }
                if (!mlock_vma_pages_range(vma, addr, addr + len))
                        mm->locked_vm += (len >> PAGE_SHIFT);
-       } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
+       } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) {
+               if (downgraded) {
+                       downgrade_write(&mm->mmap_sem);
+                       *downgraded = 1;
+               }
                make_pages_present(addr, addr + len);
+       }
 
        if (file)
                uprobe_mmap(vma);
@@ -1400,6 +1448,29 @@ unacct_error:
        return error;
 }
 
+unsigned long mmap_region(struct file *file, unsigned long addr,
+                         unsigned long len, unsigned long flags,
+                         vm_flags_t vm_flags, unsigned long pgoff)
+{
+       return mmap_region_helper(file, addr, len, flags, vm_flags, pgoff, 0);
+}
+
+unsigned long mmap_region_unlock(struct file *file, unsigned long addr,
+                                unsigned long len, unsigned long flags,
+                                vm_flags_t vm_flags, unsigned long pgoff)
+{
+       int downgraded = 0;
+       unsigned long ret = mmap_region_helper(file, addr, len,
+               flags, vm_flags, pgoff, &downgraded);
+
+       if (downgraded)
+               up_read(&current->mm->mmap_sem);
+       else
+               up_write(&current->mm->mmap_sem);
+
+       return ret;
+}
+
 /* Get an address range which is currently unmapped.
  * For shmat() with addr=0.
  *
diff --git a/mm/util.c b/mm/util.c
index dc3036c..fd54884 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -359,8 +359,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
        ret = security_mmap_file(file, prot, flag);
        if (!ret) {
                down_write(&mm->mmap_sem);
-               ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
-               up_write(&mm->mmap_sem);
+               ret = do_mmap_pgoff_unlock(file, addr, len, prot, flag, pgoff);
        }
        return ret;
 }
-- 
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to