[RFC][PATCH] mm: ksm: add MAP_MERGEABLE mmap() as a KSM shortcut

2014-02-12 Thread Dave Hansen

We are starting to see substantial amounts (seconds) of latency
being incurred by users of mmap_sem in the worst case.  It is
very common to see them spike into the tens-of-ms range.  Any
acquisition, especially for write, is a potential problem.

The aggravating factor here is that we have been encouraging
folks to be "polite" to the VM and do things like: call
MADV_DONTNEED, unmap when you're done using things, and use KSM.
All of these things take mmap_sem.  JVMs are starting to put
nuggets like this in their generic malloc() functions:

addr = mmap(NULL, foo_bytes, ...);
madvise(addr, foo_bytes, MADV_MERGEABLE);

That means that every single malloc() call does at _least_ two
write acquisitions of mmap_sem.  We can try to batch these things
in userspace more, of course, but this is becoming a very common
pattern.  We should allow a shortcut.
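
With the MAP_MERGEABLE flag proposed below, the same allocation
collapses into a single mmap() call, i.e. one write acquisition of
mmap_sem instead of two.  A minimal userspace sketch (the
alloc_mergeable() wrapper is just an illustrative name, not part of
the patch):

#include <stddef.h>
#include <sys/mman.h>

static void *alloc_mergeable(size_t foo_bytes)
{
	/* One mmap_sem write acquisition instead of mmap()+madvise(): */
	void *addr = mmap(NULL, foo_bytes, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS | MAP_MERGEABLE,
			  -1, 0);

	return (addr == MAP_FAILED) ? NULL : addr;
}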

I'm a little concerned that we might be in the middle of
constructing the VMA when we make the decision to set
VM_MERGEABLE and miss one of the "bad" flags.  I've sprinkled a
few VM_BUG_ON()s to watch out for any cases where we've missed
something.  I turned this on for _every_ VMA to test it, and it
hasn't blown up yet.

There are probably some other ways to do this.  We could have
a prctl, or some kind of boot option, or even something analogous
to the transparent-huge-page 'always' option (as opposed to
madvise()).  We could even extend madvise() for this kind of
thing.  We could allow MADV_MERGEABLE to be specified for
unmapped areas in _advance_ for when brk() or mmap() is called
on them.
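
For that last alternative, the userspace sequence might look something
like this (hypothetical; today madvise() fails with ENOMEM on
unmapped ranges):

madvise(hint_addr, hint_len, MADV_MERGEABLE);	/* range not mapped yet */
addr = mmap(hint_addr, hint_len, ...);		/* picks up the stored advice */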

Applying transactional memory to mmap_sem would probably also
help out here a lot.

Cc: ar...@linux.intel.com
Cc: Andi Kleen <andi.kl...@intel.com>

---

 b/Documentation/vm/ksm.txt        |    5 +++-
 b/include/linux/ksm.h             |   13 +++
 b/include/uapi/asm-generic/mman.h |    1 +
 b/mm/ksm.c                        |   43 +++---
 b/mm/mmap.c                       |   13 +++
 5 files changed, 58 insertions(+), 17 deletions(-)

diff -puN include/uapi/asm-generic/mman.h~mmap-flag-for-ksm include/uapi/asm-generic/mman.h
--- a/include/uapi/asm-generic/mman.h~mmap-flag-for-ksm	2014-02-12 13:13:15.496938731 -0800
+++ b/include/uapi/asm-generic/mman.h   2014-02-12 13:13:15.502939003 -0800
@@ -12,6 +12,7 @@
 #define MAP_NONBLOCK	0x10000		/* do not block on IO */
 #define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB	0x40000		/* create a huge page mapping */
+#define MAP_MERGEABLE	0x80000		/* mark mapping as mergeable by KSM */
 
 /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
 
diff -puN mm/mmap.c~mmap-flag-for-ksm mm/mmap.c
--- a/mm/mmap.c~mmap-flag-for-ksm   2014-02-12 13:13:15.497938776 -0800
+++ b/mm/mmap.c 2014-02-12 13:20:33.725852533 -0800
@@ -36,6 +36,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/notifier.h>
 #include <linux/memory.h>
+#include <linux/ksm.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1361,6 +1362,18 @@ unsigned long do_mmap_pgoff(struct file
vm_flags |= VM_NORESERVE;
}
 
+   /*
+* This *must* happen after all the other vm_flags have
+* been set, but before we make the decision about
+* whether this vma can be merged with another.
+*/
+   if ((flags & MAP_MERGEABLE) && ksm_can_handle_vma(vm_flags)) {
+   int err = ksm_enter_if_new(mm);
+   if (err)
+   return err;
+   vm_flags |= VM_MERGEABLE;
+   }
+
addr = mmap_region(file, addr, len, vm_flags, pgoff);
if (!IS_ERR_VALUE(addr) &&
((vm_flags & VM_LOCKED) ||
diff -puN mm/ksm.c~mmap-flag-for-ksm mm/ksm.c
--- a/mm/ksm.c~mmap-flag-for-ksm2014-02-12 13:13:15.498938822 -0800
+++ b/mm/ksm.c  2014-02-12 13:20:33.726852579 -0800
@@ -419,6 +419,7 @@ static struct vm_area_struct *find_merge
return NULL;
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
return NULL;
+   VM_BUG_ON(!ksm_can_handle_vma(vma->vm_flags));
return vma;
 }
 
@@ -785,6 +786,7 @@ static int unmerge_and_remove_all_rmap_i
break;
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
continue;
+   VM_BUG_ON(!ksm_can_handle_vma(vma->vm_flags));
err = unmerge_ksm_pages(vma,
vma->vm_start, vma->vm_end);
if (err)
@@ -1024,6 +1026,7 @@ static int try_to_merge_one_page(struct
 
if (!(vma->vm_flags & VM_MERGEABLE))
goto out;
+   VM_BUG_ON(!ksm_can_handle_vma(vma->vm_flags));
if (PageTransCompound(page) && page_trans_compound_anon_split(page))
goto out;
BUG_ON(PageTransCompound(page));
@@ -1607,6 +1610,7 @@ next_mm:
for (; vma; vma = vma->vm_next) {
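
The rest of the mm/ksm.c hunk, along with the include/linux/ksm.h and
Documentation/vm/ksm.txt hunks listed in the diffstat, is cut off in
this archived posting.  For reference, here is a plausible sketch of
the two new helpers called from do_mmap_pgoff() above, modeled on the
flag checks and __ksm_enter() registration in the existing
ksm_madvise(); the names come from the diff, but the bodies are a
reconstruction, not the actual patch:

/* vm_flags that KSM cannot cope with, as checked in ksm_madvise(): */
static inline bool ksm_can_handle_vma(unsigned long vm_flags)
{
	return !(vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP |
			     VM_IO | VM_DONTEXPAND | VM_HUGETLB |
			     VM_MIXEDMAP));
}

/* Register the mm with ksmd the first time it gains a mergeable VMA. */
static inline int ksm_enter_if_new(struct mm_struct *mm)
{
	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
		return 0;
	return __ksm_enter(mm);
}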