From: Mel Gorman <[EMAIL PROTECTED]>

A region created for morecore is prefaulted to ensure sufficient hugepages
exist in the pool.  This was necessary on kernels older than 2.6.27-rc1 as
MAP_PRIVATE mappings do not reserve pages in advance. Prefaulting is the
only means of ensuring an application does not receive a SIGBUS due to a
failed fault.

The get_huge_pages() API has similar requirements in that it needs to
prefault a region to ensure future faults succeed. This patch splits out
prefaulting into a separate utility function so it can be used by
get_huge_pages().

Signed-off-by: Mel Gorman <[EMAIL PROTECTED]>
---
 hugeutils.c             |   47 ++++++++++++++++++++++++++++++++++++++++++
 libhugetlbfs_internal.h |    1 +
 morecore.c              |   52 +++++++---------------------------------------
 3 files changed, 56 insertions(+), 44 deletions(-)

diff --git a/hugeutils.c b/hugeutils.c
index cc5113f..995ccc6 100644
--- a/hugeutils.c
+++ b/hugeutils.c
@@ -38,6 +38,7 @@
 #include <sys/types.h>
 #include <sys/mman.h>
 #include <sys/file.h>
+#include <sys/uio.h>
 
 #include "libhugetlbfs_internal.h"
 #include "hugetlbfs.h"
@@ -267,6 +268,52 @@ int hugetlbfs_unlinked_fd(void)
        return fd;
 }
 
+#define IOV_LEN 64
+int __lh_hugetlbfs_prefault(int fd, void *addr, size_t length)
+{
+       /*
+        * The NUMA users of libhugetlbfs' malloc feature are
+        * expected to use the numactl program to specify an
+        * appropriate policy for hugepage allocation
+        *
+        * Use readv(2) to instantiate the hugepages unless HUGETLB_NO_PREFAULT
+        * is set. If we instead returned a hugepage mapping with insufficient
+        * hugepages, the VM system would kill the process when the
+        * process tried to access the missing memory.
+        *
+        * The value of this environment variable is read during library
+        * initialisation and sets __hugetlbfs_prefault accordingly. If
+        * prefaulting is enabled and we can't get all that were requested,
+        * -ENOMEM is returned. The caller is expected to release the entire
+        * mapping and optionally it may recover by mapping base pages instead.
+        */
+       if (__hugetlbfs_prefault) {
+               int i;
+               size_t offset;
+               struct iovec iov[IOV_LEN];
+               int ret;
+
+               for (offset = 0; offset < length; ) {
+                       for (i = 0; i < IOV_LEN && offset < length; i++) {
+                               iov[i].iov_base = addr + offset;
+                               iov[i].iov_len = 1;
+                               offset += gethugepagesize();
+                       }
+                       ret = readv(fd, iov, i);
+                       if (ret != i) {
+                               DEBUG("Got %d of %d requested; err=%d\n", ret,
+                                               i, ret < 0 ? errno : 0);
+                               WARNING("Failed to reserve %ld huge pages "
+                                               "for new region\n",
+                                               length / gethugepagesize());
+                               return -ENOMEM;
+                       }
+               }
+       }
+
+       return 0;
+}
+
 /********************************************************************/
 /* Library user visible DIAGNOSES/DEBUGGING ONLY functions          */
 /********************************************************************/
diff --git a/libhugetlbfs_internal.h b/libhugetlbfs_internal.h
index 595cc6e..ce4c23a 100644
--- a/libhugetlbfs_internal.h
+++ b/libhugetlbfs_internal.h
@@ -47,6 +47,7 @@ extern void __hugetlbfs_setup_elflink();
 extern void __hugetlbfs_setup_morecore();
 extern void __hugetlbfs_setup_debug();
 extern char __hugetlbfs_hostname[];
+extern int __lh_hugetlbfs_prefault(int fd, void *addr, size_t length);
 
 #ifndef REPORT
 #define REPORT(level, prefix, format, ...) \
diff --git a/morecore.c b/morecore.c
index 46897aa..6712207 100644
--- a/morecore.c
+++ b/morecore.c
@@ -28,7 +28,6 @@
 #include <dlfcn.h>
 #include <string.h>
 #include <fcntl.h>
-#include <sys/uio.h>
 
 #include "hugetlbfs.h"
 
@@ -37,7 +36,6 @@
 static int heap_fd;
 static int shrink_ok;          /* default = 0; no shrink */
 static int zero_fd;
-static long blocksize;
 
 static void *heapbase;
 static void *heaptop;
@@ -69,13 +67,8 @@ static long hugetlbfs_next_addr(long addr)
  * Luckily, if it does not do so and we error out malloc will happily
  * go back to small pages and use mmap to get them.  Hurrah.
  */
-#define IOV_LEN        64
-
 static void *hugetlbfs_morecore(ptrdiff_t increment)
 {
-       unsigned long offset;
-       int i;
-       struct iovec iov[IOV_LEN];
        int ret;
        void *p;
        long delta;
@@ -92,7 +85,7 @@ static void *hugetlbfs_morecore(ptrdiff_t increment)
              heapbase, heaptop, mapsize, delta);
 
        /* align to multiple of hugepagesize. */
-       delta = ALIGN(delta, blocksize);
+       delta = ALIGN(delta, gethugepagesize());
 
        if (delta > 0) {
                /* growing the heap */
@@ -128,38 +121,10 @@ static void *hugetlbfs_morecore(ptrdiff_t increment)
                        return NULL;
                }
 
-               /* The NUMA users of libhugetlbfs' malloc feature are
-                * expected to use the numactl program to specify an
-                * appropriate policy for hugepage allocation */
-
-               /*
-                * Use readv(2) to instantiate the hugepages.  If we
-                * can't get all that were requested, release the entire
-                * mapping and return NULL.  Glibc malloc will then fall back
-                * to using mmap of base pages.
-                *
-                * If we instead returned a hugepage mapping with insufficient
-                * hugepages, the VM system would kill the process when the
-                * process tried to access the missing memory.
-                */
-
-               if (__hugetlbfs_prefault) {
-                       for (offset = 0; offset < delta; ) {
-                               for (i = 0; i < IOV_LEN && offset < delta; i++) {
-                                       iov[i].iov_base = p + offset;
-                                       iov[i].iov_len = 1;
-                                       offset += blocksize;
-                               }
-                               ret = readv(zero_fd, iov, i);
-                               if (ret != i) {
-                                       DEBUG("Got %d of %d requested; err=%d\n", ret,
-                                                       i, ret < 0 ? errno : 0);
-                                       WARNING("Failed to reserve %ld huge pages "
-                                                       "for heap\n", delta/blocksize);
-                                       munmap(p, delta);
-                                       return NULL;
-                               }
-                       }
+               /* Fault the region to ensure accesses succeed */
+               if (__lh_hugetlbfs_prefault(zero_fd, p, delta) != 0) {
+                       munmap(p, delta);
+                       return NULL;
                }
 
                /* we now have mmap'd further */
@@ -257,8 +222,7 @@ void __hugetlbfs_setup_morecore(void)
        if (env && strcasecmp(env, "yes") == 0)
                shrink_ok = 1;
 
-       blocksize = gethugepagesize();
-       if (blocksize <= 0) {
+       if (gethugepagesize() <= 0) {
                if (errno == ENOSYS)
                        ERROR("Hugepages unavailable\n");
                else if (errno == EOVERFLOW)
@@ -296,10 +260,10 @@ void __hugetlbfs_setup_morecore(void)
        /* Set some allocator options more appropriate for hugepages */
        
        if (shrink_ok)
-               mallopt(M_TRIM_THRESHOLD, blocksize / 2);
+               mallopt(M_TRIM_THRESHOLD, gethugepagesize() / 2);
        else
                mallopt(M_TRIM_THRESHOLD, -1);
-       mallopt(M_TOP_PAD, blocksize / 2);
+       mallopt(M_TOP_PAD, gethugepagesize() / 2);
        /* we always want to use our morecore, not ordinary mmap().
         * This doesn't appear to prohibit malloc() from falling back
         * to mmap() if we run out of hugepages. */
-- 
1.5.6.3


-------------------------------------------------------------------------
This SF.Net email is sponsored by the Moblin Your Move Developer's challenge
Build the coolest Linux based applications with Moblin SDK & win great prizes
Grand prize is a trip for two to an Open Source event anywhere in the world
http://moblin-contest.org/redirect.php?banner_id=100&url=/
_______________________________________________
Libhugetlbfs-devel mailing list
Libhugetlbfs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel

Reply via email to