From: Mel Gorman <[EMAIL PROTECTED]> A region created for morecore is prefaulted to ensure that sufficient hugepages exist in the pool. This is necessary on kernels older than 2.6.27-rc1, where MAP_PRIVATE mappings do not reserve pages in advance. Prefaulting is the only means of ensuring that an application does not receive a SIGBUS due to a failed fault.
The get_huge_pages() API has similar requirements in that it needs to prefault a region to ensure future faults succeed. This patch splits out prefaulting into a separate utility function so it can be used by get_huge_pages(). Signed-off-by: Mel Gorman <[EMAIL PROTECTED]> --- hugeutils.c | 47 ++++++++++++++++++++++++++++++++++++++++++ libhugetlbfs_internal.h | 1 + morecore.c | 52 +++++++--------------------------------------- 3 files changed, 56 insertions(+), 44 deletions(-) diff --git a/hugeutils.c b/hugeutils.c index cc5113f..995ccc6 100644 --- a/hugeutils.c +++ b/hugeutils.c @@ -38,6 +38,7 @@ #include <sys/types.h> #include <sys/mman.h> #include <sys/file.h> +#include <sys/uio.h> #include "libhugetlbfs_internal.h" #include "hugetlbfs.h" @@ -267,6 +268,52 @@ int hugetlbfs_unlinked_fd(void) return fd; } +#define IOV_LEN 64 +int __lh_hugetlbfs_prefault(int fd, void *addr, size_t length) +{ + /* + * The NUMA users of libhugetlbfs' malloc feature are + * expected to use the numactl program to specify an + * appropriate policy for hugepage allocation + * + * Use readv(2) to instantiate the hugepages unless HUGETLB_NO_PREFAULT + * is set. If we instead returned a hugepage mapping with insufficient + * hugepages, the VM system would kill the process when the + * process tried to access the missing memory. + * + * The value of this environment variable is read during library + * initialisation and sets __hugetlbfs_prefault accordingly. If + * prefaulting is enabled and we can't get all that were requested, + * -ENOMEM is returned. The caller is expected to release the entire + * mapping and optionally it may recover by mapping base pages instead. 
+ */ + if (__hugetlbfs_prefault) { + int i; + size_t offset; + struct iovec iov[IOV_LEN]; + int ret; + + for (offset = 0; offset < length; ) { + for (i = 0; i < IOV_LEN && offset < length; i++) { + iov[i].iov_base = addr + offset; + iov[i].iov_len = 1; + offset += gethugepagesize(); + } + ret = readv(fd, iov, i); + if (ret != i) { + DEBUG("Got %d of %d requested; err=%d\n", ret, + i, ret < 0 ? errno : 0); + WARNING("Failed to reserve %ld huge pages " + "for new region\n", + length / gethugepagesize()); + return -ENOMEM; + } + } + } + + return 0; +} + /********************************************************************/ /* Library user visible DIAGNOSES/DEBUGGING ONLY functions */ /********************************************************************/ diff --git a/libhugetlbfs_internal.h b/libhugetlbfs_internal.h index 595cc6e..ce4c23a 100644 --- a/libhugetlbfs_internal.h +++ b/libhugetlbfs_internal.h @@ -47,6 +47,7 @@ extern void __hugetlbfs_setup_elflink(); extern void __hugetlbfs_setup_morecore(); extern void __hugetlbfs_setup_debug(); extern char __hugetlbfs_hostname[]; +extern int __lh_hugetlbfs_prefault(int fd, void *addr, size_t length); #ifndef REPORT #define REPORT(level, prefix, format, ...) \ diff --git a/morecore.c b/morecore.c index 46897aa..6712207 100644 --- a/morecore.c +++ b/morecore.c @@ -28,7 +28,6 @@ #include <dlfcn.h> #include <string.h> #include <fcntl.h> -#include <sys/uio.h> #include "hugetlbfs.h" @@ -37,7 +36,6 @@ static int heap_fd; static int shrink_ok; /* default = 0; no shrink */ static int zero_fd; -static long blocksize; static void *heapbase; static void *heaptop; @@ -69,13 +67,8 @@ static long hugetlbfs_next_addr(long addr) * Luckily, if it does not do so and we error out malloc will happily * go back to small pages and use mmap to get them. Hurrah. 
*/ -#define IOV_LEN 64 - static void *hugetlbfs_morecore(ptrdiff_t increment) { - unsigned long offset; - int i; - struct iovec iov[IOV_LEN]; int ret; void *p; long delta; @@ -92,7 +85,7 @@ static void *hugetlbfs_morecore(ptrdiff_t increment) heapbase, heaptop, mapsize, delta); /* align to multiple of hugepagesize. */ - delta = ALIGN(delta, blocksize); + delta = ALIGN(delta, gethugepagesize()); if (delta > 0) { /* growing the heap */ @@ -128,38 +121,10 @@ static void *hugetlbfs_morecore(ptrdiff_t increment) return NULL; } - /* The NUMA users of libhugetlbfs' malloc feature are - * expected to use the numactl program to specify an - * appropriate policy for hugepage allocation */ - - /* - * Use readv(2) to instantiate the hugepages. If we - * can't get all that were requested, release the entire - * mapping and return NULL. Glibc malloc will then fall back - * to using mmap of base pages. - * - * If we instead returned a hugepage mapping with insufficient - * hugepages, the VM system would kill the process when the - * process tried to access the missing memory. - */ - - if (__hugetlbfs_prefault) { - for (offset = 0; offset < delta; ) { - for (i = 0; i < IOV_LEN && offset < delta; i++) { - iov[i].iov_base = p + offset; - iov[i].iov_len = 1; - offset += blocksize; - } - ret = readv(zero_fd, iov, i); - if (ret != i) { - DEBUG("Got %d of %d requested; err=%d\n", ret, - i, ret < 0 ? 
errno : 0); - WARNING("Failed to reserve %ld huge pages " - "for heap\n", delta/blocksize); - munmap(p, delta); - return NULL; - } - } + /* Fault the region to ensure accesses succeed */ + if (__lh_hugetlbfs_prefault(zero_fd, p, delta) != 0) { + munmap(p, delta); + return NULL; } /* we now have mmap'd further */ @@ -257,8 +222,7 @@ void __hugetlbfs_setup_morecore(void) if (env && strcasecmp(env, "yes") == 0) shrink_ok = 1; - blocksize = gethugepagesize(); - if (blocksize <= 0) { + if (gethugepagesize() <= 0) { if (errno == ENOSYS) ERROR("Hugepages unavailable\n"); else if (errno == EOVERFLOW) @@ -296,10 +260,10 @@ void __hugetlbfs_setup_morecore(void) /* Set some allocator options more appropriate for hugepages */ if (shrink_ok) - mallopt(M_TRIM_THRESHOLD, blocksize / 2); + mallopt(M_TRIM_THRESHOLD, gethugepagesize() / 2); else mallopt(M_TRIM_THRESHOLD, -1); - mallopt(M_TOP_PAD, blocksize / 2); + mallopt(M_TOP_PAD, gethugepagesize() / 2); /* we always want to use our morecore, not ordinary mmap(). * This doesn't appear to prohibit malloc() from falling back * to mmap() if we run out of hugepages. */ -- 1.5.6.3 ------------------------------------------------------------------------- This SF.Net email is sponsored by the Moblin Your Move Developer's challenge Build the coolest Linux based applications with Moblin SDK & win great prizes Grand prize is a trip for two to an Open Source event anywhere in the world http://moblin-contest.org/redirect.php?banner_id=100&url=/ _______________________________________________ Libhugetlbfs-devel mailing list Libhugetlbfs-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel