On Mon, 2008-08-18 at 19:02 +0100, [EMAIL PROTECTED] wrote: > From: Mel Gorman <[EMAIL PROTECTED]> > > A region created for morecore is prefaulted to ensure sufficient hugepages > exist in the pool. This was necessary on kernels older than 2.6.27-rc1 as > MAP_PRIVATE mappings do not reserve pages in advance. Prefaulting is the > only means of ensuring an application does not receive a SIGBUS due to a > failed fault. > > The get_huge_pages() API has similar requirements in that it needs to > prefault a region to ensure future faults succeed. This patch splits out > prefaulting into a separate utility function so it can be used by > get_huge_pages(). > > Signed-off-by: Mel Gorman <[EMAIL PROTECTED]>
Acked-by: Adam Litke <[EMAIL PROTECTED]> > --- > hugeutils.c | 47 ++++++++++++++++++++++++++++++++++++++++++ > libhugetlbfs_internal.h | 1 + > morecore.c | 52 +++++++--------------------------------------- > 3 files changed, 56 insertions(+), 44 deletions(-) > > diff --git a/hugeutils.c b/hugeutils.c > index cc5113f..995ccc6 100644 > --- a/hugeutils.c > +++ b/hugeutils.c > @@ -38,6 +38,7 @@ > #include <sys/types.h> > #include <sys/mman.h> > #include <sys/file.h> > +#include <sys/uio.h> > > #include "libhugetlbfs_internal.h" > #include "hugetlbfs.h" > @@ -267,6 +268,52 @@ int hugetlbfs_unlinked_fd(void) > return fd; > } > > +#define IOV_LEN 64 > +int __lh_hugetlbfs_prefault(int fd, void *addr, size_t length) > +{ > + /* > + * The NUMA users of libhugetlbfs' malloc feature are > + * expected to use the numactl program to specify an > + * appropriate policy for hugepage allocation > + * > + * Use readv(2) to instantiate the hugepages unless HUGETLB_NO_PREFAULT > + * is set. If we instead returned a hugepage mapping with insufficient > + * hugepages, the VM system would kill the process when the > + * process tried to access the missing memory. > + * > + * The value of this environment variable is read during library > + * initialisation and sets __hugetlbfs_prefault accordingly. If > + * prefaulting is enabled and we can't get all that were requested, > + * -ENOMEM is returned. The caller is expected to release the entire > + * mapping and optionally it may recover by mapping base pages instead. > + */ > + if (__hugetlbfs_prefault) { > + int i; > + size_t offset; > + struct iovec iov[IOV_LEN]; > + int ret; > + > + for (offset = 0; offset < length; ) { > + for (i = 0; i < IOV_LEN && offset < length; i++) { > + iov[i].iov_base = addr + offset; > + iov[i].iov_len = 1; > + offset += gethugepagesize(); > + } > + ret = readv(fd, iov, i); > + if (ret != i) { > + DEBUG("Got %d of %d requested; err=%d\n", ret, > + i, ret < 0 ? errno : 0); > + WARNING("Failed to reserve %ld huge pages " > + "for new region\n", > + length / gethugepagesize()); > + return -ENOMEM; > + } > + } > + } > + > + return 0; > +} > + > /********************************************************************/ > /* Library user visible DIAGNOSES/DEBUGGING ONLY functions */ > /********************************************************************/ > diff --git a/libhugetlbfs_internal.h b/libhugetlbfs_internal.h > index 595cc6e..ce4c23a 100644 > --- a/libhugetlbfs_internal.h > +++ b/libhugetlbfs_internal.h > @@ -47,6 +47,7 @@ extern void __hugetlbfs_setup_elflink(); > extern void __hugetlbfs_setup_morecore(); > extern void __hugetlbfs_setup_debug(); > extern char __hugetlbfs_hostname[]; > +extern int __lh_hugetlbfs_prefault(int fd, void *addr, size_t length); > > #ifndef REPORT > #define REPORT(level, prefix, format, ...) \ > diff --git a/morecore.c b/morecore.c > index 46897aa..6712207 100644 > --- a/morecore.c > +++ b/morecore.c > @@ -28,7 +28,6 @@ > #include <dlfcn.h> > #include <string.h> > #include <fcntl.h> > -#include <sys/uio.h> > > #include "hugetlbfs.h" > > @@ -37,7 +36,6 @@ > static int heap_fd; > static int shrink_ok; /* default = 0; no shrink */ > static int zero_fd; > -static long blocksize; > > static void *heapbase; > static void *heaptop; > @@ -69,13 +67,8 @@ static long hugetlbfs_next_addr(long addr) > * Luckily, if it does not do so and we error out malloc will happily > * go back to small pages and use mmap to get them. Hurrah. > */ > -#define IOV_LEN 64 > - > static void *hugetlbfs_morecore(ptrdiff_t increment) > { > - unsigned long offset; > - int i; > - struct iovec iov[IOV_LEN]; > int ret; > void *p; > long delta; > @@ -92,7 +85,7 @@ static void *hugetlbfs_morecore(ptrdiff_t increment) > heapbase, heaptop, mapsize, delta); > > /* align to multiple of hugepagesize. */ > - delta = ALIGN(delta, blocksize); > + delta = ALIGN(delta, gethugepagesize()); > > if (delta > 0) { > /* growing the heap */ > @@ -128,38 +121,10 @@ static void *hugetlbfs_morecore(ptrdiff_t increment) > return NULL; > } > > - /* The NUMA users of libhugetlbfs' malloc feature are > - * expected to use the numactl program to specify an > - * appropriate policy for hugepage allocation */ > - > - /* > - * Use readv(2) to instantiate the hugepages. If we > - * can't get all that were requested, release the entire > - * mapping and return NULL. Glibc malloc will then fall back > - * to using mmap of base pages. > - * > - * If we instead returned a hugepage mapping with insufficient > - * hugepages, the VM system would kill the process when the > - * process tried to access the missing memory. > - */ > - > - if (__hugetlbfs_prefault) { > - for (offset = 0; offset < delta; ) { > - for (i = 0; i < IOV_LEN && offset < delta; i++) > { > - iov[i].iov_base = p + offset; > - iov[i].iov_len = 1; > - offset += blocksize; > - } > - ret = readv(zero_fd, iov, i); > - if (ret != i) { > - DEBUG("Got %d of %d requested; > err=%d\n", ret, > - i, ret < 0 ? errno : 0); > - WARNING("Failed to reserve %ld huge > pages " > - "for heap\n", > delta/blocksize); > - munmap(p, delta); > - return NULL; > - } > - } > + /* Fault the region to ensure accesses succeed */ > + if (__lh_hugetlbfs_prefault(zero_fd, p, delta) != 0) { > + munmap(p, delta); > + return NULL; > } > > /* we now have mmap'd further */ > @@ -257,8 +222,7 @@ void __hugetlbfs_setup_morecore(void) > if (env && strcasecmp(env, "yes") == 0) > shrink_ok = 1; > > - blocksize = gethugepagesize(); > - if (blocksize <= 0) { > + if (gethugepagesize() <= 0) { > if (errno == ENOSYS) > ERROR("Hugepages unavailable\n"); > else if (errno == EOVERFLOW) > @@ -296,10 +260,10 @@ void __hugetlbfs_setup_morecore(void) > /* Set some allocator options more appropriate for hugepages */ > > if (shrink_ok) > - mallopt(M_TRIM_THRESHOLD, blocksize / 2); > + mallopt(M_TRIM_THRESHOLD, gethugepagesize() / 2); > else > mallopt(M_TRIM_THRESHOLD, -1); > - mallopt(M_TOP_PAD, blocksize / 2); > + mallopt(M_TOP_PAD, gethugepagesize() / 2); > /* we always want to use our morecore, not ordinary mmap(). > * This doesn't appear to prohibit malloc() from falling back > * to mmap() if we run out of hugepages. */ -- Adam Litke - (agl at us.ibm.com) IBM Linux Technology Center ------------------------------------------------------------------------- This SF.Net email is sponsored by the Moblin Your Move Developer's challenge Build the coolest Linux based applications with Moblin SDK & win great prizes Grand prize is a trip for two to an Open Source event anywhere in the world http://moblin-contest.org/redirect.php?banner_id=100&url=/ _______________________________________________ Libhugetlbfs-devel mailing list Libhugetlbfs-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel