On Mon, 2008-08-18 at 19:02 +0100, [EMAIL PROTECTED] wrote:
> From: Mel Gorman <[EMAIL PROTECTED]>
> 
> A region created for morecore is prefaulted to ensure sufficient hugepages
> exist in the pool.  This was necessary on kernels older than 2.6.27-rc1 as
> MAP_PRIVATE mappings do not reserve pages in advance. Prefaulting is the
> only means of ensuring an application does not receive a SIGBUS due to a
> failed fault.
> 
> The get_huge_pages() API has similar requirements in that it needs to
> prefault a region to ensure future faults succeed. This patch splits out
> prefaulting into a separate utility function so it can be used by
> get_huge_pages().
> 
> Signed-off-by: Mel Gorman <[EMAIL PROTECTED]>

Acked-by: Adam Litke <[EMAIL PROTECTED]>

> ---
>  hugeutils.c             |   47 ++++++++++++++++++++++++++++++++++++++++++
>  libhugetlbfs_internal.h |    1 +
>  morecore.c              |   52 +++++++---------------------------------------
>  3 files changed, 56 insertions(+), 44 deletions(-)
> 
> diff --git a/hugeutils.c b/hugeutils.c
> index cc5113f..995ccc6 100644
> --- a/hugeutils.c
> +++ b/hugeutils.c
> @@ -38,6 +38,7 @@
>  #include <sys/types.h>
>  #include <sys/mman.h>
>  #include <sys/file.h>
> +#include <sys/uio.h>
> 
>  #include "libhugetlbfs_internal.h"
>  #include "hugetlbfs.h"
> @@ -267,6 +268,52 @@ int hugetlbfs_unlinked_fd(void)
>       return fd;
>  }
> 
> +#define IOV_LEN 64
> +int __lh_hugetlbfs_prefault(int fd, void *addr, size_t length)
> +{
> +     /*
> +      * The NUMA users of libhugetlbfs' malloc feature are
> +      * expected to use the numactl program to specify an
> +      * appropriate policy for hugepage allocation
> +      *
> +      * Use readv(2) to instantiate the hugepages unless HUGETLB_NO_PREFAULT
> +      * is set. If we instead returned a hugepage mapping with insufficient
> +      * hugepages, the VM system would kill the process when the
> +      * process tried to access the missing memory.
> +      *
> +      * The value of this environment variable is read during library
> +      * initialisation and sets __hugetlbfs_prefault accordingly. If 
> +      * prefaulting is enabled and we can't get all that were requested,
> +      * -ENOMEM is returned. The caller is expected to release the entire
> +      * mapping and optionally it may recover by mapping base pages instead.
> +      */
> +     if (__hugetlbfs_prefault) {
> +             int i;
> +             size_t offset;
> +             struct iovec iov[IOV_LEN];
> +             int ret;
> +
> +             for (offset = 0; offset < length; ) {
> +                     for (i = 0; i < IOV_LEN && offset < length; i++) {
> +                             iov[i].iov_base = addr + offset;
> +                             iov[i].iov_len = 1;
> +                             offset += gethugepagesize();
> +                     }
> +                     ret = readv(fd, iov, i);
> +                     if (ret != i) {
> +                             DEBUG("Got %d of %d requested; err=%d\n", ret,
> +                                             i, ret < 0 ? errno : 0);
> +                             WARNING("Failed to reserve %ld huge pages "
> +                                             "for new region\n",
> +                                             length / gethugepagesize());
> +                             return -ENOMEM;
> +                     }
> +             }
> +     }
> +
> +     return 0;
> +}
> +
>  /********************************************************************/
>  /* Library user visible DIAGNOSES/DEBUGGING ONLY functions          */
>  /********************************************************************/
> diff --git a/libhugetlbfs_internal.h b/libhugetlbfs_internal.h
> index 595cc6e..ce4c23a 100644
> --- a/libhugetlbfs_internal.h
> +++ b/libhugetlbfs_internal.h
> @@ -47,6 +47,7 @@ extern void __hugetlbfs_setup_elflink();
>  extern void __hugetlbfs_setup_morecore();
>  extern void __hugetlbfs_setup_debug();
>  extern char __hugetlbfs_hostname[];
> +extern int __lh_hugetlbfs_prefault(int fd, void *addr, size_t length);
> 
>  #ifndef REPORT
>  #define REPORT(level, prefix, format, ...) \
> diff --git a/morecore.c b/morecore.c
> index 46897aa..6712207 100644
> --- a/morecore.c
> +++ b/morecore.c
> @@ -28,7 +28,6 @@
>  #include <dlfcn.h>
>  #include <string.h>
>  #include <fcntl.h>
> -#include <sys/uio.h>
> 
>  #include "hugetlbfs.h"
> 
> @@ -37,7 +36,6 @@
>  static int heap_fd;
>  static int shrink_ok;                /* default = 0; no shrink */
>  static int zero_fd;
> -static long blocksize;
> 
>  static void *heapbase;
>  static void *heaptop;
> @@ -69,13 +67,8 @@ static long hugetlbfs_next_addr(long addr)
>   * Luckily, if it does not do so and we error out malloc will happily
>   * go back to small pages and use mmap to get them.  Hurrah.
>   */
> -#define IOV_LEN      64
> -
>  static void *hugetlbfs_morecore(ptrdiff_t increment)
>  {
> -     unsigned long offset;
> -     int i;
> -     struct iovec iov[IOV_LEN];
>       int ret;
>       void *p;
>       long delta;
> @@ -92,7 +85,7 @@ static void *hugetlbfs_morecore(ptrdiff_t increment)
>             heapbase, heaptop, mapsize, delta);
> 
>       /* align to multiple of hugepagesize. */
> -     delta = ALIGN(delta, blocksize);
> +     delta = ALIGN(delta, gethugepagesize());
> 
>       if (delta > 0) {
>               /* growing the heap */
> @@ -128,38 +121,10 @@ static void *hugetlbfs_morecore(ptrdiff_t increment)
>                       return NULL;
>               }
> 
> -             /* The NUMA users of libhugetlbfs' malloc feature are
> -              * expected to use the numactl program to specify an
> -              * appropriate policy for hugepage allocation */
> -
> -             /*
> -              * Use readv(2) to instantiate the hugepages.  If we
> -              * can't get all that were requested, release the entire
> -              * mapping and return NULL.  Glibc malloc will then fall back
> -              * to using mmap of base pages.
> -              *
> -              * If we instead returned a hugepage mapping with insufficient
> -              * hugepages, the VM system would kill the process when the
> -              * process tried to access the missing memory.
> -              */
> -
> -             if (__hugetlbfs_prefault) {
> -                     for (offset = 0; offset < delta; ) {
> -                     for (i = 0; i < IOV_LEN && offset < delta; i++) {
> -                                     iov[i].iov_base = p + offset;
> -                                     iov[i].iov_len = 1;
> -                                     offset += blocksize;
> -                             }
> -                             ret = readv(zero_fd, iov, i);
> -                             if (ret != i) {
> -                                     DEBUG("Got %d of %d requested; err=%d\n", ret,
> -                                                     i, ret < 0 ? errno : 0);
> -                                     WARNING("Failed to reserve %ld huge pages "
> -                                                     "for heap\n", delta/blocksize);
> -                                     munmap(p, delta);
> -                                     return NULL;
> -                             }
> -                     }
> +             /* Fault the region to ensure accesses succeed */
> +             if (__lh_hugetlbfs_prefault(zero_fd, p, delta) != 0) {
> +                     munmap(p, delta);
> +                     return NULL;
>               }
> 
>               /* we now have mmap'd further */
> @@ -257,8 +222,7 @@ void __hugetlbfs_setup_morecore(void)
>       if (env && strcasecmp(env, "yes") == 0)
>               shrink_ok = 1;
> 
> -     blocksize = gethugepagesize();
> -     if (blocksize <= 0) {
> +     if (gethugepagesize() <= 0) {
>               if (errno == ENOSYS)
>                       ERROR("Hugepages unavailable\n");
>               else if (errno == EOVERFLOW)
> @@ -296,10 +260,10 @@ void __hugetlbfs_setup_morecore(void)
>       /* Set some allocator options more appropriate for hugepages */
>       
>       if (shrink_ok)
> -             mallopt(M_TRIM_THRESHOLD, blocksize / 2);
> +             mallopt(M_TRIM_THRESHOLD, gethugepagesize() / 2);
>       else
>               mallopt(M_TRIM_THRESHOLD, -1);
> -     mallopt(M_TOP_PAD, blocksize / 2);
> +     mallopt(M_TOP_PAD, gethugepagesize() / 2);
>       /* we always want to use our morecore, not ordinary mmap().
>        * This doesn't appear to prohibit malloc() from falling back
>        * to mmap() if we run out of hugepages. */
-- 
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center


-------------------------------------------------------------------------
This SF.Net email is sponsored by the Moblin Your Move Developer's challenge
Build the coolest Linux based applications with Moblin SDK & win great prizes
Grand prize is a trip for two to an Open Source event anywhere in the world
http://moblin-contest.org/redirect.php?banner_id=100&url=/
_______________________________________________
Libhugetlbfs-devel mailing list
Libhugetlbfs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel

Reply via email to