On Mon, Aug 04, 2008 at 08:01:04PM +0000, Adam Litke wrote:
> Historically, libhugetlbs has relied on kernel features that either: have been
> known to exist in all supported kernel versions, or are easily detected.  As 
> of
> kernel version 2.6.27-rc1, a new crucial feature has been added that is not
> possible to reliably detect.  Huge page mappings created with the MAP_PRIVATE
> flag will have huge pages reserved up-front.  With private reservations in
> effect, it is safe to allow demand-faulting of the HUGETLB_MORECORE heap which
> can lead to dramatic performance improvements on NUMA systems.  This is only
> safe behavior in the presence of private reservations.
> 
> The only way to identify that a kernel has private reservations support is to
> examine the kernel version to see if it is more recent than when the feature
> appeared.  I am well aware of the drawbacks of using the kernel version to
> affect library behavior but I don't see any alternative.  I would suggest that
> the kernel version should be used only in cases when there is no alternative.
> 
> How it works
> ============
> 
> Kernels are assumed to have a mandatory base version x.y.z (eg. 2.6.17) and 
> one
> optional modifier: a post version (stable tree x.y.z.q) or a pre version
> (x.y.z-{preN|rcN}).  All other version appendices (such as -mmN) are ignored.
> 
> The following ordering rules apply:
>       x.y.z-rc(N) < x.y.z-rc(N+1) < x.y.z < x.y.z.(N) < x.y.z.(N+1)
> 
> When libhugetlbfs initializes, the running kernel version is probed using
> uname.  A list of feature definitions is scanned and those with a minimum
> kernel version have that version compared to the runninng kernel.  If the
> running kernel is found to be equal to or greater than the minimum required
> kernel version, a bit in a feature mask is set to indicate the presence of the
> feature.  A feature can be later checked for by using a simple function that
> checks the bitmask.
> 
> Comments?
> ---
> 
>  Makefile                |    2 -
>  hugetlbfs.h             |    8 ++
>  init.c                  |    1 
>  kernel-features.c       |  179 
> +++++++++++++++++++++++++++++++++++++++++++++++
>  kernel-features.h       |   30 ++++++++
>  libhugetlbfs_internal.h |    1 
>  morecore.c              |   11 +++
>  7 files changed, 231 insertions(+), 1 deletions(-)
>  create mode 100644 kernel-features.c
>  create mode 100644 kernel-features.h
> 
> 
> diff --git a/Makefile b/Makefile
> index 763b28d..8953b5e 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -1,7 +1,7 @@
>  PREFIX = /usr/local
>  EXEDIR = /bin
>  
> -LIBOBJS = hugeutils.o version.o init.o morecore.o debug.o alloc.o
> +LIBOBJS = hugeutils.o version.o init.o morecore.o debug.o alloc.o 
> kernel-features.o
>  INSTALL_OBJ_LIBS = libhugetlbfs.so libhugetlbfs.a
>  BIN_OBJ_DIR=obj
>  INSTALL_BIN = hugectl hugeedit
> diff --git a/hugetlbfs.h b/hugetlbfs.h
> index 91d021f..5d0d2bf 100644
> --- a/hugetlbfs.h
> +++ b/hugetlbfs.h
> @@ -49,4 +49,12 @@ typedef unsigned long ghp_t;
>  void *get_huge_pages(size_t len, ghp_t flags);
>  void free_huge_pages(void *ptr);
>  
> +/* Kernel feature testing */
> +enum {
> +     /* Reservations are created for private mappings */
> +     HUGETLB_FEAT_PRIVATE_RESV,

Is this meant to match ...

> +     HUGETLB_FEAT_NR,
> +};
> +int hugetlbfs_test_feature(int feat_code);
> +
>  #endif /* _HUGETLBFS_H */
> diff --git a/init.c b/init.c
> index e1415f5..fcd41cc 100644
> --- a/init.c
> +++ b/init.c
> @@ -22,6 +22,7 @@
>  static void __attribute__ ((constructor)) setup_libhugetlbfs(void)
>  {
>       __hugetlbfs_setup_debug();
> +     __lh_hugetlbfs_setup_features();
>  #ifndef NO_ELFLINK
>       __hugetlbfs_setup_elflink();
>  #endif
> diff --git a/kernel-features.c b/kernel-features.c
> new file mode 100644
> index 0000000..cb79693
> --- /dev/null
> +++ b/kernel-features.c
> @@ -0,0 +1,179 @@
> +/*
> + * libhugetlbfs - Easy use of Linux hugepages
> + * Copyright (C) 2008 Adam Litke, IBM Corporation.
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public License
> + * as published by the Free Software Foundation; either version 2.1 of
> + * the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <errno.h>
> +#include <sys/utsname.h>
> +#include "kernel-features.h"
> +#include "hugetlbfs.h"
> +#include "libhugetlbfs_internal.h"
> +#include "libhugetlbfs_debug.h"
> +
> +static struct kernel_version running_kernel_version;
> +static unsigned int feature_mask;
> +
> +static struct feature kernel_features[] = {
> +     {
> +             .name           = "private_reservations",
> +             .kernel_version = "2.6.27-rc1",
> +     },


... to match the index in here?  If so it would be safer to either make
that explici:

        [HUGETLB_FEAT_PRIVATE_RESV] = { ... }

But that also begs the question will we ever have two different version
streams which might add a feature, or a feature come and go, or indeed
one that goes away only.  Perhaps you might have a .set and .unset
members which add and remove features which would allow any combinations
of features per version, allow features to go away etc, _and_ make the
order of members in this list unimportant.

> +}; 
> +
> +static void debug_kernel_version(void)
> +{
> +     struct kernel_version *ver = &running_kernel_version;
> +
> +     DEBUG("Parsed kernel version: [%u] . [%u] . [%u] ",
> +             ver->major, ver->minor, ver->release);
> +     if (ver->post)
> +             DEBUG_CONT(" [stable-release: %u]\n", ver->post);
> +     else if (ver->pre)
> +             DEBUG_CONT(" [pre-release: %u]\n", ver->pre);
> +     else
> +             DEBUG_CONT("\n");
> +}
> +
> +static int str_to_ver(const char *str, struct kernel_version *ver)
> +{
> +     int err;
> +     int nr_chars;
> +     char extra[4];
> +
> +     /* Clear out version struct */
> +     ver->major = ver->minor = ver->release = ver->post = ver->pre = 0;
> +
> +     /* The kernel always starts x.y.z */
> +     err = sscanf(str, "%u.%u.%u%n", &ver->major, &ver->minor, &ver->release,
> +                     &nr_chars);
> +     /*
> +      * The sscanf man page says that %n may or may not affect the return
> +      * value so make sure it is at least 3 to cover the three kernel
> +      * version variables and assume nr_chars will be correctly assigned.
> +      */
> +     if (err < 3) {
> +             ERROR("Unable to determine base kernel version: %s\n",
> +                     strerror(errno));
> +             return -1;
> +     }
> +
> +     /* Advance the str by the number of characters indicated by sscanf */
> +     str += nr_chars;
> +             
> +     /* Try to match a post/stable version */
> +     err = sscanf(str, ".%u", &ver->post);
> +     if (err == 1)
> +             return 0;
> +
> +     /* Try to match a preN/rcN version */
> +     err = sscanf(str, "-%3[rcpe]%u", extra, &ver->pre);

perhaps -%3[^0-9] 

> +     if ((err == 2) && (!strcmp(extra, "pre") || !strcmp(extra, "rc")))
> +             return 0;
> +     else
> +             ver->pre = 0;

This is a little confusing ... you are saying if it wasn't pre/rc then
reset the ->pre variable to 0.  I think that would be far less
confusing as:

        if ((err != 2) || (strcmp(extra, "pre") != 0 && strcmp(extra, "rc") != 
0)
                ver->pre = 0;

and let it drop through.

> +
> +     /*
> +      * For now we ignore any extraversions besides pre and post versions
> +      * and treat them as equal to the base version.
> +      */
> +     return 0;
> +}
> +
> +static int int_cmp(int a, int b)
> +{
> +     if (a < b)
> +             return -1;
> +     if (b > a)
> +             return 1;
> +     else
> +             return 0;
> +}
> +
> +/*
> + * Pre-release kernels have the following compare rules:
> + *   X.Y.(Z - 1) < X.Y.Z-rcN < X.Y.X
> + * This order can be enforced by simply decrementing the release (for
> + * comparison purposes) when there is a pre/rc modifier in effect.
> + */
> +static int ver_cmp_release(struct kernel_version *ver)
> +{
> +     if (ver->pre)
> +             return ver->release - 1;
> +     else
> +             return ver->release;
> +}
> +
> +static int ver_cmp(struct kernel_version *a, struct kernel_version *b)
> +{
> +     int ret, a_release, b_release;
> +
> +     if ((ret = int_cmp(a->major, b->major)))
> +             return ret;
> +
> +     if ((ret = int_cmp(a->minor, b->minor)))
> +             return ret;
> +
> +     a_release = ver_cmp_release(a);
> +     b_release = ver_cmp_release(b);
> +     if ((ret = int_cmp(a->release, b->release)))
> +             return ret;
> +
> +     if ((ret = int_cmp(a->post, b->post)))
> +             return ret;
> +
> +     if ((ret = int_cmp(a->pre, b->pre)))
> +             return ret;

as the -rcN and .N are effectivly ordered the same you might consider
representing them with one value.  like -1000+rcN and +.N those would
order correctly with 0 for normal releases.

You might also consider just numerically encoding the releases.

major << 24 | minor << 16 | release << 8 | -128 + rcN | .N

Then you can just use numerical comparisons.

> +
> +     /* We ignore forks (such as -mm and -mjb) */
> +     return 0;
> +}
> +
> +int hugetlbfs_test_feature(int feat_code)
> +{
> +     if (feat_code >= HUGETLB_FEAT_NR)
> +             return -EINVAL;
> +     return (feature_mask & feat_code) ? 1 : 0;

As you are comparing against HUGETLB_FEAT_NR the feature codes are 0, 1, 2,
3, 4, 5 etc.  But you then use them directly to &.  For the first feature
this will do nothing me things.  I suspect this should be (feature_mask &
(1 << feature_code)).  Also is there any point in the ?: in there as the
bit value has the correct boolean significance in C anyhow.

> +}
> +
> +void __lh_hugetlbfs_setup_features()

as __lh_ means libhuge do we need the hugetlbfs in here?

> +{
> +     struct utsname u;
> +     int i;
> +
> +     if (uname(&u)) {
> +             ERROR("Getting kernel version failed: %s\n", strerror(errno));
> +             return;
> +     }
> +
> +     str_to_ver(u.release, &running_kernel_version);
> +     debug_kernel_version();
> +
> +     for (i = 0; i < HUGETLB_FEAT_NR; i++) {
> +             struct kernel_version ver;
> +             str_to_ver(kernel_features[i].kernel_version, &ver);
> +
> +             /* Is the running kernel version newer? */
> +             if (ver_cmp(&running_kernel_version, &ver) >= 0) {
> +                     DEBUG("Feature %s is present in this kernel\n",
> +                             kernel_features[i].name);
> +                     feature_mask |= (1UL << i);
> +             }
> +     }               
> +}
> diff --git a/kernel-features.h b/kernel-features.h
> new file mode 100644
> index 0000000..54c0803
> --- /dev/null
> +++ b/kernel-features.h
> @@ -0,0 +1,30 @@
> +/*
> + * libhugetlbfs - Easy use of Linux hugepages
> + * Copyright (C) 2008 Adam Litke, IBM Corporation.
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public License
> + * as published by the Free Software Foundation; either version 2.1 of
> + * the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +struct kernel_version {
> +     unsigned int major;
> +     unsigned int minor;
> +     unsigned int release;
> +     unsigned int post;
> +     unsigned int pre;
> +};
> +
> +struct feature {
> +     char *name;
> +     char *kernel_version;
> +};
> diff --git a/libhugetlbfs_internal.h b/libhugetlbfs_internal.h
> index 595cc6e..814f400 100644
> --- a/libhugetlbfs_internal.h
> +++ b/libhugetlbfs_internal.h
> @@ -46,6 +46,7 @@ extern int __hugetlbfs_prefault;
>  extern void __hugetlbfs_setup_elflink();
>  extern void __hugetlbfs_setup_morecore();
>  extern void __hugetlbfs_setup_debug();
> +extern void __lh_hugetlbfs_setup_features();
>  extern char __hugetlbfs_hostname[];
>  
>  #ifndef REPORT
> diff --git a/morecore.c b/morecore.c
> index 46897aa..6402ab1 100644
> --- a/morecore.c
> +++ b/morecore.c
> @@ -239,6 +239,17 @@ void __hugetlbfs_setup_morecore(void)
>       }
>  
>       /*
> +      * If the kernel supports MAP_PRIVATE reservations, we can skip
> +      * prefaulting the huge pages we allocate for the heap since the
> +      * kernel guarantees them.  This can help NUMA performance quite a bit.
> +      */
> +     if (hugetlbfs_test_feature(HUGETLB_FEAT_PRIVATE_RESV)) {
> +             DEBUG("Kernel has MAP_PRIVATE reservations.  Disabling "
> +                     "heap prefaulting.\n");
> +             __hugetlbfs_prefault = 0;
> +     }
> +
> +     /*
>        * We have been seeing some unexpected behavior from malloc when
>        * heap shrinking is enabled, so heap shrinking is disabled by
>        * default.

-apw

-------------------------------------------------------------------------
This SF.Net email is sponsored by the Moblin Your Move Developer's challenge
Build the coolest Linux based applications with Moblin SDK & win great prizes
Grand prize is a trip for two to an Open Source event anywhere in the world
http://moblin-contest.org/redirect.php?banner_id=100&url=/
_______________________________________________
Libhugetlbfs-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel

Reply via email to