Historically, libhugetlbfs has relied on kernel features that either have been
known to exist in all supported kernel versions or are easily detected.  As of
kernel version 2.6.27-rc1, a new crucial feature has been added that is not
possible to reliably detect.  Huge page mappings created with the MAP_PRIVATE
flag will have huge pages reserved up-front.  With private reservations in
effect, it is safe to allow demand-faulting of the HUGETLB_MORECORE heap which
can lead to dramatic performance improvements on NUMA systems.  This is only
safe behavior in the presence of private reservations.

The only way to identify that a kernel has private reservations support is to
examine the kernel version to see if it is more recent than when the feature
appeared.  I am well aware of the drawbacks of using the kernel version to
affect library behavior but I don't see any alternative.  I would suggest that
the kernel version should be used only in cases when there is no alternative.

How it works
============

Kernels are assumed to have a mandatory base version x.y.z (eg. 2.6.17) and one
optional modifier: a post version (stable tree x.y.z.q) or a pre version
(x.y.z-{preN|rcN}).  All other version appendices (such as -mmN) are ignored.

The following ordering rules apply:
        x.y.z-rc(N) < x.y.z-rc(N+1) < x.y.z < x.y.z.(N) < x.y.z.(N+1)

When libhugetlbfs initializes, the running kernel version is probed using
uname.  A list of feature definitions is scanned and those with a minimum
kernel version have that version compared to the running kernel.  If the
running kernel is found to be equal to or greater than the minimum required
kernel version, a bit in a feature mask is set to indicate the presence of the
feature.  A feature can be later checked for by using a simple function that
checks the bitmask.


Changes since V1 (Thanks Andy Whitcroft and Mel Gorman):
 - Fixed feature_mask handling
 - Readability improvements
---

 Makefile                |    2 -
 hugetlbfs.h             |    9 ++
 init.c                  |    1 
 kernel-features.c       |  181 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel-features.h       |   30 ++++++++
 libhugetlbfs_internal.h |    1 
 morecore.c              |   11 +++
 7 files changed, 234 insertions(+), 1 deletions(-)
 create mode 100644 kernel-features.c
 create mode 100644 kernel-features.h


diff --git a/Makefile b/Makefile
index 763b28d..8953b5e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 PREFIX = /usr/local
 EXEDIR = /bin
 
-LIBOBJS = hugeutils.o version.o init.o morecore.o debug.o alloc.o
+LIBOBJS = hugeutils.o version.o init.o morecore.o debug.o alloc.o kernel-features.o
 INSTALL_OBJ_LIBS = libhugetlbfs.so libhugetlbfs.a
 BIN_OBJ_DIR=obj
 INSTALL_BIN = hugectl hugeedit
diff --git a/hugetlbfs.h b/hugetlbfs.h
index 91d021f..5f73468 100644
--- a/hugetlbfs.h
+++ b/hugetlbfs.h
@@ -49,4 +49,13 @@ typedef unsigned long ghp_t;
 void *get_huge_pages(size_t len, ghp_t flags);
 void free_huge_pages(void *ptr);
 
+/* Kernel feature testing */
+/* Each enumerator is the bit index of one feature in the feature bitmask */
+enum {
+       /* Reservations are created for private mappings */
+       HUGETLB_FEATURE_PRIVATE_RESV,
+       HUGETLB_FEATURE_NR,     /* number of feature codes; not a feature */
+};
+int hugetlbfs_test_feature(int feature_code);
+
 #endif /* _HUGETLBFS_H */
diff --git a/init.c b/init.c
index e1415f5..51ad27c 100644
--- a/init.c
+++ b/init.c
@@ -22,6 +22,7 @@
 static void __attribute__ ((constructor)) setup_libhugetlbfs(void)
 {
        __hugetlbfs_setup_debug();
+       __lh_setup_features();
 #ifndef NO_ELFLINK
        __hugetlbfs_setup_elflink();
 #endif
diff --git a/kernel-features.c b/kernel-features.c
new file mode 100644
index 0000000..1b4508c
--- /dev/null
+++ b/kernel-features.c
@@ -0,0 +1,181 @@
+/*
+ * libhugetlbfs - Easy use of Linux hugepages
+ * Copyright (C) 2008 Adam Litke, IBM Corporation.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/utsname.h>
+#include "kernel-features.h"
+#include "hugetlbfs.h"
+#include "libhugetlbfs_internal.h"
+#include "libhugetlbfs_debug.h"
+
+static struct kernel_version running_kernel_version;
+
+/* Bit i set => feature i present.  Should be 32 bits on every platform */
+static unsigned int feature_mask;
+
+static struct feature kernel_features[] = {
+       [HUGETLB_FEATURE_PRIVATE_RESV] = {
+               .name                   = "private_reservations",
+               .required_version       = "2.6.27-rc1", /* minimum version */
+       },
+}; 
+
+/* Log the parsed running kernel version plus its post or pre modifier. */
+static void debug_kernel_version(void)
+{
+       const struct kernel_version *v = &running_kernel_version;
+       DEBUG("Parsed kernel version: [%u] . [%u] . [%u] ",
+               v->major, v->minor, v->release);
+       if (v->post != 0)
+               DEBUG_CONT(" [post-release: %u]\n", v->post);
+       else if (v->pre != 0)
+               DEBUG_CONT(" [pre-release: %u]\n", v->pre);
+       else
+               DEBUG_CONT("\n");
+}
+
+/*
+ * Parse a kernel release string (x.y.z, x.y.z.q or x.y.z-{pre,rc}N) into
+ * *ver.  Returns 0, or -1 if the mandatory x.y.z part cannot be parsed.
+ */
+static int str_to_ver(const char *str, struct kernel_version *ver)
+{
+       int err;
+       int nr_chars = 0;
+       char extra[4];
+
+       /* Clear out version struct */
+       ver->major = ver->minor = ver->release = ver->post = ver->pre = 0;
+
+       /* The kernel always starts x.y.z */
+       err = sscanf(str, "%u.%u.%u%n", &ver->major, &ver->minor, &ver->release,
+                       &nr_chars);
+       /*
+        * %n may or may not be counted in the return value, so only require
+        * the three version fields.  A matching failure does not set errno,
+        * so report the offending string instead of strerror(errno).
+        */
+       if (err < 3) {
+               ERROR("Unable to determine base kernel version: %s\n", str);
+               return -1;
+       }
+
+       /* Skip the base version, then try to match a post/stable version */
+       str += nr_chars;
+       if (sscanf(str, ".%u", &ver->post) == 1)
+               return 0;
+
+       /* Try to match a preN/rcN version */
+       err = sscanf(str, "-%3[^0-9]%u", extra, &ver->pre);
+       if (err != 2 || (strcmp(extra, "pre") != 0 && strcmp(extra, "rc") != 0))
+               ver->pre = 0;
+
+       /*
+        * Any other extraversion is ignored, which makes such a kernel
+        * compare equal to its base version.
+        */
+       return 0;
+}
+
+/* Three-way compare: returns -1, 0 or 1 as a is less, equal or greater. */
+static int int_cmp(int a, int b)
+{
+       if (a < b)
+               return -1;
+       if (a > b)
+               return 1;
+       return 0;
+}
+
+/*
+ * Pre-release ordering is X.Y.(Z - 1) < X.Y.Z-rcN < X.Y.Z, so for
+ * comparison a version with a pre/rc modifier uses its release minus one.
+ * Returns that effective release number.
+ */
+static int ver_cmp_release(struct kernel_version *ver)
+{
+       unsigned int release = ver->release;
+
+       if (ver->pre)
+               release--;
+       return release;
+}
+
+/*
+ * Three-way compare of two kernel versions: the result is negative, zero
+ * or positive as a is older than, equal to or newer than b.
+ */
+static int ver_cmp(struct kernel_version *a, struct kernel_version *b)
+{
+       int ret;
+
+       if ((ret = int_cmp(a->major, b->major)) != 0)
+               return ret;
+       if ((ret = int_cmp(a->minor, b->minor)) != 0)
+               return ret;
+       if ((ret = int_cmp(ver_cmp_release(a), ver_cmp_release(b))) != 0)
+               return ret;
+
+       /*
+        * Equal effective releases: X.Y.Z-rcN is newer than any stable
+        * X.Y.(Z-1).N, so pre must be compared before post.  A version
+        * never carries both.  Forks (such as -mm and -mjb) are ignored.
+        */
+       if ((ret = int_cmp(a->pre, b->pre)) != 0)
+               return ret;
+       return int_cmp(a->post, b->post);
+}
+
+/* Nonzero if the feature is present, 0 if absent, -EINVAL for a bad code */
+int hugetlbfs_test_feature(int feature_code)
+{
+       if (feature_code >= 0 && feature_code < HUGETLB_FEATURE_NR)
+               return feature_mask & (1U << feature_code);
+       ERROR("hugetlbfs_test_feature: invalid feature code\n");
+       return -EINVAL;
+}
+
+/* Parse the running kernel version and set feature_mask bits to match */
+void __lh_setup_features()
+{
+       struct utsname u;
+       int i;
+
+       if (uname(&u)) {
+               ERROR("Getting kernel version failed: %s\n", strerror(errno));
+               return;
+       }
+       /* A version that fails to parse must never enable a feature */
+       if (str_to_ver(u.release, &running_kernel_version))
+               return;
+       debug_kernel_version();
+
+       for (i = 0; i < HUGETLB_FEATURE_NR; i++) {
+               struct kernel_version ver;
+               if (str_to_ver(kernel_features[i].required_version, &ver) ||
+                   ver_cmp(&running_kernel_version, &ver) < 0)
+                       continue;
+               DEBUG("Feature %s is present in this kernel\n",
+                       kernel_features[i].name);
+               feature_mask |= (1U << i);
+       }
+}
diff --git a/kernel-features.h b/kernel-features.h
new file mode 100644
index 0000000..e1b6ca9
--- /dev/null
+++ b/kernel-features.h
@@ -0,0 +1,30 @@
+/*
+ * libhugetlbfs - Easy use of Linux hugepages
+ * Copyright (C) 2008 Adam Litke, IBM Corporation.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+struct kernel_version {        /* parsed kernel release string */
+       unsigned int major;     /* x in x.y.z */
+       unsigned int minor;     /* y in x.y.z */
+       unsigned int release;   /* z in x.y.z */
+       unsigned int post;      /* q in stable x.y.z.q; 0 if absent */
+       unsigned int pre;       /* N in x.y.z-preN or -rcN; 0 if absent */
+};
+
+struct feature {               /* one entry per HUGETLB_FEATURE_* code */
+       char *name;             /* label printed in debug output */
+       char *required_version; /* minimum kernel version string */
+};
diff --git a/libhugetlbfs_internal.h b/libhugetlbfs_internal.h
index 595cc6e..38bc615 100644
--- a/libhugetlbfs_internal.h
+++ b/libhugetlbfs_internal.h
@@ -46,6 +46,7 @@ extern int __hugetlbfs_prefault;
 extern void __hugetlbfs_setup_elflink();
 extern void __hugetlbfs_setup_morecore();
 extern void __hugetlbfs_setup_debug();
+extern void __lh_setup_features();
 extern char __hugetlbfs_hostname[];
 
 #ifndef REPORT
diff --git a/morecore.c b/morecore.c
index 46897aa..85d9371 100644
--- a/morecore.c
+++ b/morecore.c
@@ -239,6 +239,17 @@ void __hugetlbfs_setup_morecore(void)
        }
 
        /*
+        * If the kernel supports MAP_PRIVATE reservations, we can skip
+        * prefaulting the huge pages we allocate for the heap since the
+        * kernel guarantees them.  This can help NUMA performance quite a bit.
+        */
+       if (hugetlbfs_test_feature(HUGETLB_FEATURE_PRIVATE_RESV)) {
+               DEBUG("Kernel has MAP_PRIVATE reservations.  Disabling "
+                       "heap prefaulting.\n");
+               __hugetlbfs_prefault = 0;
+       }
+
+       /*
         * We have been seeing some unexpected behavior from malloc when
         * heap shrinking is enabled, so heap shrinking is disabled by
         * default.


-------------------------------------------------------------------------
This SF.Net email is sponsored by the Moblin Your Move Developer's challenge
Build the coolest Linux based applications with Moblin SDK & win great prizes
Grand prize is a trip for two to an Open Source event anywhere in the world
http://moblin-contest.org/redirect.php?banner_id=100&url=/
_______________________________________________
Libhugetlbfs-devel mailing list
Libhugetlbfs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel

Reply via email to