Historically, libhugetlbs has relied on kernel features that either: have been
known to exist in all supported kernel versions, or are easily detected. As of
kernel version 2.6.27-rc1, a new crucial feature has been added that is not
possible to reliably detect. Huge page mappings created with the MAP_PRIVATE
flag will have huge pages reserved up-front. With private reservations in
effect, it is safe to allow demand-faulting of the HUGETLB_MORECORE heap which
can lead to dramatic performance improvements on NUMA systems. This is only
safe behavior in the presence of private reservations.
The only way to identify that a kernel has private reservations support is to
examine the kernel version to see if it is more recent than when the feature
appeared. I am well aware of the drawbacks of using the kernel version to
affect library behavior but I don't see any alternative. I would suggest that
the kernel version should be used only in cases when there is no alternative.
How it works
============
Kernels are assumed to have a mandatory base version x.y.z (eg. 2.6.17) and one
optional modifier: a post version (stable tree x.y.z.q) or a pre version
(x.y.z-{preN|rcN}). All other version appendices (such as -mmN) are ignored.
The following ordering rules apply:
x.y.z-rc(N) < x.y.z-rc(N+1) < x.y.z < x.y.z.(N) < x.y.z.(N+1)
When libhugetlbfs initializes, the running kernel version is probed using
uname. A list of feature definitions is scanned and those with a minimum
kernel version have that version compared to the runninng kernel. If the
running kernel is found to be equal to or greater than the minimum required
kernel version, a bit in a feature mask is set to indicate the presence of the
feature. A feature can be later checked for by using a simple function that
checks the bitmask.
Comments?
---
Makefile | 2 -
hugetlbfs.h | 8 ++
init.c | 1
kernel-features.c | 179 +++++++++++++++++++++++++++++++++++++++++++++++
kernel-features.h | 30 ++++++++
libhugetlbfs_internal.h | 1
morecore.c | 11 +++
7 files changed, 231 insertions(+), 1 deletions(-)
create mode 100644 kernel-features.c
create mode 100644 kernel-features.h
diff --git a/Makefile b/Makefile
index 763b28d..8953b5e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
PREFIX = /usr/local
EXEDIR = /bin
-LIBOBJS = hugeutils.o version.o init.o morecore.o debug.o alloc.o
+LIBOBJS = hugeutils.o version.o init.o morecore.o debug.o alloc.o
kernel-features.o
INSTALL_OBJ_LIBS = libhugetlbfs.so libhugetlbfs.a
BIN_OBJ_DIR=obj
INSTALL_BIN = hugectl hugeedit
diff --git a/hugetlbfs.h b/hugetlbfs.h
index 91d021f..5d0d2bf 100644
--- a/hugetlbfs.h
+++ b/hugetlbfs.h
@@ -49,4 +49,12 @@ typedef unsigned long ghp_t;
void *get_huge_pages(size_t len, ghp_t flags);
void free_huge_pages(void *ptr);
+/* Kernel feature testing */
+enum {
+ /* Reservations are created for private mappings */
+ HUGETLB_FEAT_PRIVATE_RESV,
+ HUGETLB_FEAT_NR,
+};
+int hugetlbfs_test_feature(int feat_code);
+
#endif /* _HUGETLBFS_H */
diff --git a/init.c b/init.c
index e1415f5..fcd41cc 100644
--- a/init.c
+++ b/init.c
@@ -22,6 +22,7 @@
static void __attribute__ ((constructor)) setup_libhugetlbfs(void)
{
__hugetlbfs_setup_debug();
+ __lh_hugetlbfs_setup_features();
#ifndef NO_ELFLINK
__hugetlbfs_setup_elflink();
#endif
diff --git a/kernel-features.c b/kernel-features.c
new file mode 100644
index 0000000..cb79693
--- /dev/null
+++ b/kernel-features.c
@@ -0,0 +1,179 @@
+/*
+ * libhugetlbfs - Easy use of Linux hugepages
+ * Copyright (C) 2008 Adam Litke, IBM Corporation.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/utsname.h>
+#include "kernel-features.h"
+#include "hugetlbfs.h"
+#include "libhugetlbfs_internal.h"
+#include "libhugetlbfs_debug.h"
+
+static struct kernel_version running_kernel_version;
+static unsigned int feature_mask;
+
+static struct feature kernel_features[] = {
+ {
+ .name = "private_reservations",
+ .kernel_version = "2.6.27-rc1",
+ },
+};
+
+static void debug_kernel_version(void)
+{
+ struct kernel_version *ver = &running_kernel_version;
+
+ DEBUG("Parsed kernel version: [%u] . [%u] . [%u] ",
+ ver->major, ver->minor, ver->release);
+ if (ver->post)
+ DEBUG_CONT(" [stable-release: %u]\n", ver->post);
+ else if (ver->pre)
+ DEBUG_CONT(" [pre-release: %u]\n", ver->pre);
+ else
+ DEBUG_CONT("\n");
+}
+
+static int str_to_ver(const char *str, struct kernel_version *ver)
+{
+ int err;
+ int nr_chars;
+ char extra[4];
+
+ /* Clear out version struct */
+ ver->major = ver->minor = ver->release = ver->post = ver->pre = 0;
+
+ /* The kernel always starts x.y.z */
+ err = sscanf(str, "%u.%u.%u%n", &ver->major, &ver->minor, &ver->release,
+ &nr_chars);
+ /*
+ * The sscanf man page says that %n may or may not affect the return
+ * value so make sure it is at least 3 to cover the three kernel
+ * version variables and assume nr_chars will be correctly assigned.
+ */
+ if (err < 3) {
+ ERROR("Unable to determine base kernel version: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ /* Advance the str by the number of characters indicated by sscanf */
+ str += nr_chars;
+
+ /* Try to match a post/stable version */
+ err = sscanf(str, ".%u", &ver->post);
+ if (err == 1)
+ return 0;
+
+ /* Try to match a preN/rcN version */
+ err = sscanf(str, "-%3[rcpe]%u", extra, &ver->pre);
+ if ((err == 2) && (!strcmp(extra, "pre") || !strcmp(extra, "rc")))
+ return 0;
+ else
+ ver->pre = 0;
+
+ /*
+ * For now we ignore any extraversions besides pre and post versions
+ * and treat them as equal to the base version.
+ */
+ return 0;
+}
+
+static int int_cmp(int a, int b)
+{
+ if (a < b)
+ return -1;
+ if (b > a)
+ return 1;
+ else
+ return 0;
+}
+
+/*
+ * Pre-release kernels have the following compare rules:
+ * X.Y.(Z - 1) < X.Y.Z-rcN < X.Y.X
+ * This order can be enforced by simply decrementing the release (for
+ * comparison purposes) when there is a pre/rc modifier in effect.
+ */
+static int ver_cmp_release(struct kernel_version *ver)
+{
+ if (ver->pre)
+ return ver->release - 1;
+ else
+ return ver->release;
+}
+
+static int ver_cmp(struct kernel_version *a, struct kernel_version *b)
+{
+ int ret, a_release, b_release;
+
+ if ((ret = int_cmp(a->major, b->major)))
+ return ret;
+
+ if ((ret = int_cmp(a->minor, b->minor)))
+ return ret;
+
+ a_release = ver_cmp_release(a);
+ b_release = ver_cmp_release(b);
+ if ((ret = int_cmp(a->release, b->release)))
+ return ret;
+
+ if ((ret = int_cmp(a->post, b->post)))
+ return ret;
+
+ if ((ret = int_cmp(a->pre, b->pre)))
+ return ret;
+
+ /* We ignore forks (such as -mm and -mjb) */
+ return 0;
+}
+
+int hugetlbfs_test_feature(int feat_code)
+{
+ if (feat_code >= HUGETLB_FEAT_NR)
+ return -EINVAL;
+ return (feature_mask & feat_code) ? 1 : 0;
+}
+
+void __lh_hugetlbfs_setup_features()
+{
+ struct utsname u;
+ int i;
+
+ if (uname(&u)) {
+ ERROR("Getting kernel version failed: %s\n", strerror(errno));
+ return;
+ }
+
+ str_to_ver(u.release, &running_kernel_version);
+ debug_kernel_version();
+
+ for (i = 0; i < HUGETLB_FEAT_NR; i++) {
+ struct kernel_version ver;
+ str_to_ver(kernel_features[i].kernel_version, &ver);
+
+ /* Is the running kernel version newer? */
+ if (ver_cmp(&running_kernel_version, &ver) >= 0) {
+ DEBUG("Feature %s is present in this kernel\n",
+ kernel_features[i].name);
+ feature_mask |= (1UL << i);
+ }
+ }
+}
diff --git a/kernel-features.h b/kernel-features.h
new file mode 100644
index 0000000..54c0803
--- /dev/null
+++ b/kernel-features.h
@@ -0,0 +1,30 @@
+/*
+ * libhugetlbfs - Easy use of Linux hugepages
+ * Copyright (C) 2008 Adam Litke, IBM Corporation.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+struct kernel_version {
+ unsigned int major;
+ unsigned int minor;
+ unsigned int release;
+ unsigned int post;
+ unsigned int pre;
+};
+
+struct feature {
+ char *name;
+ char *kernel_version;
+};
diff --git a/libhugetlbfs_internal.h b/libhugetlbfs_internal.h
index 595cc6e..814f400 100644
--- a/libhugetlbfs_internal.h
+++ b/libhugetlbfs_internal.h
@@ -46,6 +46,7 @@ extern int __hugetlbfs_prefault;
extern void __hugetlbfs_setup_elflink();
extern void __hugetlbfs_setup_morecore();
extern void __hugetlbfs_setup_debug();
+extern void __lh_hugetlbfs_setup_features();
extern char __hugetlbfs_hostname[];
#ifndef REPORT
diff --git a/morecore.c b/morecore.c
index 46897aa..6402ab1 100644
--- a/morecore.c
+++ b/morecore.c
@@ -239,6 +239,17 @@ void __hugetlbfs_setup_morecore(void)
}
/*
+ * If the kernel supports MAP_PRIVATE reservations, we can skip
+ * prefaulting the huge pages we allocate for the heap since the
+ * kernel guarantees them. This can help NUMA performance quite a bit.
+ */
+ if (hugetlbfs_test_feature(HUGETLB_FEAT_PRIVATE_RESV)) {
+ DEBUG("Kernel has MAP_PRIVATE reservations. Disabling "
+ "heap prefaulting.\n");
+ __hugetlbfs_prefault = 0;
+ }
+
+ /*
* We have been seeing some unexpected behavior from malloc when
* heap shrinking is enabled, so heap shrinking is disabled by
* default.
-------------------------------------------------------------------------
This SF.Net email is sponsored by the Moblin Your Move Developer's challenge
Build the coolest Linux based applications with Moblin SDK & win great prizes
Grand prize is a trip for two to an Open Source event anywhere in the world
http://moblin-contest.org/redirect.php?banner_id=100&url=/
_______________________________________________
Libhugetlbfs-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel