On 22.08.2006 [19:08:40 -0700], Nishanth Aravamudan wrote: > Hi, > > Here is my attempt at reinstating the mlocking guarantee for morecore. > The issue previously was that we would fault in hugepages on the current > node only, leading to terrible NUMA performance. Instead, we now check > the current mempolicy and if it's DEFAULT (which according to the man mbind > page means "Unless the process policy has been changed this means to > allocate memory on the node of the CPU that triggered the allocation.") > we change it to INTERLEAVE. I think we want to respect the policy if > it's BIND or PREFERRED, although maybe only the latter is really > important. > > The NUMA API man-pages are really bad, so I'll probably spend some time > now creating patches for them, based upon my reading of the > corresponding kernel code. > > Unfortunately, this would introduce a dependency on libnuma, as > otherwise the get_mempolicy() and mbind() calls have no definition :( So > I'm emulating them with indirect syscalls. > > I'm going to go and test this now on a non-NUMA machine until I can find > access to a larger NUMA machine where this might make a difference, but > wanted to get the patch out there, because I'm not entirely sure I know > what I'm doing :) > > Completely only an RFC right now, not requesting inclusion, so not > Signed-off.
Second try, compile-tested and run-tested on a non-NUMA machine (passes make func). Will hopefully have time to test on a NUMA box tomorrow. Still not Signed-off. I was really trying to avoid using libnuma, but I ended up just stealing code from it. I've trimmed it down to the minimal amount, but am open for further (or better) suggestions. Thanks, Nish diff --git a/morecore.c b/morecore.c index 9f13316..01ac8ae 100644 --- a/morecore.c +++ b/morecore.c @@ -26,6 +26,11 @@ #include <sys/mman.h> #include <errno.h> #include <dlfcn.h> +#include <string.h> +#include <numaif.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <dirent.h> #include "hugetlbfs.h" @@ -49,10 +54,94 @@ static long mapsize; * go back to small pages and use mmap to get them. Hurrah. */ +#if defined(__x86_64__) || defined(__i386__) +#define NUMA_NUM_NODES 128 +#else +#define NUMA_NUM_NODES 2048 +#endif + +#ifndef BITS_PER_LONG +#define BITS_PER_LONG (8*sizeof(unsigned long)) +#endif + +static int respect_policy; +/* adapted from libnuma source */ +static int numa_is_available; +static unsigned long nodemask[NUMA_NUM_NODES/BITS_PER_LONG]; + +static int numa_max_node(void) +{ + DIR *d; + struct dirent *de; + int found, maxnode = 0; + + d = opendir("/sys/devices/system/node"); + if (!d) + return 0; + found = 0; + while ((de = readdir(d)) != NULL) { + int nd; + if (strncmp(de->d_name, "node", 4)) + continue; + found++; + nd = strtoul(de->d_name+4, NULL, 0); + if (maxnode < nd) + maxnode = nd; + } + closedir(d); + if (found == 0) + return 0; + return maxnode; +} + +static void setup_numa_if_available(void) +{ + int i, maxnode; + + if (syscall(__NR_get_mempolicy, NULL, NULL, 0, 0, 0) < 0 + && errno == ENOSYS) { + numa_is_available = 0; + return; + } + + numa_is_available = 1; + + maxnode = numa_max_node(); + for (i = 0; i <= maxnode; i++) + nodemask[i / BITS_PER_LONG] |= (1UL<<(i%BITS_PER_LONG)); +} + +static int guarantee_memory(void *p, long size) +{ + int ret; + + /* + * Override the NUMA 
policy unless told not to by the environment + * + * Default to interleaving at fault-time to avoid having all the + * hugepages being allocated on the current node. + */ + if (numa_is_available && (respect_policy == 0)) + if (syscall(__NR_mbind, p, size, MPOL_INTERLEAVE, nodemask, + NUMA_NUM_NODES+1, 0) < 0) + WARNING("mbind() failed: %s\n", strerror(errno)); + + ret = mlock(p, size); + if (ret < 0) { + WARNING("mlock() failed: %s\n", + strerror(errno)); + return ret; + } + munlock(p, size); + + return 0; +} + static void *hugetlbfs_morecore(ptrdiff_t increment) { void *p; long newsize = 0; + int ret; DEBUG("hugetlbfs_morecore(%ld) = ...\n", (long)increment); @@ -86,20 +175,14 @@ static void *hugetlbfs_morecore(ptrdiff_ return NULL; } -#if 0 -/* Use of mlock is disabled because it results in bad numa behavior since - * the malloc'd memory is allocated node-local to the cpu calling morecore() - * and not to the cpu(s) that are actually using the memory. - */ - /* Use mlock to guarantee these pages to the process */ - ret = mlock(p, newsize); - if (ret) { + + /* Use mbind and mlock to guarantee these pages to the process */ + ret = guarantee_memory(p, newsize); + if (ret < 0) { WARNING("Failed to reserve huge pages in hugetlbfs_morecore()\n"); munmap(p, newsize); return NULL; } - munlock(p, newsize); -#endif mapsize += newsize; } @@ -120,6 +203,10 @@ static void __attribute__((constructor)) if (! env) return; + env = getenv("HUGETLB_MORECORE_RESPECT_POLICY"); + if (env) + respect_policy = atoi(env); + blocksize = gethugepagesize(); if (! blocksize) { ERROR("Hugepages unavailable\n"); @@ -157,4 +244,6 @@ static void __attribute__((constructor)) * This doesn't appear to prohibit malloc() from falling back * to mmap() if we run out of hugepages. 
*/ mallopt(M_MMAP_MAX, 0); + + setup_numa_if_available(); } -- Nishanth Aravamudan <[EMAIL PROTECTED]> IBM Linux Technology Center ------------------------------------------------------------------------- Using Tomcat but need to do more? Need to support web services, security? Get stuff done quickly with pre-integrated technology to make your job easier Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642 _______________________________________________ Libhugetlbfs-devel mailing list Libhugetlbfs-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel