On 22.08.2006 [19:08:40 -0700], Nishanth Aravamudan wrote:
> Hi,
> 
> Here is my attempt at reinstating the mlocking guarantee for morecore.
> The issue previously was that we would fault in hugepages on the current
> node only, leading to terrible NUMA performance. Instead, we now check
> the current mempolicy and if it's DEFAULT (which, according to the mbind
> man page, means "Unless the process policy has been changed this means to
> allocate memory on the node of the CPU that triggered the allocation.")
> we change it to INTERLEAVE. I think we want to respect the policy if
> it's BIND or PREFERRED, although maybe only the latter is really
> important.
> 
> The NUMA API man-pages are really bad, so I'll probably spend some time
> now creating patches for them, based upon my reading of the
> corresponding kernel code.
> 
> Unfortunately, this would introduce a dependency on libnuma, as
> otherwise the get_mempolicy() and mbind() calls have no definition :( So
> I'm emulating them with indirect syscalls.
> 
> I'm going to go and test this now on a non-NUMA machine until I can find
> access to a larger NUMA machine where this might make a difference, but
> wanted to get the patch out there, because I'm not entirely sure I know
> what I'm doing :)
> 
> This is strictly an RFC right now; I'm not requesting inclusion, so it is
> not Signed-off.

Second try, compile-tested and run-tested on a non-NUMA machine (passes
make func). Will hopefully have time to test on a NUMA box tomorrow.

Still not Signed-off.

I was really trying to avoid using libnuma, but I ended up just stealing
code from it. I've trimmed it down to the minimal amount, but am open
for further (or better) suggestions.

Thanks,
Nish

diff --git a/morecore.c b/morecore.c
index 9f13316..01ac8ae 100644
--- a/morecore.c
+++ b/morecore.c
@@ -26,6 +26,11 @@
 #include <sys/mman.h>
 #include <errno.h>
 #include <dlfcn.h>
+#include <string.h>
+#include <numaif.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <dirent.h>
 
 #include "hugetlbfs.h"
 
@@ -49,10 +54,94 @@ static long mapsize;
  * go back to small pages and use mmap to get them.  Hurrah.
  */
 
+#if defined(__x86_64__) || defined(__i386__)
+#define NUMA_NUM_NODES  128	/* bit capacity of nodemask[] below */
+#else
+#define NUMA_NUM_NODES  2048
+#endif
+
+#ifndef BITS_PER_LONG
+#define BITS_PER_LONG  (8*sizeof(unsigned long))
+#endif
+
+static int respect_policy;	/* nonzero: guarantee_memory() skips its mbind() */
+/* adapted from libnuma source */
+static int numa_is_available;	/* set by setup_numa_if_available() */
+static unsigned long nodemask[NUMA_NUM_NODES/BITS_PER_LONG];	/* one bit per node id */
+
+/* Returns the highest node id listed in /sys/devices/system/node, or 0 if none can be read. */
+static int numa_max_node(void)
+{
+       struct dirent *entry;
+       int highest = 0;
+       DIR *dir = opendir("/sys/devices/system/node");
+
+       if (dir == NULL)
+               return 0;
+
+       /* Only "node<N>" entries matter; <N> is parsed from the name suffix. */
+       for (entry = readdir(dir); entry != NULL; entry = readdir(dir)) {
+               int id;
+
+               if (strncmp(entry->d_name, "node", 4) != 0)
+                       continue;
+               id = strtoul(entry->d_name + 4, NULL, 0);
+               if (id > highest)
+                       highest = id;
+       }
+       closedir(dir);
+       /* highest is still 0 when no "node<N>" entry was seen */
+       return highest;
+}
+
+/* Detect NUMA policy support (probe fails with ENOSYS if absent), then fill nodemask. */
+static void setup_numa_if_available(void)
+{
+       int i, maxnode;
+
+       numa_is_available = !(syscall(__NR_get_mempolicy, NULL, NULL, 0, 0, 0) < 0
+                                                       && errno == ENOSYS);
+       if (!numa_is_available)
+               return;
+
+       maxnode = numa_max_node();
+       if (maxnode >= NUMA_NUM_NODES)  /* clamp: never index past nodemask[] */
+               maxnode = NUMA_NUM_NODES - 1;
+       for (i = 0; i <= maxnode; i++)
+               nodemask[i / BITS_PER_LONG] |= (1UL<<(i%BITS_PER_LONG));
+}
+
+/* Guarantee [p, p+size) to the process by mlock()ing it; returns 0, or mlock()'s result (< 0). */
+static int guarantee_memory(void *p, long size)
+{
+       int rc;
+
+       /*
+        * Unless the environment asked us to respect the current NUMA
+        * policy, interleave the range across nodes so that faulting it
+        * in below does not put every hugepage on the current node.  A
+        * failed mbind() is only reported; locking is attempted anyway.
+        */
+       if (numa_is_available && !respect_policy &&
+           syscall(__NR_mbind, p, size, MPOL_INTERLEAVE, nodemask,
+                                               NUMA_NUM_NODES+1, 0) < 0)
+               WARNING("mbind() failed: %s\n", strerror(errno));
+
+       rc = mlock(p, size);
+       if (rc < 0) {
+               WARNING("mlock() failed: %s\n", strerror(errno));
+               return rc;
+       }
+       munlock(p, size);       /* the lock itself is not kept */
+
+       return 0;
+}
+
 static void *hugetlbfs_morecore(ptrdiff_t increment)
 {
        void *p;
        long newsize = 0;
+       int ret;
 
        DEBUG("hugetlbfs_morecore(%ld) = ...\n", (long)increment);
 
@@ -86,20 +175,14 @@ static void *hugetlbfs_morecore(ptrdiff_
                        return NULL;
                }
 
-#if 0
-/* Use of mlock is disabled because it results in bad numa behavior since
- * the malloc'd memory is allocated node-local to the cpu calling morecore()
- * and not to the cpu(s) that are actually using the memory.
- */
-               /* Use mlock to guarantee these pages to the process */
-               ret = mlock(p, newsize);
-               if (ret) {
+
+               /* Use mbind and mlock to guarantee these pages to the process */
+               ret = guarantee_memory(p, newsize);
+               if (ret < 0) {
                        WARNING("Failed to reserve huge pages in hugetlbfs_morecore()\n");
                        munmap(p, newsize);
                        return NULL;
                }
-               munlock(p, newsize);
-#endif
 
                mapsize += newsize;
        }
@@ -120,6 +203,10 @@ static void __attribute__((constructor))
        if (! env)
                return;
 
+       env = getenv("HUGETLB_MORECORE_RESPECT_POLICY");
+       if (env)
+               respect_policy = atoi(env);
+
        blocksize = gethugepagesize();
        if (! blocksize) {
                ERROR("Hugepages unavailable\n");
@@ -157,4 +244,6 @@ static void __attribute__((constructor))
         * This doesn't appear to prohibit malloc() from falling back
         * to mmap() if we run out of hugepages. */
        mallopt(M_MMAP_MAX, 0);
+
+       setup_numa_if_available();
 }

-- 
Nishanth Aravamudan <[EMAIL PROTECTED]>
IBM Linux Technology Center

-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Libhugetlbfs-devel mailing list
Libhugetlbfs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel

Reply via email to