Hi,

Here is my attempt at reinstating the mlocking guarantee for morecore.
The issue previously was that we would fault in hugepages on the current
node only, leading to terrible NUMA performance. Instead, we now check
the current mempolicy and if it's DEFAULT (which, according to the mbind
man page, means "Unless the process policy has been changed this means to
allocate memory on the node of the CPU that triggered the allocation.")
we change it to INTERLEAVE. I think we want to respect the policy if
it's BIND or PREFERRED, although maybe only the latter is really
important.

The NUMA API man-pages are really bad, so I'll probably spend some time
now creating patches for them, based upon my reading of the
corresponding kernel code.

Unfortunately, calling get_mempolicy() and mbind() directly would
introduce a dependency on libnuma, as otherwise those calls have no
definition :( So I'm emulating them with indirect syscalls via syscall().

I'm going to go and test this now on a non-NUMA machine until I can find
access to a larger NUMA machine where this might make a difference, but
wanted to get the patch out there, because I'm not entirely sure I know
what I'm doing :)

This is purely an RFC right now; I'm not requesting inclusion, so it is
not Signed-off.

---

diff --git a/morecore.c b/morecore.c
index 9f13316..ae446e2 100644
--- a/morecore.c
+++ b/morecore.c
@@ -26,6 +26,9 @@
 #include <sys/mman.h>
 #include <errno.h>
 #include <dlfcn.h>
+#include <string.h>
+#include <numaif.h>
+#include <sys/syscall.h>
 
 #include "hugetlbfs.h"
 
@@ -49,10 +52,54 @@ static long mapsize;
  * go back to small pages and use mmap to get them.  Hurrah.
  */
 
+static int guarantee_memory(void *p, long size)
+{
+       int policy, ret;
+       unsigned long nodemask;
+       unsigned long maxnode = sizeof(nodemask) * 8 + 1;
+       unsigned long flags = MPOL_F_ADDR;
+
+       ret = syscall(__NR_get_mempolicy, &policy, &nodemask, maxnode, p, flags);
+       if (ret < 0) {
+               WARNING("get_mempolicy() failed in guarantee_memory(): %s\n",
+                                                       strerror(errno));
+               return ret;
+       }
+
+       /*
+        * If the NUMA policy hasn't been set, then we default to
+        * interleaving at fault-time to avoid having all the hugepages
+        * being allocated on the current node.
+        */
+       if (policy == MPOL_DEFAULT) {
+               printf("DEFAULT policy: nodemask = %lx, maxnode = %lu\n",
+                                                       nodemask, maxnode);
+               policy = MPOL_INTERLEAVE;
+       }
+
+       ret = syscall(__NR_mbind, p, size, policy, &nodemask, maxnode, flags);
+       if (ret < 0) {
+               WARNING("mbind() failed in guarantee_memory(): %s\n",
+                                                       strerror(errno));
+               return ret;
+       }
+
+       ret = mlock(p, size);
+       if (ret < 0) {
+               WARNING("mlock() failed in guarantee_memory(): %s\n",
+                                                       strerror(errno));
+               return ret;
+       }
+       munlock(p, size);
+
+       return 0;
+}
+
 static void *hugetlbfs_morecore(ptrdiff_t increment)
 {
        void *p;
        long newsize = 0;
+       int ret;
 
        DEBUG("hugetlbfs_morecore(%ld) = ...\n", (long)increment);
 
@@ -86,20 +133,13 @@ static void *hugetlbfs_morecore(ptrdiff_
                        return NULL;
                }
 
-#if 0
-/* Use of mlock is disabled because it results in bad numa behavior since
- * the malloc'd memory is allocated node-local to the cpu calling morecore()
- * and not to the cpu(s) that are actually using the memory.
- */
-               /* Use mlock to guarantee these pages to the process */
-               ret = mlock(p, newsize);
-               if (ret) {
+               /* Use mbind and mlock to guarantee these pages to the process */
+               ret = guarantee_memory(p, newsize);
+               if (ret < 0) {
                        WARNING("Failed to reserve huge pages in hugetlbfs_morecore()\n");
                        munmap(p, newsize);
                        return NULL;
                }
-               munlock(p, newsize);
-#endif
 
                mapsize += newsize;
        }

-- 
Nishanth Aravamudan <[EMAIL PROTECTED]>
IBM Linux Technology Center

-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Libhugetlbfs-devel mailing list
Libhugetlbfs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel

Reply via email to