From 8df81d73adee7dfae324445df089beeaec4d88df Mon Sep 17 00:00:00 2001
From: Doug Gilmore <Doug.Gilmore@amd.com>
Date: Thu, 23 Aug 2012 14:22:25 -0700
Subject: [PATCH 2/2] Change how libhugetlbfs handles a memory allocation that
 exceeds a huge page limit setting.

When a program segment or heap allocation would exceed a
huge page limit set via an environment variable (see
below), the allocation now uses huge pages until the limit
is exhausted, and small pages are then used to satisfy the
rest of the allocation.
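
For illustration, the split amounts to the following hedged
sketch (the function and variable names are hypothetical,
not the ones used in the patch):

    /* Sketch: split an allocation needing "pages_needed" huge
     * pages once "used" pages of a "limit"-page budget have
     * already been consumed.
     */
    static void split_alloc(long pages_needed, long limit, long used,
                            long *hp_pages, long *sp_pages)
    {
            *hp_pages = limit - used;             /* exhaust the budget */
            *sp_pages = pages_needed - *hp_pages; /* rest on small pages */
    }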

When using huge pages to map program segments, the size
checks now properly account for COW faults on the pages for
initialized data.
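
Roughly, the corrected accounting behaves like this sketch
(again with illustrative names, not the patch's own):

    /* With a private mapping, a COW fault on a writable
     * (initialized data) segment can instantiate a second huge
     * page for each one mapped, so the worst case is doubled.
     */
    static long pages_charged(long seg_hpages, int writable, int is_private)
    {
            long n = seg_hpages;
            if (is_private && writable)
                    n += seg_hpages;   /* room for the COW copies */
            return n;
    }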

The environment variable HUGETLB_LIMIT controls the number
of huge pages that can be mapped by a process via program
segment mapping and heap allocation.  This environment
variable is now also inspected when program segments are
being mapped via huge pages.  Previously only the
environment variable HUGETLB_ELF_LIMIT was inspected when
mapping program segments.  If both environment variables
are set, then the effective huge page limit for mapping
program segments is the minimum of the two settings.
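
In code, the effective limit amounts to the sketch below,
which mirrors the getenv() handling the patch adds (the
helper name is hypothetical):

    #include <stdlib.h>

    /* Fold HUGETLB_LIMIT into the ELF segment limit: when both
     * variables are set, the effective limit is the minimum.
     */
    static long effective_elf_limit(long elf_limit, long avail)
    {
            char *env = getenv("HUGETLB_LIMIT");
            if (env != NULL) {
                    long n = atol(env);
                    if (n >= 0 && n < avail && n < elf_limit)
                            elf_limit = n;
            }
            return elf_limit;
    }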

This means that setting HUGETLB_LIMIT to 0 now completely
disables the use of huge pages.  Previously one also needed
to set HUGETLB_ELF_LIMIT to 0 to achieve this.
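
For example, an invocation such as HUGETLB_LIMIT=0 ./prog
(a hypothetical program) should now place all segments and
the heap entirely on small pages, with no need to also
export HUGETLB_ELF_LIMIT=0.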

Some of the comments have also been cleaned up.
---
 osprey/libhugetlbfs/elflink.c  |  105 ++++++++++++++++++++++++++++++++++++----
 osprey/libhugetlbfs/morecore.c |   41 ++++++++++++++--
 2 files changed, 131 insertions(+), 15 deletions(-)

diff --git a/osprey/libhugetlbfs/elflink.c b/osprey/libhugetlbfs/elflink.c
index 330e141..5946c4f 100644
--- a/osprey/libhugetlbfs/elflink.c
+++ b/osprey/libhugetlbfs/elflink.c
@@ -67,6 +67,7 @@
 static long hugepages_total = 0;
 static long hugepages_elf_limit = -1;
 static long hugepages_avail;
+static int map_private;
 
 #endif
 
@@ -165,6 +166,15 @@ static char share_path[PATH_MAX+1];
 struct seg_info {
 	void *vaddr;
 	unsigned long filesz, memsz, extrasz;
+#ifdef OPEN64_MOD
+	/* If the huge page limit set by one of the environment
+	 * variables HUGETLB_LIMIT or HUGETLB_ELF_LIMIT (the minimum
+	 * of the two if both are set) is less than the number of
+	 * pages needed, then spsz is the residual size that will
+	 * be allocated with small pages.
+	 */
+	unsigned long spsz;
+#endif
 	int prot;
 	int fd;
 	int index;
@@ -987,12 +997,15 @@ static void remap_segments(struct seg_info *seg, int num)
         unsigned long oldbrk;
         unsigned long seg_start = 0;
         unsigned total_pages = 0;
-	int map_private = hugepage_elf_stype == SIZE_2M;
+        char *env;
+        long limit = hugepages_elf_limit;
+
+        if ((env = getenv("HUGETLB_LIMIT")) != NULL) {
+            long val = atol(env);
+            if (limit > val)
+                limit = val;
+        }
 
-	if (getenv("HUGETLB_ELF_MAP_SHARED") != NULL) {
-	    DEBUG("HUGETLB_ELF_MAP_SHARED set\n");
-	    map_private = 0;
-	}
         newbrk = oldbrk = (unsigned long) sbrk(0);
         DEBUG("Old brk=0x%lx\n", oldbrk);
 
@@ -1014,7 +1027,6 @@ static void remap_segments(struct seg_info *seg, int num)
                 seg_start = slice_start;
 
 	    if (seg[i].prot & PROT_WRITE) {
-		char *env;
 		int npages;
 		unsigned long memsz;
 
@@ -1051,9 +1063,27 @@ static void remap_segments(struct seg_info *seg, int num)
 		}
             }
 	    pages = (seg[i].memsz + hpage_size - 1 ) / hpage_size ;
-            DEBUG("reserving %ld pages for segment %d\n", pages, i);
+	    if (pages + total_pages > limit) {
+		if (! (seg[i].prot & PROT_WRITE)) {
+		    unmapped_abort("HUGETLB_LIMIT constrains text allocation");
+		}
+		/* Make sure we don't zero any initialized data.
+		 */
+		if (hugepage_elf_stype == SIZE_2M && cow_pages + total_pages > limit)
+		    unmapped_abort("HUGETLB_LIMIT constrains initialized data allocation");
+		long hp_pages = limit - total_pages;
+		/* Note that pages = hp_pages + sp_pages, thus */
+		long sp_pages = pages - hp_pages;
+		DEBUG("mapping %ld huge pages for segment %d\n", hp_pages, i);
+		DEBUG("due to limit, %ld pages will not be mapped as huge\n", sp_pages);
+		seg[i].memsz = hp_pages * hpage_size;
+		seg[i].spsz = sp_pages * hpage_size;
+		pages = hp_pages;
+	    }
 	    total_pages += pages;
-        }
+            DEBUG("reserving %ld pages for segment %d\n", pages, i);
+	}
+
         if (hugepage_elf_stype == SIZE_1G) {
             __hugetlbfs_setup_bd_morecore();
         }
@@ -1136,6 +1166,24 @@ static void remap_segments(struct seg_info *seg, int num)
 			unmapped_abort("Mapped hugepage segment %u (%p-%p) at "
 				       "wrong address %p\n", i, seg[i].vaddr,
 				       seg[i].vaddr+mapsize, p);
+		if (seg[i].spsz) {
+			/* The huge page limit has been exhausted; allocate
+			 * the rest using small pages.
+			 */
+			void *sp_vaddr = seg[i].vaddr + seg[i].memsz, *sp_p;
+			sp_p = mmap(sp_vaddr,
+				    seg[i].spsz, seg[i].prot,
+				    MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
+			if (sp_p == MAP_FAILED)
+				unmapped_abort("Failed to map small page tail of "
+					       "segment %u: %p-%p (errno=%u)\n", i, sp_vaddr,
+					       sp_vaddr + seg[i].spsz, errno);
+			if (sp_p != sp_vaddr) {
+				unmapped_abort("Mapped small page tail of segment %u "
+					       "(%p-%p) at wrong address %p\n", i, sp_vaddr,
+					       sp_vaddr + seg[i].spsz, sp_p);
+			}
+		}
 	}
 
 	/* The segments are all back at this point.
@@ -1293,6 +1341,16 @@ void __hugetlbfs_setup_elflink(void)
             if ((n >= 0) && ( n < hugepages_avail))
                 hugepages_elf_limit = n;
         }
+
+	/* HUGETLB_LIMIT now constrains segment mapping
+	 * so we also check it.
+	 */
+        env = getenv("HUGETLB_LIMIT");
+        if (env) {
+            long n = atol(env);
+            if ((n >= 0) && (n < hugepages_avail) && (n < hugepages_elf_limit))
+                hugepages_elf_limit = n;
+        }
         hpage_size = gethugepagesize();
 #else
         hpage_size = gethugepagesize();
@@ -1311,14 +1369,41 @@ void __hugetlbfs_setup_elflink(void)
 	DEBUG("libhugetlbfs version: %s\n", VERSION);
 
 #ifdef OPEN64_MOD
+        /* Note that using a private mapping with 1GB pages can be
+         * very wasteful, since a COW fault will cause an additional
+         * 1GB page to be allocated.
+         *
+         * However there is an important semantic effect for private
+         * mappings: because of COW semantics, if the process forks
+         * then the child process's stores to the HP mapped data
+         * segment will not be seen by the parent.
+         *
+         * Thus when the data segment is mapped shared (the default
+         * for 1GB pages), one should be very leery of using fork().
+         * In this situation, to exec a child process, it is probably
+         * best to use vfork() instead.
+         */
+        map_private = hugepage_elf_stype == SIZE_2M;
+        if (getenv("HUGETLB_ELF_MAP_SHARED") != NULL) {
+            DEBUG("HUGETLB_ELF_MAP_SHARED set\n");
+            map_private = 0;
+        }
         for (i = 0; i < htlb_num_segs; i++) {
             struct seg_info  seg = htlb_seg_table[i];
             unsigned long seg_hpages = ALIGN((seg.filesz + seg.extrasz), hpage_size) / hpage_size;
 
             DEBUG("seg index %d used %lu pages\n", i, seg_hpages);
             hugepages_total += seg_hpages;
-            
-            if (hugepages_total >= hugepages_elf_limit) {
+
+            if (map_private && (seg.prot & PROT_WRITE)) {
+                DEBUG("seg index %d reserving %lu pages for COW\n", i, seg_hpages);
+                hugepages_total += seg_hpages;
+            }
+            /* This test was originally written as greater than or equal
+             * to instead of greater than, since at the time it was coded
+             * the COW pages were not being accounted for.
+             */
+            if (hugepages_total > hugepages_elf_limit) {
                 WARNING("ELF Segments require %ld huge pages, exceed huge page limit %ld.\n",
                         hugepages_total,
                         hugepages_elf_limit);
diff --git a/osprey/libhugetlbfs/morecore.c b/osprey/libhugetlbfs/morecore.c
index ab591b0..cbfd17f 100644
--- a/osprey/libhugetlbfs/morecore.c
+++ b/osprey/libhugetlbfs/morecore.c
@@ -111,6 +111,7 @@ static void *hugetlbfs_morecore(ptrdiff_t increment)
 	int ret;
 	void *p;
 	long delta;
+	long sp_delta = 0;
 
 #ifdef OPEN64_MOD
         long p_delta = 0;
@@ -156,11 +157,24 @@ static void *hugetlbfs_morecore(ptrdiff_t increment)
 
 #ifdef OPEN64_MOD
                 if ((long long) delta + mapsize > (long long) hugepages_heap_limit * blocksize) {
-                    DEBUG("size %ld exceeds huge page limit %ld\n",
-                          delta + mapsize, hugepages_heap_limit);
-                    __morecore = &__default_morecore;
-                    DEBUG("brk=0x%lx\n",(unsigned long) sbrk(0));
-                    return NULL;
+			sp_delta = delta;
+			delta = (long long) hugepages_heap_limit * blocksize - mapsize;
+			if (delta == 0) {
+				/* Calling mmap with zero size would fail, so just return.
+				 */
+				DEBUG("size %ld exceeds huge page limit %ld\n",
+				      sp_delta + mapsize, hugepages_heap_limit);
+				__morecore = &__default_morecore;
+				DEBUG("brk=0x%lx\n", (unsigned long) sbrk(0));
+				return NULL;
+			}
+			sp_delta -= delta;
+			DEBUG("size %ld (0x%lx) exceeds huge page limit %ld\n",
+			      sp_delta + delta + mapsize, sp_delta + delta + mapsize, hugepages_heap_limit);
+			DEBUG("allocate %ld (0x%lx) bytes as huge pages\n",
+			      delta, delta);
+			DEBUG("allocate %ld (0x%lx) bytes via brk()\n",
+			      sp_delta, sp_delta);
                 }
 #endif
 
@@ -236,6 +250,23 @@ static void *hugetlbfs_morecore(ptrdiff_t increment)
 				}
 			}
 		}
+		if (sp_delta) {
+			/* It is important to use brk() here; otherwise
+			 * glibc's malloc may decide that the memory being
+			 * allocated is non-contiguous and force a relocation,
+			 * wasting the memory that we have just allocated.
+			 */
+			void *new_brk = heapbase + mapsize + delta + sp_delta;
+
+			if (brk(new_brk)) {
+				WARNING("setting break to %p failed %s\n",
+					new_brk, strerror(errno));
+				munmap(p, delta);
+				return NULL;
+			}
+			__morecore = &__default_morecore;
+			delta += sp_delta;
+		}
 
 		/* we now have mmap'd further */
 		mapsize += delta;
-- 
1.7.4.5

