elflink: partial segment remapping

Add support for partial segment remapping, where non-relinked binaries
can try to take advantage of libhugetlbfs' segment remapping code. By
LD_PRELOAD'ing the library and specifying the __executable_start address
in the HUGETLB_FORCE_REMAP variable, the library mimics the behavior it
would have if the binary were relinked with our linker scripts.

A few caveats: this is only useful for binaries with already *very*
large segments, especially on power. We can only really use this partial
remapping algorithm if the segment size is larger than the granularity
at which hugepages can be used. This is because we have to be very
careful not to reduce the available address space or to violate any
contiguity rules. Given power's restriction of one page-size per 256M
area, this requires very large segments. x86 and x86_64 are not so
seriously impacted, as a several-MB large array is sufficient to allow
the remapping to occur.

This feature is mutually exclusive to MINIMAL_COPY, because we are
PRELOAD'd and so do not know what may or may not be uninitialized
anymore.

Signed-off-by: Nishanth Aravamudan <[EMAIL PROTECTED]>

---
I'm not sure if my statement above about PRELOAD is accurate. Since we
are PRELOAD'd in partial remapping, we are the last thing to run before
program execution starts. So we should be able to do the same sort of
BSS checking that we do now? I can try messing with some apps and see if
maybe MINIMAL_COPY is possible. Would also end up cleaning up
check_env() a bit.

diff --git a/HOWTO b/HOWTO
index 8c13d46..ee96c69 100644
--- a/HOWTO
+++ b/HOWTO
@@ -310,6 +310,10 @@ libhugetlbfs:
                otherwise, only the necessary parts will be, which can
                be much more efficient (default)
 
+       HUGETLB_FORCE_ELFMAP
+       HUGETLB_FORCE_ELFMAP_EXECSTART
+               Explained in "Partial segment remapping"
+
        HUGETLB_MORECORE
        HUGETLB_MORECORE_HEAPBASE
                Explained in "Using hugepages for malloc()
@@ -373,6 +377,33 @@ manually deleted to free the hugepages in question.  
Future versions
 of libhugetlbfs should include tools and scripts to automate this
 cleanup.
 
+       Partial segment remapping
+       -------------------------
+
+libhugetlbfs has limited support for remapping a normal, non-relinked
+binary's data, text and BSS into hugepages. To enable this feature,
+HUGETLB_FORCE_ELFMAP must be set to "yes", and
+HUGETLB_FORCE_ELFMAP_EXECSTART must contain the address at which
+__executable_start can be found in the binary. This is typically the
+same address for all binaries on the system and can often be found by
+examining the system linker scripts in /usr/lib/ldscripts.
+
+Partial segment remapping is not guaranteed to work. Most importantly, a
+binary's segments must be large enough even when not relinked by
+libhugetlbfs:
+
+       architecture    address         minimum segment size
+       ------------    -------         --------------------
+       i386, x86_64    all             hugepage size
+       ppc32           all             256M
+       ppc64           0-4G            256M
+       ppc64           4G-1T           1020G
+       ppc64           1T+             1T
+
+The raw size, though, is not sufficient to indicate if the code will
+succeed, due to alignment. Since the binary is not relinked, however,
+this is relatively straightforward to 'test and see'.
+
 Examples
 ========
 
diff --git a/elflink.c b/elflink.c
index 3eb61c1..1591c0f 100644
--- a/elflink.c
+++ b/elflink.c
@@ -160,6 +160,7 @@ static struct seg_info htlb_seg_table[MAX_HTLB_SEGS];
 static int htlb_num_segs;
 static int minimal_copy = 1;
 static int sharing; /* =0 */
+static unsigned long force_remap; /* =0 */
 int __debug = 0;
 
 /**
@@ -492,11 +493,7 @@ bail2:
        seg->extrasz = end_orig - start_orig;
 }
 
-/*
- * Parse an ELF header and record segment information for any segments
- * which contain hugetlb information.
- */
-static void parse_elf(Elf_Ehdr *ehdr)
+static void parse_elf_relinked(Elf_Ehdr *ehdr)
 {
        Elf_Phdr *phdr = (Elf_Phdr *)((char *)ehdr + ehdr->e_phoff);
        int i;
@@ -544,6 +541,138 @@ static void parse_elf(Elf_Ehdr *ehdr)
        }
 }
 
+#define ALIGN_UP(x,a)  (((x) + (a)) & ~((a) - 1))
+#define ALIGN_DOWN(x,a) ((x) & ~((a) - 1))
+
+#if defined(__powerpc64__) || defined (__powerpc__)
+#define SLICE_LOW_SHIFT                28
+#define SLICE_HIGH_SHIFT       40
+
+#define SLICE_LOW_TOP          (0x100000000UL)
+#define SLICE_LOW_SIZE         (1UL << SLICE_LOW_SHIFT)
+#define SLICE_HIGH_SIZE                (1UL << SLICE_HIGH_SHIFT)
+#endif
+
+/*
+ * Return the address of the start and end of the hugetlb slice
+ * containing @addr. A slice is a range of addresses, start inclusive
+ * and end exclusive.
+ * Note, that since relinking is not supported on ia64, we can leave it
+ * out here.
+ */
+static unsigned long hugetlb_slice_start(unsigned long addr)
+{
+#if defined(__powerpc64__)
+       if (addr < SLICE_LOW_TOP)
+               return ALIGN_DOWN(addr, SLICE_LOW_SIZE);
+       else if (addr < SLICE_HIGH_SIZE)
+               return SLICE_LOW_TOP;
+       else
+               return ALIGN_DOWN(addr, SLICE_HIGH_SIZE);
+#elif defined(__powerpc__)
+       return ALIGN_DOWN(addr, SLICE_LOW_SIZE);
+#else
+       return ALIGN_DOWN(addr, gethugepagesize());
+#endif
+}
+
+static unsigned long hugetlb_slice_end(unsigned long addr)
+{
+#if defined(__powerpc64__)
+       if (addr < SLICE_LOW_TOP)
+               return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1;
+       else
+               return ALIGN_UP(addr, SLICE_HIGH_SIZE) - 1;
+#elif defined(__powerpc__)
+       return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1;
+#else
+       return ALIGN_UP(addr, gethugepagesize()) - 1;
+#endif
+}
+
+static unsigned long hugetlb_next_slice_start(unsigned long addr)
+{
+       return hugetlb_slice_end(addr) + 1;
+}
+
+static unsigned long hugetlb_prev_slice_end(unsigned long addr)
+{
+       return hugetlb_slice_start(addr) - 1;
+}
+
+static void parse_elf_normal(Elf_Ehdr *ehdr)
+{
+       Elf_Phdr *phdr = (Elf_Phdr *)((char *)ehdr + ehdr->e_phoff);
+       int i;
+
+       for (i = 0; i < ehdr->e_phnum && htlb_num_segs < MAX_HTLB_SEGS; i++) {
+               unsigned long vaddr, filesz, memsz, gap;
+               unsigned long slice_end;
+               int prot = 0;
+
+               if (phdr[i].p_type != PT_LOAD)
+                       continue;
+
+               /*
+                * Partial segment remapping only makes sense if the
+                * memory size of the segment is larger than the
+                * granularity at which hugepages can be used. This
+                * mostly affects ppc, where the segment must be larger
+                * than 256M. This guarantees that remapping the binary
+                * in this forced way won't violate any contiguity
+                * constraints.
+                */
+               vaddr = hugetlb_next_slice_start(phdr[i].p_vaddr);
+               gap = vaddr - phdr[i].p_vaddr;
+               slice_end = hugetlb_slice_end(vaddr);
+               /*
+                * we should stop remapping just before the slice
+                * containing the end of the memsz portion (taking away
+                * the gap of the memsz)
+                */
+               memsz = phdr[i].p_memsz;
+               if (memsz < gap) {
+                       DEBUG("Segment %d's unaligned memsz is too small: "
+                                       "%#0lx < %#0lx\n",
+                                       i, memsz, gap);
+                       continue;
+               }
+               memsz -= gap;
+               if (memsz < (slice_end - vaddr)) {
+                       DEBUG("Segment %d's aligned memsz is too small: "
+                                       "%#0lx < %#0lx\n",
+                                       i, memsz, slice_end - vaddr);
+                       continue;
+               }
+               memsz = hugetlb_prev_slice_end(vaddr + memsz) - vaddr;
+
+               /*
+                * minimal_copy is disabled so just set filesz to memsz,
+                * to avoid issues in prepare
+                */
+               filesz = memsz;
+
+               if (phdr[i].p_flags & PF_R)
+                       prot |= PROT_READ;
+               if (phdr[i].p_flags & PF_W)
+                       prot |= PROT_WRITE;
+               if (phdr[i].p_flags & PF_X)
+                       prot |= PROT_EXEC;
+
+               DEBUG("Hugepage segment %d (phdr %d): %#0lx-%#0lx "
+                               "(filesz=%#0lx) " "(prot = %#0x)\n",
+                               htlb_num_segs, i, vaddr, vaddr+memsz,
+                               filesz, prot);
+
+               htlb_seg_table[htlb_num_segs].vaddr = (void *)vaddr;
+               htlb_seg_table[htlb_num_segs].filesz = filesz;
+               htlb_seg_table[htlb_num_segs].memsz = memsz;
+               htlb_seg_table[htlb_num_segs].prot = prot;
+               htlb_seg_table[htlb_num_segs].index = i;
+               htlb_num_segs++;
+       }
+}
+
 /*
  * Copy a program segment into a huge page. If possible, try to copy the
  * smallest amount of data possible, unless the user disables this 
@@ -551,7 +680,7 @@ static void parse_elf(Elf_Ehdr *ehdr)
  */
 static int prepare_segment(struct seg_info *seg)
 {
-       long hpage_size = gethugepagesize();
+       int hpage_size = gethugepagesize();
        void *p;
        unsigned long gap;
        unsigned long size;
@@ -802,9 +931,18 @@ static void remap_segments(struct seg_info *seg, int num)
         */
 }
 
+static int is_valid_elf(Elf_Ehdr *ehdr)
+{
+       return ehdr->e_ident[EI_MAG0] == 0x7f &&
+               ehdr->e_ident[EI_MAG1] == 'E' &&
+               ehdr->e_ident[EI_MAG2] == 'L' &&
+               ehdr->e_ident[EI_MAG3] == 'F';
+}
+
 static int check_env(void)
 {
        char *env;
+       extern Elf_Ehdr __executable_start __attribute__((weak));
 
        env = getenv("HUGETLB_ELFMAP");
        if (env && (strcasecmp(env, "no") == 0)) {
@@ -815,13 +953,49 @@ static int check_env(void)
 
        env = getenv("LD_PRELOAD");
        if (env && strstr(env, "libhugetlbfs")) {
-               ERROR("LD_PRELOAD is incompatible with segment remapping\n");
-               ERROR("Segment remapping has been DISABLED\n");
-               return -1;
+               char *env2, *env3, *ep;
+               env2 = getenv("HUGETLB_FORCE_ELFMAP");
+               if (env2 && (strcasecmp(env2, "yes") == 0)) {
+                       env3 = getenv("HUGETLB_FORCE_ELFMAP_EXECSTART");
+                       if (!env3) {
+                               ERROR("HUGETLB_FORCE_ELFMAP=%s, but "
+                                               "HUGETLB_FORCE_ELFMAP_EXECSTART 
"
+                                               "is unset\n", env2);
+                               return -1;
+                       }
+                       force_remap = strtoul(env3, &ep, 16);
+                       if (*ep != '\0') {
+                               ERROR("Can't parse 
HUGETLB_FORCE_ELFMAP_EXECSTART: "
+                                               "%s\n", strerror(errno));
+                               return -1;
+                       }
+                       if (!is_valid_elf((Elf_Ehdr *)force_remap)) {
+                               DEBUG("The address passed in "
+                                               "HUGETLB_FORCE_ELFMAP_EXECSTART 
"
+                                               "(%#0lx) is not the "
+                                               "location of a valid ELF "
+                                               "header\n", force_remap);
+                               return -1;
+                       }
+                       DEBUG("HUGETLB_FORCE_ELFMAP=%s, "
+                                       "HUGETLB_FORCE_ELFMAP_EXECSTART=%#0lx, "
+                                       "enabling partial segment "
+                                       "remapping for non-relinked "
+                                       "binaries\n",
+                                       env2, force_remap);
+                       DEBUG("Disabling filesz copy optimization\n");
+                       minimal_copy = 0;
+               } else {
+                       if (&__executable_start) {
+                               ERROR("LD_PRELOAD is incompatible with segment 
remapping\n");
+                               ERROR("Segment remapping has been DISABLED\n");
+                               return -1;
+                       }
+               }
        }
 
        env = getenv("HUGETLB_MINIMAL_COPY");
-       if (env && (strcasecmp(env, "no") == 0)) {
+       if (minimal_copy && env && (strcasecmp(env, "no") == 0)) {
                DEBUG("HUGETLB_MINIMAL_COPY=%s, disabling filesz copy "
                        "optimization\n", env);
                minimal_copy = 0;
@@ -854,29 +1028,51 @@ static int check_env(void)
        return 0;
 }
 
-static void __attribute__ ((constructor)) setup_elflink(void)
+/*
+ * Parse an ELF header and record segment information for any segments
+ * which contain hugetlb information.
+ */
+static int parse_elf()
 {
        extern Elf_Ehdr __executable_start __attribute__((weak));
-       Elf_Ehdr *ehdr = &__executable_start;
-       int ret, i;
 
-       if (! ehdr) {
-               DEBUG("Couldn't locate __executable_start, "
-                     "not attempting to remap segments\n");
-               return;
+       /* a normal, not relinked binary */
+       if (! (&__executable_start)) {
+               if (force_remap) {
+                       parse_elf_normal((Elf_Ehdr *)force_remap);
+                       if (htlb_num_segs == 0) {
+                               DEBUG("No segments were appropriate for "
+                                               "partial remapping\n");
+                               return -1;
+                       }
+               } else {
+                       DEBUG("Couldn't locate __executable_start, "
+                               "not attempting to remap segments\n");
+                       return -1;
+               }
+       } else {
+               parse_elf_relinked(&__executable_start);
+               if (htlb_num_segs == 0) {
+                       DEBUG("No segments were appropriate for partial "
+                                       "remapping\n");
+                       return -1;
+               }
        }
 
-       if (check_env())
-               return;
+       return 0;
+}
 
-       DEBUG("libhugetlbfs version: %s\n", VERSION);
+static void __attribute__ ((constructor)) setup_elflink(void)
+{
+       int i, ret;
 
-       parse_elf(ehdr);
+       if (check_env())
+               return;
 
-       if (htlb_num_segs == 0) {
-               DEBUG("Executable is not linked for hugepage segments\n");
+       if (parse_elf())
                return;
-       }
+
+       DEBUG("libhugetlbfs version: %s\n", VERSION);
 
        /* Do we need to find a share directory */
        if (sharing) {

-- 
Nishanth Aravamudan <[EMAIL PROTECTED]>
IBM Linux Technology Center

-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Libhugetlbfs-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel

Reply via email to