elflink: partial segment remapping
Add support for partial segment remapping, where non-relinked binaries
can try to take advantage of libhugetlbfs' segment remapping code. By
LD_PRELOAD'ing the library, setting HUGETLB_FORCE_ELFMAP=yes and
specifying the __executable_start address in the
HUGETLB_FORCE_ELFMAP_EXECSTART variable, the library mimics the behavior
it would have if the binary were relinked with our linker scripts.
A few caveats: this is only useful for binaries with already *very*
large segments, especially on power. We can only really use this partial
remapping algorithm if the segment size is larger than the granularity
at which hugepages can be used. This is because we have to be very
careful not to reduce the available address space or to violate any
contiguity rules. Given power's restriction of one page-size per 256M
area, this requires very large segments. x86 and x86_64 are not so
seriously impacted, as a several-MB large array is sufficient to allow
the remapping to occur.
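This feature is mutually exclusive with the MINIMAL_COPY optimization:
because the library is LD_PRELOAD'ed, it can no longer know which parts
of a segment may or may not be uninitialized, so the filesz copy
optimization is disabled whenever partial segment remapping is enabled.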
Signed-off-by: Nishanth Aravamudan <[EMAIL PROTECTED]>
diff --git a/HOWTO b/HOWTO
index 8c13d46..ee96c69 100644
--- a/HOWTO
+++ b/HOWTO
@@ -310,6 +310,10 @@ libhugetlbfs:
otherwise, only the necessary parts will be, which can
be much more efficient (default)
+ HUGETLB_FORCE_ELFMAP
+ HUGETLB_FORCE_ELFMAP_EXECSTART
+ Explained in "Partial segment remapping"
+
HUGETLB_MORECORE
HUGETLB_MORECORE_HEAPBASE
Explained in "Using hugepages for malloc()
@@ -373,6 +377,33 @@ manually deleted to free the hugepages in question.
Future versions
of libhugetlbfs should include tools and scripts to automate this
cleanup.
+ Partial segment remapping
+ -------------------------
+
+libhugetlbfs has limited support for remapping a normal, non-relinked
+binary's data, text and BSS into hugepages. To enable this feature,
+the library must be LD_PRELOAD'ed, HUGETLB_FORCE_ELFMAP must be set to
+"yes", and HUGETLB_FORCE_ELFMAP_EXECSTART must contain the address, in
+hexadecimal, at which __executable_start can be found in the binary.
+This is typically the same address for all binaries on the system and
+can often be found by examining the system linker scripts in
+/usr/lib/ldscripts.
+
+Partial segment remapping is not guaranteed to work. Most importantly, a
+binary's segments must be large enough even when not relinked by
+libhugetlbfs:
+
+ architecture   address   minimum segment size
+ ------------   -------   --------------------
+ i386, x86_64   all       hugepage size
+ ppc32          all       256M
+ ppc64          0-4G      256M
+ ppc64          4G-1T     1020G
+ ppc64          1T+       1T
+
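+The raw size alone, though, does not guarantee that remapping will
+succeed: only the part of a segment that covers whole, aligned units of
+the size listed above can be remapped, so an unfavourably aligned
+segment may still be rejected. Since the binary does not need to be
+relinked, however, it is relatively straightforward to 'test and see'.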
+
Examples
========
diff --git a/elflink.c b/elflink.c
index 5058248..cc1631d 100644
--- a/elflink.c
+++ b/elflink.c
@@ -159,6 +159,7 @@ static struct seg_info htlb_seg_table[MAX_HTLB_SEGS];
static int htlb_num_segs;
static int minimal_copy = 1;
static int sharing; /* =0 */
+static unsigned long force_remap;
int __debug = 0;
/**
@@ -480,11 +481,7 @@ bail2:
seg->extrasz = end_orig - start_orig;
}
-/*
- * Parse an ELF header and record segment information for any segments
- * which contain hugetlb information.
- */
-static void parse_elf(Elf_Ehdr *ehdr)
+static void parse_elf_relinked(Elf_Ehdr *ehdr)
{
Elf_Phdr *phdr = (Elf_Phdr *)((char *)ehdr + ehdr->e_phoff);
int i;
@@ -532,6 +529,139 @@ static void parse_elf(Elf_Ehdr *ehdr)
}
}
+/*
+ * Round vaddr up to a slice boundary: vaddr itself is returned when
+ * hugetlbfs_slice_start() reports that it already begins a slice,
+ * otherwise the address one past the end of the slice containing it.
+ */
+static unsigned long get_next_slice_start(unsigned long vaddr)
+{
+	int on_boundary = (hugetlbfs_slice_start(vaddr) == vaddr);
+
+	return on_boundary ? vaddr : hugetlbfs_slice_end(vaddr) + 1;
+}
+
+/*
+ * Round vaddr down to a slice end: vaddr itself is returned when
+ * hugetlbfs_slice_end() reports that it already ends a slice, otherwise
+ * the address one before the start of the slice containing it.
+ */
+static unsigned long get_previous_slice_end(unsigned long vaddr)
+{
+	if (hugetlbfs_slice_end(vaddr) != vaddr)
+		return hugetlbfs_slice_start(vaddr) - 1;
+
+	return vaddr;
+}
+
+/*
+ * Walk the program headers of a normal (non-relinked) binary and record
+ * in htlb_seg_table the slice-aligned portion of each PT_LOAD segment.
+ *
+ * Partial segment remapping only makes sense if the memory size of the
+ * segment is larger than the granularity at which hugepages can be
+ * used (this mostly affects ppc, where the segment must be larger than
+ * 256M), so that the forced remapping cannot violate any contiguity
+ * constraints.  For every PT_LOAD header:
+ *   - the start is moved forward to the next slice boundary;
+ *   - the segment is skipped (with a DEBUG message) if it is shorter
+ *     than the bytes skipped, or if what remains does not reach the
+ *     end of the first slice;
+ *   - the length is cut back so that remapping stops just before the
+ *     slice containing the end of the segment.
+ *
+ * minimal_copy is disabled for this mode, so filesz is recorded equal
+ * to memsz to avoid issues in the prepare step.
+ *
+ * NOTE(review): get_previous_slice_end() appears to return an inclusive
+ * last-byte address, so the recorded memsz looks one byte short of a
+ * whole number of slices -- confirm that later rounding covers this.
+ */
+static void parse_elf_normal(Elf_Ehdr *ehdr)
+{
+	Elf_Phdr *phdr = (Elf_Phdr *)((char *)ehdr + ehdr->e_phoff);
+	int i;
+
+	for (i = 0; i < ehdr->e_phnum && htlb_num_segs < MAX_HTLB_SEGS; i++) {
+		struct seg_info *seg;
+		unsigned long start, skipped, first_slice_end, len;
+		int prot;
+
+		if (phdr[i].p_type != PT_LOAD)
+			continue;
+
+		/* Align the start upwards and note how much was skipped */
+		start = get_next_slice_start(phdr[i].p_vaddr);
+		skipped = start - phdr[i].p_vaddr;
+		first_slice_end = hugetlbfs_slice_end(start);
+
+		len = phdr[i].p_memsz;
+		if (len < skipped) {
+			DEBUG("Segment %d's unaligned memsz is too small: "
+				"%#0lx < %#0lx\n",
+				i, len, skipped);
+			continue;
+		}
+
+		/* What is left must still span the first aligned slice */
+		len -= skipped;
+		if (len < (first_slice_end - start)) {
+			DEBUG("Segment %d's aligned memsz is too small: "
+				"%#0lx < %#0lx\n",
+				i, len, first_slice_end - start);
+			continue;
+		}
+
+		/* Trim the tail back to a slice end */
+		len = get_previous_slice_end(start + len) - start;
+
+		/* Translate ELF segment flags into mmap protection bits */
+		prot = ((phdr[i].p_flags & PF_R) ? PROT_READ : 0) |
+			((phdr[i].p_flags & PF_W) ? PROT_WRITE : 0) |
+			((phdr[i].p_flags & PF_X) ? PROT_EXEC : 0);
+
+		DEBUG("Hugepage segment %d (phdr %d): %#0lx-%#0lx "
+			"(filesz=%#0lx) " "(prot = %#0x)\n",
+			htlb_num_segs, i, start, start + len,
+			len, prot);
+
+		seg = &htlb_seg_table[htlb_num_segs++];
+		seg->vaddr = (void *)start;
+		seg->filesz = len;
+		seg->memsz = len;
+		seg->prot = prot;
+		seg->index = i;
+	}
+}
+
+/*
+ * Locate the executable's ELF header and record segment information
+ * for any segments that can be remapped into hugepages.
+ *
+ * If the weak symbol __executable_start resolves, the binary is first
+ * parsed as a relinked one.  When that finds nothing, or when the
+ * symbol is missing altogether, the partial remapping parser is tried
+ * on the address held in force_remap, provided force_remap is non-zero.
+ *
+ * Returns 0 if at least one segment was recorded in htlb_seg_table,
+ * -1 otherwise (the reason is reported through DEBUG).
+ */
+static int parse_elf(void)
+{
+	extern Elf_Ehdr __executable_start __attribute__((weak));
+
+	if (&__executable_start) {
+		parse_elf_relinked(&__executable_start);
+		if (htlb_num_segs != 0)
+			return 0;
+
+		if (!force_remap) {
+			DEBUG("Executable is not linked for hugepage segments\n");
+			return -1;
+		}
+		DEBUG("Executable is not linked for hugepage segments, "
+			"but partial segment remapping enabled\n");
+	} else if (!force_remap) {
+		/* a normal, not relinked binary, and no forced remapping */
+		DEBUG("Couldn't locate __executable_start, "
+			"not attempting to remap segments\n");
+		return -1;
+	}
+
+	/* Fall back to partial remapping of a non-relinked binary */
+	parse_elf_normal((Elf_Ehdr *)force_remap);
+	if (htlb_num_segs == 0) {
+		DEBUG("No segments were appropriate for "
+			"partial remapping\n");
+		return -1;
+	}
+
+	return 0;
+}
+
/*
* Copy a program segment into a huge page. If possible, try to copy the
* smallest amount of data possible, unless the user disables this
@@ -792,6 +922,14 @@ static void remap_segments(struct seg_info *seg, int num)
*/
}
+/*
+ * Check whether ehdr points at an ELF header by comparing the four
+ * identification bytes e_ident[EI_MAG0..EI_MAG3] with "\177ELF".
+ * Returns 1 on a match and 0 otherwise.  The bytes are read directly
+ * through ehdr, so the pointer must refer to readable memory.
+ */
+static int is_valid_elf(Elf_Ehdr *ehdr)
+{
+	static const unsigned char magic[] = { 0x7f, 'E', 'L', 'F' };
+	unsigned int k;
+
+	for (k = 0; k < sizeof(magic); k++) {
+		if (ehdr->e_ident[EI_MAG0 + k] != magic[k])
+			return 0;
+	}
+
+	return 1;
+}
+
static int check_env(void)
{
char *env;
@@ -805,13 +943,47 @@ static int check_env(void)
env = getenv("LD_PRELOAD");
if (env && strstr(env, "libhugetlbfs")) {
- ERROR("LD_PRELOAD is incompatible with segment remapping\n");
- ERROR("Segment remapping has been DISABLED\n");
- return -1;
+ char *env2, *env3, *ep;
+ env2 = getenv("HUGETLB_FORCE_ELFMAP");
+ if (env2 && (strcasecmp(env2, "yes") == 0)) {
+ env3 = getenv("HUGETLB_FORCE_ELFMAP_EXECSTART");
+ if (!env3) {
+ ERROR("HUGETLB_FORCE_ELFMAP=%s, but "
+ "HUGETLB_FORCE_ELFMAP_EXECSTART
"
+ "is unset\n", env2);
+ return -1;
+ }
+ force_remap = strtoul(env3, &ep, 16);
+ if (*ep != '\0') {
+ ERROR("Can't parse
HUGETLB_FORCE_ELFMAP_EXECSTART: "
+ "%s\n", strerror(errno));
+ return -1;
+ }
+ if (!is_valid_elf((Elf_Ehdr *)force_remap)) {
+ DEBUG("The address passed in "
+ "HUGETLB_FORCE_ELFMAP_EXECSTART
"
+ "(%#0lx) is not the "
+ "location of a valid ELF "
+ "header\n", force_remap);
+ return -1;
+ }
+ DEBUG("HUGETLB_FORCE_ELFMAP=%s, "
+ "HUGETLB_FORCE_ELFMAP_EXECSTART=%#0lx, "
+ "enabling partial segment "
+ "remapping for non-relinked "
+ "binaries\n",
+ env2, force_remap);
+ DEBUG("Disabling filesz copy optimization\n");
+ minimal_copy = 0;
+ } else {
+ ERROR("LD_PRELOAD is incompatible with segment
remapping\n");
+ ERROR("Segment remapping has been DISABLED\n");
+ return -1;
+ }
}
env = getenv("HUGETLB_MINIMAL_COPY");
- if (env && (strcasecmp(env, "no") == 0)) {
+ if (minimal_copy && env && (strcasecmp(env, "no") == 0)) {
DEBUG("HUGETLB_MINIMAL_COPY=%s, disabling filesz copy "
"optimization\n", env);
minimal_copy = 0;
@@ -846,20 +1018,13 @@ static int check_env(void)
static void __attribute__ ((constructor)) setup_elflink(void)
{
- extern Elf_Ehdr __executable_start __attribute__((weak));
- Elf_Ehdr *ehdr = &__executable_start;
- int ret, i;
-
- if (! ehdr) {
- DEBUG("Couldn't locate __executable_start, "
- "not attempting to remap segments\n");
- return;
- }
+ int i, ret;
if (check_env())
return;
- parse_elf(ehdr);
+ if (parse_elf())
+ return;
if (htlb_num_segs == 0) {
DEBUG("Executable is not linked for hugepage segments\n");
--
Nishanth Aravamudan <[EMAIL PROTECTED]>
IBM Linux Technology Center
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Libhugetlbfs-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel