Just a few minor comments below, but otherwise
Acked-by: Steve Fox <[EMAIL PROTECTED]>
I'll post some numbers for Stream comparisons with relinked apps
tomorrow.
On Tue, 2007-06-26 at 09:41 -0700, Nishanth Aravamudan wrote:
> diff --git a/HOWTO b/HOWTO
> index 8c13d46..ee96c69 100644
> --- a/HOWTO
> +++ b/HOWTO
> @@ -310,6 +310,10 @@ libhugetlbfs:
> otherwise, only the necessary parts will be, which can
> be much more efficient (default)
>
> + HUGETLB_FORCE_ELFMAP
> + HUGETLB_FORCE_ELFMAP_EXECSTART
> + Explained in "Partial segment remapping"
> +
> HUGETLB_MORECORE
> HUGETLB_MORECORE_HEAPBASE
> Explained in "Using hugepages for malloc()
> @@ -373,6 +377,33 @@ manually deleted to free the hugepages in question.
> Future versions
> of libhugetlbfs should include tools and scripts to automate this
> cleanup.
>
> + Partial segment remapping
> + -------------------------
> +
> +libhugetlbfs has limited support for remapping a normal, non-relinked
> +binary's data, text and BSS into hugepages. To enable this feature,
> +HUGETLB_FORCE_ELFMAP must be set to "yes", and
> +HUGETLB_FORCE_ELFMAP_EXECSTART must contain the address at which
> +__executable_start can be found in the binary. This is typically the
> +same address for all binaries on the system and can often be found by
> +examining the system linker scripts in /usr/lib/ldscripts.
For some reason Red Hat doesn't ship their scripts there. Instead, you
must run `ld --verbose` to view the default linker script. Might be
worth mentioning.
> diff --git a/elflink.c b/elflink.c
> index 3eb61c1..1591c0f 100644
> --- a/elflink.c
> +++ b/elflink.c
> @@ -160,6 +160,7 @@ static struct seg_info htlb_seg_table[MAX_HTLB_SEGS];
> static int htlb_num_segs;
> static int minimal_copy = 1;
> static int sharing; /* =0 */
> +static unsigned long force_remap; /* =0 */
> int __debug = 0;
>
> /**
> @@ -492,11 +493,7 @@ bail2:
> seg->extrasz = end_orig - start_orig;
> }
>
> -/*
> - * Parse an ELF header and record segment information for any segments
> - * which contain hugetlb information.
> - */
> -static void parse_elf(Elf_Ehdr *ehdr)
> +static void parse_elf_relinked(Elf_Ehdr *ehdr)
> {
> Elf_Phdr *phdr = (Elf_Phdr *)((char *)ehdr + ehdr->e_phoff);
> int i;
> @@ -544,6 +541,138 @@ static void parse_elf(Elf_Ehdr *ehdr)
> }
> }
>
> +#define ALIGN_UP(x,a) (((x) + (a)) & ~((a) - 1))
> +#define ALIGN_DOWN(x,a) ((x) & ~((a) - 1))
> +
> +#if defined(__powerpc64__) || defined (__powerpc__)
> +#define SLICE_LOW_SHIFT 28
> +#define SLICE_HIGH_SHIFT 40
> +
> +#define SLICE_LOW_TOP (0x100000000UL)
> +#define SLICE_LOW_SIZE (1UL << SLICE_LOW_SHIFT)
> +#define SLICE_HIGH_SIZE (1UL << SLICE_HIGH_SHIFT)
> +#endif
> +
> +/*
> + * Return the address of the start and end of the hugetlb slice
> + * containing @addr. A slice is a range of addresses, start inclusive
> + * and end exclusive.
> + * Note, that since relinking is not supported on ia64, we can leave it
> + * out here.
> + */
> +static unsigned long hugetlb_slice_start(unsigned long addr)
> +{
> +#if defined(__powerpc64__)
> + if (addr < SLICE_LOW_TOP)
> + return ALIGN_DOWN(addr, SLICE_LOW_SIZE);
> + else if (addr < SLICE_HIGH_SIZE)
> + return SLICE_LOW_TOP;
> + else
> + return ALIGN_DOWN(addr, SLICE_HIGH_SIZE);
> +#elif defined(__powerpc__)
> + return ALIGN_DOWN(addr, SLICE_LOW_SIZE);
> +#else
> + return ALIGN_DOWN(addr, gethugepagesize());
> +#endif
> +}
> +
> +static unsigned long hugetlb_slice_end(unsigned long addr)
> +{
> +#if defined(__powerpc64__)
> + if (addr < SLICE_LOW_TOP)
> + return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1;
> + else
> + return ALIGN_UP(addr, SLICE_HIGH_SIZE) - 1;
> +#elif defined(__powerpc__)
> + return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1;
> +#else
> + return ALIGN_UP(addr, gethugepagesize()) - 1;
> +#endif
> +}
> +
> +static unsigned long hugetlb_next_slice_start(unsigned long addr)
> +{
> + return hugetlb_slice_end(addr) + 1;
> +}
> +
> +static unsigned long hugetlb_prev_slice_end(unsigned long addr)
> +{
> + return hugetlb_slice_start(addr) - 1;
> +}
> +
> +static void parse_elf_normal(Elf_Ehdr *ehdr)
> +{
> + Elf_Phdr *phdr = (Elf_Phdr *)((char *)ehdr + ehdr->e_phoff);
> + int i;
> +
> + for (i = 0; i < ehdr->e_phnum && htlb_num_segs < MAX_HTLB_SEGS; i++) {
> + unsigned long vaddr, filesz, memsz, gap;
> + unsigned long slice_end;
> + int prot = 0;
Nit pick. Should these be declared at the top of the func? I was
thinking we liked to keep things kernel-style.
> +
> + if (phdr[i].p_type != PT_LOAD)
> + continue;
> +
> + /*
> + * Partial segment remapping only makes sense if the
> + * memory size of the segment is larger than the
> + * granularity at which hugepages can be used. This
> + * mostly affects ppc, where the segment must be larger
> + * than 256M. This guarantees that remapping the binary
> + * in this forced way won't violate any contiguity
> + * constraints.
> + */
> + vaddr = hugetlb_next_slice_start(phdr[i].p_vaddr);
> + gap = vaddr - phdr[i].p_vaddr;
> + slice_end = hugetlb_slice_end(vaddr);
> + /*
> + * we should stop remapping just before the slice
> + * containing the end of the memsz portion (taking away
> + * the gap of the memsz)
> + */
> + memsz = phdr[i].p_memsz;
> + if (memsz < gap) {
> + DEBUG("Segment %d's unaligned memsz is too small: "
> + "%#0lx < %#0lx\n",
> + i, memsz, gap);
> + continue;
> + }
> + memsz -= gap;
> + if (memsz < (slice_end - vaddr)) {
> + DEBUG("Segment %d's aligned memsz is too small: "
> + "%#0lx < %#0lx\n",
> + i, memsz, slice_end - vaddr);
> + continue;
> + }
> + memsz = hugetlb_prev_slice_end(vaddr + memsz) - vaddr;
> +
> + /*
> + * minimal_copy is disabled so just set filesz to memsz,
> + * to avoid issues in prepare
> + */
> + filesz = memsz;
> +
> + if (phdr[i].p_flags & PF_R)
> + prot |= PROT_READ;
> + if (phdr[i].p_flags & PF_W)
> + prot |= PROT_WRITE;
> + if (phdr[i].p_flags & PF_X)
> + prot |= PROT_EXEC;
> +
> + DEBUG("Hugepage segment %d (phdr %d): %#0lx-%#0lx "
> + "(filesz=%#0lx) " "(prot = %#0x)\n",
> + htlb_num_segs, i, vaddr, vaddr+memsz,
> + filesz, prot);
> +
> + htlb_seg_table[htlb_num_segs].vaddr = (void *)vaddr;
> + htlb_seg_table[htlb_num_segs].filesz = filesz;
> + htlb_seg_table[htlb_num_segs].memsz = memsz;
> + htlb_seg_table[htlb_num_segs].prot = prot;
> + htlb_seg_table[htlb_num_segs].index = i;
> + htlb_num_segs++;
> + }
> +}
> +
> /*
> * Copy a program segment into a huge page. If possible, try to copy the
> * smallest amount of data possible, unless the user disables this
> @@ -551,7 +680,7 @@ static void parse_elf(Elf_Ehdr *ehdr)
> */
> static int prepare_segment(struct seg_info *seg)
> {
> - long hpage_size = gethugepagesize();
> + int hpage_size = gethugepagesize();
> void *p;
> unsigned long gap;
> unsigned long size;
> @@ -802,9 +931,18 @@ static void remap_segments(struct seg_info *seg, int num)
> */
> }
>
> +static int is_valid_elf(Elf_Ehdr *ehdr)
> +{
> + return ehdr->e_ident[EI_MAG0] == 0x7f &&
> + ehdr->e_ident[EI_MAG1] == 'E' &&
> + ehdr->e_ident[EI_MAG2] == 'L' &&
> + ehdr->e_ident[EI_MAG3] == 'F';
> +}
> +
> static int check_env(void)
> {
> char *env;
> + extern Elf_Ehdr __executable_start __attribute__((weak));
>
> env = getenv("HUGETLB_ELFMAP");
> if (env && (strcasecmp(env, "no") == 0)) {
> @@ -815,13 +953,49 @@ static int check_env(void)
>
> env = getenv("LD_PRELOAD");
> if (env && strstr(env, "libhugetlbfs")) {
> - ERROR("LD_PRELOAD is incompatible with segment remapping\n");
> - ERROR("Segment remapping has been DISABLED\n");
> - return -1;
> + char *env2, *env3, *ep;
Same nit pick as above.
> + env2 = getenv("HUGETLB_FORCE_ELFMAP");
> + if (env2 && (strcasecmp(env2, "yes") == 0)) {
> + env3 = getenv("HUGETLB_FORCE_ELFMAP_EXECSTART");
> + if (!env3) {
> + ERROR("HUGETLB_FORCE_ELFMAP=%s, but "
> + "HUGETLB_FORCE_ELFMAP_EXECSTART
> "
> + "is unset\n", env2);
> + return -1;
> + }
> + force_remap = strtoul(env3, &ep, 16);
> + if (*ep != '\0') {
> + ERROR("Can't parse
> HUGETLB_FORCE_ELFMAP_EXECSTART: "
> + "%s\n", strerror(errno));
> + return -1;
> + }
> + if (!is_valid_elf((Elf_Ehdr *)force_remap)) {
> + DEBUG("The address passed in "
> + "HUGETLB_FORCE_ELFMAP_EXECSTART
> "
> + "(%#0lx) is not the "
> + "location of a valid ELF "
> + "header\n", force_remap);
> + return -1;
> + }
> + DEBUG("HUGETLB_FORCE_ELFMAP=%s, "
> + "HUGETLB_FORCE_ELFMAP_EXECSTART=%#0lx, "
> + "enabling partial segment "
> + "remapping for non-relinked "
> + "binaries\n",
> + env2, force_remap);
> + DEBUG("Disabling filesz copy optimization\n");
> + minimal_copy = 0;
> + } else {
> + if (&__executable_start) {
> + ERROR("LD_PRELOAD is incompatible with segment
> remapping\n");
> + ERROR("Segment remapping has been DISABLED\n");
> + return -1;
> + }
> + }
> }
>
> env = getenv("HUGETLB_MINIMAL_COPY");
> - if (env && (strcasecmp(env, "no") == 0)) {
> + if (minimal_copy && env && (strcasecmp(env, "no") == 0)) {
> DEBUG("HUGETLB_MINIMAL_COPY=%s, disabling filesz copy "
> "optimization\n", env);
> minimal_copy = 0;
> @@ -854,29 +1028,51 @@ static int check_env(void)
> return 0;
> }
>
> -static void __attribute__ ((constructor)) setup_elflink(void)
> +/*
> + * Parse an ELF header and record segment information for any segments
> + * which contain hugetlb information.
> + */
> +static int parse_elf()
> {
> extern Elf_Ehdr __executable_start __attribute__((weak));
> - Elf_Ehdr *ehdr = &__executable_start;
> - int ret, i;
>
> - if (! ehdr) {
> - DEBUG("Couldn't locate __executable_start, "
> - "not attempting to remap segments\n");
> - return;
> + /* a normal, not relinked binary */
> + if (! (&__executable_start)) {
> + if (force_remap) {
> + parse_elf_normal((Elf_Ehdr *)force_remap);
> + if (htlb_num_segs == 0) {
> + DEBUG("No segments were appropriate for "
> + "partial remapping\n");
> + return -1;
> + }
> + } else {
> + DEBUG("Couldn't locate __executable_start, "
> + "not attempting to remap segments\n");
> + return -1;
> + }
> + } else {
> + parse_elf_relinked(&__executable_start);
> + if (htlb_num_segs == 0) {
> + DEBUG("No segments were appropriate for partial "
> + "remapping\n");
Should 'partial' be removed? If it's a relinked binary, we should do
'full' remapping, right?
> + return -1;
> + }
> }
>
> - if (check_env())
> - return;
> + return 0;
> +}
>
> - DEBUG("libhugetlbfs version: %s\n", VERSION);
> +static void __attribute__ ((constructor)) setup_elflink(void)
> +{
> + int i, ret;
>
> - parse_elf(ehdr);
> + if (check_env())
> + return;
>
> - if (htlb_num_segs == 0) {
> - DEBUG("Executable is not linked for hugepage segments\n");
> + if (parse_elf())
> return;
> - }
> +
> + DEBUG("libhugetlbfs version: %s\n", VERSION);
>
> /* Do we need to find a share directory */
> if (sharing) {
--
Steve Fox
IBM Linux Technology Center
-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Libhugetlbfs-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel