On Fri, 2008-11-07 at 16:09 +0000, Mel Gorman wrote: > The get_huge_pages() API is a close-to-kernel interface for the direct > allocation of hugepages. This forces the caller to deal with alignment and > fallback to base pages where suitable. For the casual user of hugepages > that does not care for such things, this patch adds get_hugepage_region(). > It allocates regions of memory that are backed by hugepages where possible > but callers are not required to align their length and can request fallback > to base pages. > > Signed-off-by: Mel Gorman <[EMAIL PROTECTED]>
Acked-by: Adam Litke <[EMAIL PROTECTED]> > --- > Makefile | 5 +- > alloc.c | 58 +++++++++++++++++++- > hugetlbfs.h | 23 +++++++- > man/get_huge_pages.3 | 2 + > man/get_hugepage_region.3 | 79 ++++++++++++++++++++++++++ > tests/Makefile | 4 +- > tests/get_hugepage_region.c | 129 > +++++++++++++++++++++++++++++++++++++++++++ > tests/run_tests.sh | 3 + > version.lds | 2 + > 9 files changed, 300 insertions(+), 5 deletions(-) > create mode 100644 man/get_hugepage_region.3 > create mode 100644 tests/get_hugepage_region.c > > diff --git a/Makefile b/Makefile > index 4554154..40c8c45 100644 > --- a/Makefile > +++ b/Makefile > @@ -8,7 +8,8 @@ BIN_OBJ_DIR=obj > INSTALL_BIN = hugectl hugeedit hugeadm pagesize > INSTALL_HEADERS = hugetlbfs.h > INSTALL_MAN1 = pagesize.1 > -INSTALL_MAN3 = get_huge_pages.3 gethugepagesizes.3 getpagesizes.3 > +INSTALL_MAN3 = get_huge_pages.3 get_hugepage_region.3 \ > + gethugepagesizes.3 getpagesizes.3 > INSTALL_MAN7 = libhugetlbfs.7 > INSTALL_MAN8 = hugectl.8 hugeedit.8 hugeadm.8 > LDSCRIPT_TYPES = B BDT > @@ -379,7 +380,9 @@ install-man: > gzip -f $(DESTDIR)$(MANDIR3)/$$x; \ > done > rm -f $(DESTDIR)$(MANDIR3)/free_huge_pages.3.gz > + rm -f $(DESTDIR)$(MANDIR3)/free_hugepage_region.3.gz > ln -s get_huge_pages.3.gz $(DESTDIR)$(MANDIR3)/free_huge_pages.3.gz > + ln -s get_hugepage_region.3.gz > $(DESTDIR)$(MANDIR3)/free_hugepage_region.3.gz > for x in $(INSTALL_MAN7); do \ > $(INSTALL) -m 444 man/$$x $(DESTDIR)$(MANDIR7); \ > gzip -f $(DESTDIR)$(MANDIR7)/$$x; \ > diff --git a/alloc.c b/alloc.c > index 6e026c5..74bb5a4 100644 > --- a/alloc.c > +++ b/alloc.c > @@ -35,7 +35,7 @@ static void *fallback_base_pages(size_t len, ghp_t flags) > { > int fd; > void *buf; > - DEBUG("get_huge_pages: Falling back to base pages\n"); > + DEBUG("get_hugepage_region: Falling back to base pages\n"); > > /* > * Map /dev/zero instead of MAP_ANONYMOUS avoid VMA mergings. 
Freeing > @@ -78,6 +78,10 @@ void *get_huge_pages(size_t len, ghp_t flags) > void *buf; > int heap_fd; > > + /* Catch an altogether-too easy typo */ > + if (flags & GHR_MASK) > + ERROR("Improper use of GHR_* in get_huge_pages()\n"); > + > /* Create a file descriptor for the new region */ > heap_fd = hugetlbfs_unlinked_fd(); > if (heap_fd < 0) { > @@ -174,3 +178,55 @@ void free_huge_pages(void *ptr) > > fclose(fd); > } > + > +/** > + * get_hugepage_region - Allocate an amount of memory backed by huge pages > + * > + * len: Size of the region to allocate > + * flags: Flags specifying the behaviour of the function > + * > + * This function allocates a region of memory backed by huge pages. Care > should > + * be taken when using this function as a drop-in replacement for malloc() as > + * memory can be wasted if the length is not hugepage-aligned. This function > + * is more relaxed than get_huge_pages() in that it allows fallback to small > + * pages when requested. > + */ > +void *get_hugepage_region(size_t len, ghr_t flags) > +{ > + size_t aligned_len, wastage; > + void *buf; > + > + /* Catch an altogether-too easy typo */ > + if (flags & GHP_MASK) > + ERROR("Improper use of GHP_* in get_hugepage_region()\n"); > + > + /* Align the len parameter to a hugepage boundary and allocate */ > + aligned_len = ALIGN(len, gethugepagesize()); > + buf = get_huge_pages(aligned_len, GHP_DEFAULT); > + if (buf == NULL && (flags & GHR_FALLBACK)) { > + aligned_len = ALIGN(len, getpagesize()); > + buf = fallback_base_pages(len, flags); > + } > + > + /* Calculate wastage */ > + wastage = aligned_len - len; > + if (wastage != 0) > + DEBUG("get_hugepage_region: Wasted %zd bytes due to > alignment\n", > + wastage); > + > + return buf; > +} > + > +/** > + * free_hugepage_region - Free a region allocated by get_hugepage_region > + * ptr - The pointer to the buffer returned by get_hugepage_region > + * > + * This function finds a region to free based on the contents of > + * /proc/pid/maps. 
The assumption is made that the ptr is the start of > + * a hugepage region allocated with get_hugepage_region. No checking is made > + * that the pointer is to a hugepage backed region. > + */ > +void free_hugepage_region(void *ptr) > +{ > + free_huge_pages(ptr); > +} > diff --git a/hugetlbfs.h b/hugetlbfs.h > index 0efa02c..ebb676c 100644 > --- a/hugetlbfs.h > +++ b/hugetlbfs.h > @@ -46,9 +46,30 @@ int hugetlbfs_unlinked_fd_for_size(long page_size); > */ > typedef unsigned long ghp_t; > #define GHP_DEFAULT ((ghp_t)0x01UL) > +#define GHP_MASK (GHP_DEFAULT) > > -/* Direct alloc functions */ > +/* Direct alloc functions for hugepages */ > void *get_huge_pages(size_t len, ghp_t flags); > void free_huge_pages(void *ptr); > > +/* > + * Region alloc flags and types > + * > + * GHR_DEFAULT - Use a combination of flags deemed to be a sensible default > + * by the current implementation of the library > + * GHR_FALLBACK - Use the default hugepage size if possible but fallback to > + * smaller pages if necessary > + * GHR_STRICT - Use hugepages of some size or return NULL > + */ > +typedef unsigned long ghr_t; > +#define GHR_STRICT ((ghr_t)0x10000000U) > +#define GHR_FALLBACK ((ghr_t)0x20000000U) > +#define GHR_DEFAULT GHR_FALLBACK > + > +#define GHR_MASK (GHR_FALLBACK|GHR_STRICT) > + > +/* Allocation functions for regions backed by hugepages */ > +void *get_hugepage_region(size_t len, ghr_t flags); > +void free_hugepage_region(void *ptr); > + > #endif /* _HUGETLBFS_H */ > diff --git a/man/get_huge_pages.3 b/man/get_huge_pages.3 > index f2a33a4..af95a82 100644 > --- a/man/get_huge_pages.3 > +++ b/man/get_huge_pages.3 > @@ -64,6 +64,8 @@ mmap() was due to. 
> , > .I gethugepagesize(3) > , > +.I get_hugepage_region(3) > +, > .I libhugetlbfs(7) > .SH AUTHORS > libhugetlbfs was written by various people on the libhugetlbfs-devel > diff --git a/man/get_hugepage_region.3 b/man/get_hugepage_region.3 > new file mode 100644 > index 0000000..ce0b018 > --- /dev/null > +++ b/man/get_hugepage_region.3 > @@ -0,0 +1,79 @@ > +.\" Hey, EMACS: -*- nroff -*- > +.\" First parameter, NAME, should be all caps > +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection > +.\" other parameters are allowed: see man(7), man(1) > +.TH GET_HUGEPAGE_REGION 3 "November 7, 2008" > +.\" Please adjust this date whenever revising the manpage. > +.\" > +.\" Some roff macros, for reference: > +.\" .nh disable hyphenation > +.\" .hy enable hyphenation > +.\" .ad l left justify > +.\" .ad b justify to both left and right margins > +.\" .nf disable filling > +.\" .fi enable filling > +.\" .br insert line break > +.\" .sp <n> insert n+1 empty lines > +.\" for manpage-specific macros, see man(7) > +.SH NAME > +get_hugepage_region, free_hugepage_region \- Allocate and free regions of > memory that use hugepages where possible > +.SH SYNOPSIS > +.B #include <hugetlbfs.h> > +.br > + > +.br > +.B void *get_hugepage_region(size_t len, ghr_t flags); > +.br > +.B void free_hugepage_region(void *ptr); > +.SH DESCRIPTION > + > +\fBget_hugepage_region()\fP allocates a memory region \fBlen\fP bytes in size > +backed by hugepages. Hugepages may be of benefit to applications that use > +large amounts of address space and suffer a performance hit due to TLB > +misses. Wall-clock time or oprofile can be used to determine if there is > +a performance benefit from using hugepages or not. > + > +Unlike \fBget_huge_pages()\fP, \fBlen\fP does not have to be hugepage-aligned > +although memory may be wasted due to alignment. The caller may also specify > +that base pages be used in the event there are no hugepages available. 
> + > +The \fBflags\fP argument changes the behaviour of the function. Flags may > +be or'd together. > + > +.TP > +.B GHR_FALLBACK > +Use base pages if there are an insufficient number of huge pages. > + > +.B GHR_STRICT > +Use hugepages or return NULL. > + > +.B GHR_DEFAULT > + > +The library chooses a sensible combination of flags for allocating a region > of > +memory. The current default is: > + GHR_FALLBACK > + > +.PP > + > +\fBfree_hugepage_region()\fP frees a region of memory allocated by > +\fBget_hugepage_region()\fP. The behaviour of the function if another > +pointer is used, valid or otherwise, is undefined. > + > +.SH RETURN VALUE > + > +On success, a pointer is returned to the allocated memory. On > +error, NULL is returned. errno will be set based on what the failure of > +mmap() was due to. > + > +.SH SEE ALSO > +.I oprofile(1) > +, > +.I gethugepagesize(3) > +, > +.I get_huge_pages(3) > +, > +.I libhugetlbfs(7) > +.SH AUTHORS > +libhugetlbfs was written by various people on the libhugetlbfs-devel > +mailing list. 
> + > diff --git a/tests/Makefile b/tests/Makefile > index 009f75f..4313084 100644 > --- a/tests/Makefile > +++ b/tests/Makefile > @@ -7,8 +7,8 @@ LIB_TESTS = gethugepagesize test_root find_path unlinked_fd > misalign \ > truncate_reserve_wraparound truncate_sigbus_versus_oom \ > map_high_truncate_2 truncate_above_4GB direct \ > misaligned_offset brk_near_huge task-size-overrun stack_grow_into_huge \ > - counters quota heap-overflow get_huge_pages shmoverride_linked \ > - gethugepagesizes > + counters quota heap-overflow get_huge_pages get_hugepage_region \ > + shmoverride_linked gethugepagesizes > LIB_TESTS_64 = straddle_4GB huge_at_4GB_normal_below \ > huge_below_4GB_normal_above > NOLIB_TESTS = malloc malloc_manysmall dummy heapshrink shmoverride_unlinked > diff --git a/tests/get_hugepage_region.c b/tests/get_hugepage_region.c > new file mode 100644 > index 0000000..81428e4 > --- /dev/null > +++ b/tests/get_hugepage_region.c > @@ -0,0 +1,129 @@ > +/* > + * libhugetlbfs - Easy use of Linux hugepages > + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. > + * > + * This library is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public License > + * as published by the Free Software Foundation; either version 2.1 of > + * the License, or (at your option) any later version. > + * > + * This library is distributed in the hope that it will be useful, but > + * WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with this library; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA > + */ > +#include <stdio.h> > +#include <stdlib.h> > +#include <unistd.h> > +#include <sys/mman.h> > + > +#include <hugetlbfs.h> > + > +#include "hugetests.h" > + > +long hpage_size; > +long oc_hugepages = -1; > + > +/* Restore nr_overcommit_hugepages */ > +void cleanup(void) > +{ > + if (oc_hugepages != -1) > + set_nr_overcommit_hugepages(hpage_size, oc_hugepages); > +} > + > +/* Confirm a region really frees, only really important for GHR_FALLBACK */ > +void free_and_confirm_region_free(void *p, int line) { > + unsigned char vec = 0; > + free_hugepage_region(p); > + if (mincore(p, 4, &vec) == 0 || vec) > + FAIL("free_hugepage_region did not free region at line %d", > line); > +} > + > +#define TESTLEN ((num_hugepages - 1) * hpage_size + hpage_size / 2) > + > +void test_GHR_STRICT(int num_hugepages) > +{ > + int err; > + void *p = get_hugepage_region(TESTLEN, GHR_DEFAULT); > + if (p == NULL) > + FAIL("get_hugepage_region() for %d hugepages", num_hugepages); > + > + memset(p, 1, TESTLEN); > + > + err = test_addr_huge(p + (num_hugepages - 1) * hpage_size); > + if (err != 1) > + FAIL("Returned page is not hugepage"); > + > + free_and_confirm_region_free(p, __LINE__); > + err = test_addr_huge(p); > + if (err == 1) > + FAIL("hugepage was not correctly freed"); > +} > + > +void test_GHR_FALLBACK(void) > +{ > + int err; > + long rsvd_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); > + long num_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL) > + - rsvd_hugepages; > + > + /* We must disable overcommitted huge pages to test this */ > + oc_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_OC); > + set_nr_overcommit_hugepages(hpage_size, 0); > + > + /* We should be able to allocate the whole pool */ > + void *p = 
get_hugepage_region(TESTLEN, GHR_DEFAULT); > + if (p == NULL) > + FAIL("test_GHR_FALLBACK(GHR_DEFAULT) failed for %ld hugepages", > + num_hugepages); > + memset(p, 1, TESTLEN); > + err = test_addr_huge(p + (num_hugepages - 1) * hpage_size); > + if (err != 1) > + FAIL("Returned page is not hugepage"); > + free_and_confirm_region_free(p, __LINE__); > + > + /* We should fail allocating too much */ > + num_hugepages++; > + p = get_hugepage_region(TESTLEN, GHR_STRICT); > + if (p != NULL) > + FAIL("test_GHR_FALLBACK() for %ld expected fail, got success", > num_hugepages); > + > + /* GHR_FALLBACK should succeed by allocating base pages */ > + p = get_hugepage_region(TESTLEN, GHR_FALLBACK); > + if (p == NULL) > + FAIL("test_GHR_FALLBACK(GHR_FALLBACK) failed for %ld hugepages", > + num_hugepages); > + memset(p, 1, TESTLEN); > + err = test_addr_huge(p + (num_hugepages - 1) * hpage_size); > + if (err == 1) > + FAIL("Returned page is not a base page"); > + > + /* > + * We allocate a second fallback region to see can they be told apart > + * on free. 
Merging VMAs would cause problems > + */ > + void *pb = get_hugepage_region(TESTLEN, GHR_FALLBACK); > + if (pb == NULL) > + FAIL("test_GHR_FALLBACK(GHR_FALLBACK) x2 failed for %ld > hugepages", > + num_hugepages); > + memset(pb, 1, TESTLEN); > + > + free_and_confirm_region_free(pb, __LINE__); > + free_and_confirm_region_free(p, __LINE__); > +} > + > +int main(int argc, char *argv[]) > +{ > + test_init(argc, argv); > + hpage_size = gethugepagesize(); > + check_free_huge_pages(4); > + test_GHR_STRICT(1); > + test_GHR_STRICT(4); > + test_GHR_FALLBACK(); > + > + PASS(); > +} > diff --git a/tests/run_tests.sh b/tests/run_tests.sh > index 9064451..f163b11 100755 > --- a/tests/run_tests.sh > +++ b/tests/run_tests.sh > @@ -341,6 +341,9 @@ check_linkhuge_tests > # Test direct allocation API > run_test get_huge_pages > > +# Test hugepage-backed region API > + run_test get_hugepage_region > + > # Test overriding of shmget() > run_test shmoverride_linked > run_test LD_PRELOAD=libhugetlbfs.so shmoverride_unlinked > diff --git a/version.lds b/version.lds > index 86cc6b7..e76b8f7 100644 > --- a/version.lds > +++ b/version.lds > @@ -18,6 +18,8 @@ HTLBFS_2.0 { > > HTLBFS_2.1 { > global: > + get_hugepage_region; > + free_hugepage_region; > gethugepagesizes; > getpagesizes; > hugetlbfs_find_path_for_size; -- Adam Litke - (agl at us.ibm.com) IBM Linux Technology Center ------------------------------------------------------------------------- This SF.Net email is sponsored by the Moblin Your Move Developer's challenge Build the coolest Linux based applications with Moblin SDK & win great prizes Grand prize is a trip for two to an Open Source event anywhere in the world http://moblin-contest.org/redirect.php?banner_id=100&url=/ _______________________________________________ Libhugetlbfs-devel mailing list Libhugetlbfs-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel