Since 2.6.27-rc1, the kernel makes reservations for mappings at mmap() time. This guarantees that the process that successfully calls mmap() will successfully fault all pages within that region. This is nice reliable behaviour but it can be the case that the program wants to create a very large sparse mapping. In this case, mmap() will fail even if the program knows the huge pages are available.
This patch introduces a --no-reserve switch that uses MAP_NORESERVE. mmap() will always succeed but the fault might not. Unfortunately, on older kernels, use of MAP_NORESERVE can trigger the OOM killer. Hence, this patch also checks the kernel version and only allows use of MAP_NORESERVE if it's safe to do so. Signed-off-by: Mel Gorman <m...@csn.ul.ie> --- alloc.c | 3 ++- elflink.c | 12 +++++++++++- hugectl.c | 12 ++++++++++++ hugeutils.c | 20 ++++++++++++++++++++ init.c | 1 + kernel-features.c | 4 ++++ libhugetlbfs_internal.h | 3 +++ libhugetlbfs_privutils.h | 4 ++++ man/hugectl.8 | 10 ++++++++++ man/libhugetlbfs.7 | 13 +++++++++++++ morecore.c | 3 ++- 11 files changed, 82 insertions(+), 3 deletions(-) diff --git a/alloc.c b/alloc.c index 60a525e..a7d37e5 100644 --- a/alloc.c +++ b/alloc.c @@ -79,6 +79,7 @@ void *get_huge_pages(size_t len, ghp_t flags) void *buf; int buf_fd; int saved_error; + int mmap_reserve = __hugetlb_opts.no_reserve ? MAP_NORESERVE : 0; /* Catch an altogether-too easy typo */ if (flags & GHR_MASK) @@ -94,7 +95,7 @@ void *get_huge_pages(size_t len, ghp_t flags) /* Map the requested region */ buf = mmap(NULL, len, PROT_READ|PROT_WRITE, - MAP_PRIVATE, buf_fd, 0); + MAP_PRIVATE|mmap_reserve, buf_fd, 0); if (buf == MAP_FAILED) { close(buf_fd); diff --git a/elflink.c b/elflink.c index 22d49a3..159edae 100644 --- a/elflink.c +++ b/elflink.c @@ -836,6 +836,7 @@ static int prepare_segment(struct seg_info *seg) unsigned long size, offset; long page_size = getpagesize(); long hpage_size; + int mmap_reserve = __hugetlb_opts.no_reserve ? 
MAP_NORESERVE : 0; hpage_size = seg->page_size; @@ -869,7 +870,8 @@ static int prepare_segment(struct seg_info *seg) check_range_empty(end, new_end - end); /* Create the temporary huge page mmap */ - p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, seg->fd, 0); + p = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_SHARED|mmap_reserve, seg->fd, 0); if (p == MAP_FAILED) { WARNING("Couldn't map hugepage segment to copy data: %s\n", strerror(errno)); @@ -1123,6 +1125,10 @@ static void remap_segments(struct seg_info *seg, int num) mapsize = ALIGN(offset + seg[i].memsz, hpage_size); mmap_flags = MAP_PRIVATE|MAP_FIXED; + /* If requested, make no reservations */ + if (__hugetlb_opts.no_reserve) + mmap_flags |= MAP_NORESERVE; + /* * If this is a read-only mapping whose contents are * entirely contained within the file, then use MAP_NORESERVE. @@ -1244,6 +1250,10 @@ static int check_env(void) } } + INFO("HUGETLB_NO_RESERVE=%s, reservations %s\n", + __hugetlb_opts.no_reserve ? "yes" : "no", + __hugetlb_opts.no_reserve ? 
"disabled" : "enabled"); + return 0; } diff --git a/hugectl.c b/hugectl.c index c968e14..e2a9828 100644 --- a/hugectl.c +++ b/hugectl.c @@ -69,6 +69,7 @@ void print_usage() OPTION("--shm", "Requests remapping of shared memory segments"); OPTION("--no-preload", "Disable preloading the libhugetlbfs library"); + OPTION("--no-reserve", "Disable huge page reservation for segments"); OPTION("--force-preload", "Force preloading the libhugetlbfs library"); OPTION("--dry-run", "describe what would be done without doing it"); @@ -161,6 +162,7 @@ void verbose_expose(void) #define LONG_BASE 0x2000 #define LONG_NO_PRELOAD (LONG_BASE | 'p') +#define LONG_NO_RESERVE (LONG_BASE | 'r') #define LONG_FORCE_PRELOAD (LONG_BASE | 'F') #define LONG_DRY_RUN (LONG_BASE | 'd') @@ -337,6 +339,7 @@ int main(int argc, char** argv) { int opt_mappings = 0; int opt_preload = 1; + int opt_no_reserve = 0; int opt_share = 0; char *opt_library = NULL; @@ -346,6 +349,7 @@ int main(int argc, char** argv) {"help", no_argument, NULL, 'h'}, {"verbose", required_argument, NULL, 'v' }, {"no-preload", no_argument, NULL, LONG_NO_PRELOAD}, + {"no-reserve", no_argument, NULL, LONG_NO_RESERVE}, {"force-preload", no_argument, NULL, LONG_FORCE_PRELOAD}, {"dry-run", no_argument, NULL, LONG_DRY_RUN}, @@ -399,6 +403,11 @@ int main(int argc, char** argv) INFO("LD_PRELOAD disabled\n"); break; + case LONG_NO_RESERVE: + opt_no_reserve = 1; + INFO("MAP_NORESERVE used for huge page mappings\n"); + break; + case LONG_FORCE_PRELOAD: opt_preload = 1; opt_force_preload = 1; @@ -449,6 +458,9 @@ int main(int argc, char** argv) if (opt_preload) ldpreload(opt_mappings); + if (opt_no_reserve) + setup_environment("HUGETLB_NO_RESERVE", "yes"); + if (opt_share) setup_environment("HUGETLB_SHARE", "1"); diff --git a/hugeutils.c b/hugeutils.c index bfa4512..1e35597 100644 --- a/hugeutils.c +++ b/hugeutils.c @@ -300,6 +300,11 @@ void hugetlbfs_setup_env() env = getenv("HUGETLB_SHM"); if (env && !strcmp(env, "yes")) 
__hugetlb_opts.shm_enabled = 1; + + /* Determine if all reservations should be avoided */ + env = getenv("HUGETLB_NO_RESERVE"); + if (env && !strcmp(env, "yes")) + __hugetlb_opts.no_reserve = 1; } void hugetlbfs_check_priv_resv() @@ -316,6 +321,21 @@ void hugetlbfs_check_priv_resv() } } +void hugetlbfs_check_safe_noreserve() +{ + /* + * Some kernels will trigger an OOM if MAP_NORESERVE is used and + * a huge page allocation fails. This is unfortunate so limit + * the use of NORESERVE where necessary + */ + if (__hugetlb_opts.no_reserve && + !hugetlbfs_test_feature(HUGETLB_FEATURE_SAFE_NORESERVE)) { + INFO("Kernel is not safe for MAP_NORESERVE. Forcing " + "use of reservations.\n"); + __hugetlb_opts.no_reserve = 0; + } +} + /* * Pool counters are typically exposed in sysfs in modern kernels, the * counters for the default page size are exposed in procfs in all kernels diff --git a/init.c b/init.c index e95cb5c..049c9e2 100644 --- a/init.c +++ b/init.c @@ -26,6 +26,7 @@ static void __attribute__ ((constructor)) setup_libhugetlbfs(void) setup_mounts(); setup_features(); hugetlbfs_check_priv_resv(); + hugetlbfs_check_safe_noreserve(); #ifndef NO_ELFLINK hugetlbfs_setup_elflink(); #endif diff --git a/kernel-features.c b/kernel-features.c index 4bb0149..68bc8f9 100644 --- a/kernel-features.c +++ b/kernel-features.c @@ -41,6 +41,10 @@ static struct feature kernel_features[] = { .name = "private_reservations", .required_version = "2.6.27-rc1", }, + [HUGETLB_FEATURE_SAFE_NORESERVE] = { + .name = "noreserve_safe", + .required_version = "2.6.35", + } }; static void debug_kernel_version(void) diff --git a/libhugetlbfs_internal.h b/libhugetlbfs_internal.h index 8b1709c..6e9379e 100644 --- a/libhugetlbfs_internal.h +++ b/libhugetlbfs_internal.h @@ -60,6 +60,7 @@ struct libhugeopts_t { int sharing; int shrink_ok; int shm_enabled; + int no_reserve; unsigned long force_elfmap; char *ld_preload; char *elfmap; @@ -100,6 +101,8 @@ extern void setup_mounts(); extern void 
setup_features(); #define hugetlbfs_check_priv_resv __lh_hugetlbfs_check_priv_resv extern void hugetlbfs_check_priv_resv(); +#define hugetlbfs_check_safe_noreserve __lh_hugetlbfs_check_safe_noreserve +extern void hugetlbfs_check_safe_noreserve(); #define __hugetlbfs_hostname __lh___hugetlbfs_hostname extern char __hugetlbfs_hostname[]; #define hugetlbfs_prefault __lh_hugetlbfs_prefault diff --git a/libhugetlbfs_privutils.h b/libhugetlbfs_privutils.h index 730c939..18bcedb 100644 --- a/libhugetlbfs_privutils.h +++ b/libhugetlbfs_privutils.h @@ -75,6 +75,10 @@ void restore_overcommit_pages(long page_size, long oc_pool); enum { /* Reservations are created for private mappings */ HUGETLB_FEATURE_PRIVATE_RESV, + + /* Whether use of MAP_NORESERVE is safe or can result in OOM */ + HUGETLB_FEATURE_SAFE_NORESERVE, + HUGETLB_FEATURE_NR, }; #define hugetlbfs_test_feature __pu_hugetlbfs_test_feature diff --git a/man/hugectl.8 b/man/hugectl.8 index 819863e..319742e 100644 --- a/man/hugectl.8 +++ b/man/hugectl.8 @@ -82,6 +82,16 @@ where binaries are aligned to 64K as required by the ABI and the kernel is using a 4K base pagesize. .TP +.B --no-reserve +By default, huge pages are reserved at mmap() time so future faults will +succeed. This avoids unexpected application failure but some applications depend +on memory overcommit to create large sparse mappings. For this type of +application, this switch will create huge page backed mappings without a +reservation if the kernel is recent enough to make this operation safe. +Use this option with extreme care as in the event huge pages are not +available when the mapping is faulted, the application will be killed. + +.TP .B --dry-run Instead of running the process, the \fBhugectl\fP utility will describe what environment variables it set for \fBlibhugetlbfs\fP. 
This is useful if diff --git a/man/libhugetlbfs.7 b/man/libhugetlbfs.7 index 0d63734..a8de5d2 100644 --- a/man/libhugetlbfs.7 +++ b/man/libhugetlbfs.7 @@ -119,6 +119,19 @@ the hugepage pool is large enough to run the application or the kernel is 2.6.27 or later, this environment variable should be set. .TP +.B HUGETLB_NO_RESERVE=yes + +By default, the kernel will reserve huge pages at mmap() time to ensure that +future faults will succeed. This avoids unexpected application failure at +fault time but some applications depend on memory overcommit to create +large sparse mappings. For this type of application, setting this environment +variable will create huge page backed mappings without a reservation. Use +this option with extreme care as in the event huge pages are not available +when the mapping is used, the application will be killed. On older kernels, +the use of this feature can trigger the OOM killer. Hence, even with this +variable set, reservations may still be used for safety. + +.TP .B HUGETLB_MORECORE_HEAPBASE=address \fBlibhugetlbfs\fP normally picks an address to use as the base of the heap for malloc() automatically. This environment variable fixes which address is used. diff --git a/morecore.c b/morecore.c index 232e04a..869fcbd 100644 --- a/morecore.c +++ b/morecore.c @@ -72,6 +72,7 @@ static void *hugetlbfs_morecore(ptrdiff_t increment) int ret; void *p; long delta; + int mmap_reserve = __hugetlb_opts.no_reserve ? 
MAP_NORESERVE : 0; INFO("hugetlbfs_morecore(%ld) = ...\n", (long)increment); @@ -94,7 +95,7 @@ static void *hugetlbfs_morecore(ptrdiff_t increment) /* map in (extend) more of the file at the end of our last map */ p = mmap(heapbase + mapsize, delta, PROT_READ|PROT_WRITE, - MAP_PRIVATE, heap_fd, mapsize); + MAP_PRIVATE|mmap_reserve, heap_fd, mapsize); if (p == MAP_FAILED) { WARNING("New heap segment map at %p failed: %s\n", heapbase+mapsize, strerror(errno)); ------------------------------------------------------------------------------ _______________________________________________ Libhugetlbfs-devel mailing list Libhugetlbfs-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel