The kernel (correctly) returns EINVAL if you attempt a hugepage mapping with a file offset which is not hugepage aligned. However, current kernels have a bug in the corresponding backout path, which attempts to use the normal unmap_region() on the (half-constructed) hugepage VMA. On ppc64, where the mapping can fall within a region already set aside for hugepages, this can trigger bad_pud() with consequent nastiness.
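For anyone who wants to poke at the kernel behaviour outside the test harness, the offending condition boils down to a single mmap() call. Below is a minimal sketch (not part of the patch) linked with -lhugetlbfs and using the same gethugepagesize() and hugetlbfs_unlinked_fd() helpers as the testcase: an offset of one base page can never be hugepage aligned, so a correct kernel must fail the call with EINVAL, while a kernel with the buggy backout path may blow up later when the surrounding page tables are walked.

/* Minimal sketch, not part of the patch: ask for a hugepage mapping at
 * a file offset of one *base* page, which cannot be hugepage aligned.
 * A correct kernel must refuse this with EINVAL. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>

#include <hugetlbfs.h>

int main(void)
{
	long hpage_size = gethugepagesize();
	int fd = hugetlbfs_unlinked_fd();
	void *p;

	if (hpage_size <= 0 || fd < 0) {
		fprintf(stderr, "no hugepage support or no hugetlbfs mount\n");
		return 1;
	}

	/* getpagesize() is smaller than any hugepage size, so this
	 * offset is necessarily misaligned for a hugepage mapping */
	p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE,
		 fd, getpagesize());
	if (p == MAP_FAILED && errno == EINVAL)
		printf("kernel correctly rejected the misaligned offset\n");
	else
		printf("unexpected result: p=%p, errno=%s\n",
		       p, strerror(errno));
	return 0;
}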
This patch adds a testcase to libhugetlbfs to catch this bug.

Signed-off-by: David Gibson <[EMAIL PROTECTED]>

Index: libhugetlbfs/tests/Makefile
===================================================================
--- libhugetlbfs.orig/tests/Makefile	2006-11-13 14:46:04.000000000 +1100
+++ libhugetlbfs/tests/Makefile	2006-11-13 14:46:18.000000000 +1100
@@ -5,7 +5,8 @@ LIB_TESTS = gethugepagesize test_root fi
 	ptrace-write-hugepage icache-hygeine slbpacaflush \
 	chunk-overcommit mprotect alloc-instantiate-race mlock \
 	truncate_reserve_wraparound truncate_sigbus_versus_oom \
-	map_high_truncate_2 truncate_above_4GB
+	map_high_truncate_2 truncate_above_4GB \
+	misaligned_offset
 LIB_TESTS_64 = straddle_4GB huge_at_4GB_normal_below \
 	huge_below_4GB_normal_above
 NOLIB_TESTS = malloc malloc_manysmall dummy
Index: libhugetlbfs/tests/run_tests.sh
===================================================================
--- libhugetlbfs.orig/tests/run_tests.sh	2006-11-13 14:46:04.000000000 +1100
+++ libhugetlbfs/tests/run_tests.sh	2006-11-13 14:46:18.000000000 +1100
@@ -125,6 +125,7 @@ functional_tests () {
     run_test_bits 64 huge_at_4GB_normal_below
     run_test_bits 64 huge_below_4GB_normal_above
     run_test map_high_truncate_2
+    run_test misaligned_offset
     run_test truncate_above_4GB
 
     # Tests requiring an active mount and hugepage COW
Index: libhugetlbfs/tests/misaligned_offset.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ libhugetlbfs/tests/misaligned_offset.c	2006-11-13 15:26:46.000000000 +1100
@@ -0,0 +1,153 @@
+/*
+ * libhugetlbfs - Easy use of Linux hugepages
+ * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation.
+ * Copyright (C) 2006 Hugh Dickins <[EMAIL PROTECTED]>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <signal.h>
+#include <sys/mman.h>
+
+#include <hugetlbfs.h>
+
+#include "hugetests.h"
+
+/*
+ * Test rationale:
+ *
+ * The kernel (correctly) returns EINVAL if you attempt a hugepage
+ * mapping with a file offset which is not hugepage aligned.  However,
+ * some kernels have a bug in the corresponding backout path, which
+ * attempts to use the normal unmap_region() on the (half-constructed)
+ * hugepage VMA.  On ppc64, where the area can lie in a region already
+ * set aside for hugepages, this can trigger bad_pud() with consequent
+ * nastiness.  This test attempts such a bogus mapping next to a sane,
+ * instantiated hugepage mapping in order to catch that bug.
+ *
+ * WARNING: The offsets and addresses used within are specifically
+ * calculated to trigger the bug as it existed.  Don't mess with them
+ * unless you *really* know what you're doing.
+ */
+
+/* Return the current number of free hugepages from /proc/meminfo */
+static unsigned long long read_free(void)
+{
+	FILE *f;
+	unsigned long long count;
+	int ret;
+
+	f = popen("grep HugePages_Free /proc/meminfo", "r");
+	if (!f || ferror(f))
+		CONFIG("Couldn't read Free information: %s", strerror(errno));
+
+	ret = fscanf(f, "HugePages_Free: %llu", &count);
+	if (ret != 1)
+		CONFIG("Couldn't parse HugePages_Free information");
+
+	return count;
+}
+
+#define RANDOM_CONSTANT	0x1234ABCD
+
+int main(int argc, char *argv[])
+{
+	int page_size;
+	int hpage_size;
+	off_t buggy_offset;
+	int fd;
+	void *p, *q;
+	volatile int *pi;
+	int err;
+
+	test_init(argc, argv);
+
+	page_size = getpagesize();
+	hpage_size = gethugepagesize();
+	if (hpage_size < 0)
+		CONFIG("No hugepage kernel support");
+
+	fd = hugetlbfs_unlinked_fd();
+	if (fd < 0)
+		FAIL("hugetlbfs_unlinked_fd()");
+
+	/* First, we make a 2 page sane hugepage mapping.  Then we
+	 * memset() it to ensure that the ptes are instantiated for
+	 * it.  Then we attempt to replace the second half of the map
+	 * with one at a bogus offset.  We leave the first page of
+	 * sane mapping in place to ensure that the corresponding
+	 * pud/pmd/whatever entries aren't cleaned away.  It's those
+	 * bad entries which can trigger bad_pud() checks if the
+	 * backout path for the bogus mapping is buggy, which it was
+	 * in some kernels. */
+
+	verbose_printf("Free hugepages: %lld\n", read_free());
+
+	verbose_printf("Mapping reference map...");
+	/* Sane mapping: 2 hugepages at file offset 0 */
+	p = mmap(NULL, 2*hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+	if (p == MAP_FAILED)
+		FAIL("mmap() reference map");
+	verbose_printf("%p-%p\n", p, p+2*hpage_size-1);
+
+	verbose_printf("Free hugepages: %lld\n", read_free());
+
+	/* Instantiate the pages */
+	verbose_printf("Instantiating...");
+	memset(p, 0, 2*hpage_size);
+	pi = p;
+	*pi = RANDOM_CONSTANT;
+	verbose_printf("done.\n");
+
+	verbose_printf("Free hugepages: %lld\n", read_free());
+
+	/* Toggle the permissions on the first page.  This forces TLB
+	 * entries (including hash page table on powerpc) to be
+	 * flushed, so that the page tables must be accessed for the
+	 * test further down.  In the buggy case, those page tables
+	 * can get thrown away by a pud_clear() */
+	err = mprotect(p, hpage_size, PROT_READ);
+	if (err)
+		FAIL("mprotect(%p, 0x%x, PROT_READ)", p, hpage_size);
+
+	/* Replace top hpage by hpage mapping at confusing file offset */
+	buggy_offset = page_size;
+	verbose_printf("Replacing map at %p with map from offset 0x%lx...",
+		       p + hpage_size, (unsigned long)buggy_offset);
+	q = mmap(p + hpage_size, hpage_size, PROT_READ|PROT_WRITE,
+		 MAP_FIXED|MAP_PRIVATE, fd, buggy_offset);
+	if (q != MAP_FAILED)
+		FAIL("bogus offset mmap() succeeded at %p\n", q);
+	if (errno != EINVAL)
+		FAIL("bogus mmap() failed with \"%s\" instead of \"%s\"",
+		     strerror(errno), strerror(EINVAL));
+	verbose_printf("%s\n", strerror(errno));
+
+	verbose_printf("Free hugepages: %lld\n", read_free());
+
+	if (*pi != RANDOM_CONSTANT)
+		FAIL("Pre-existing mapping clobbered: %x instead of %x",
+		     *pi, RANDOM_CONSTANT);
+
+	verbose_printf("Free hugepages: %lld\n", read_free());
+
+	/* The real test is whether we got a bad_pud() or similar
+	 * during the run.  The check above, combined with the earlier
+	 * mprotect() to flush the TLB, is supposed to catch it, but
+	 * it's hard to be certain.  Once bad_pud() is called
+	 * behaviour can be very strange. */
+	PASS_INCONCLUSIVE();
+}

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson