Hello Ying Han, You recently wrote a test program and posted to LKML as:
Subject: ftruncate-mmap: pages are lost after writing to mmaped file, Date: Wed, 18 Mar 2009 12:44:08 -0700 (Thu, 01:14 IST) > We triggered the failure during some internal experiment with > ftruncate/mmap/write/read sequence. And we found that some pages are > "lost" after writing to the mmaped file. which in the following test > cases (count >= 0). > > First we deployed the test cases into group of machines and see about > >20% failure rate on average. Then, I did couple of experiment to try > to reproduce it on a single machine. what i found is that: > 1. add a fsync after write the file, i can not reproduce this issue. > 2. add memory pressure(mmap/mlock) while run the test in infinite > loop, the failure is reproduced quickly. ( background flushing ? ) > > The "bad pages" count differs each time from one digit to 4,5 digit > for 128M ftruncated file. and what i also found that the bad page > number are contiguous for each segment which total bad pages container > several segments. ext "1-4, 9-20, 48-50" ( batch flushing ? ) > > (The failure is reproduced based on 2.6.29-rc8, also happened on > 2.6.18 kernel. . Here is the simple test case to reproduce it with > memory pressure. ) I would like to add this test as part of LTP (http://ltp.sourceforge.net/) with the following patch, and with your due permission. If you do not have any issue with this, could you please reply to this mail with a Sign-Off? I have made a couple of changes to your test program so that it runs for a configurable number of hours, minutes and seconds. 
I would also like to request you to kindly let us know any test program you develop for the Linux Kernel in future with a: Cc: Subrata Modak <[email protected]>, Cc: ltp-list <[email protected]>, Ported-To-And-Tested-On-LTP-By: Subrata Modak <[email protected]>, --- --- ltp-full-20090331.orig/runtest/stress.part1 2009-04-01 18:23:55.000000000 +0530 +++ ltp-full-20090331/runtest/stress.part1 2009-04-01 19:44:34.000000000 +0530 @@ -18,6 +18,12 @@ mtest06 mmap1 -x 0.05 mem02 mem02 +# Test for mmap() page corruption. This test is meant for +# 1 hour and more. Please change -h(hour), -m(minute) & +# -s(seconds) settings, if default not desired +mmap-corruption01 mmap-corruption01 -h1 -m1 -s1 + + page01 page01 page02 page02 --- ltp-full-20090331.orig/testcases/kernel/mem/mmapstress/mmap-corruption01.c 1970-01-01 05:30:00.000000000 +0530 +++ ltp-full-20090331/testcases/kernel/mem/mmapstress/mmap-corruption01.c 2009-04-01 19:38:20.000000000 +0530 @@ -0,0 +1,185 @@ +/******************************************************************************/ +/* */ +/* Copyright (s) Ying Han <[email protected]>, 2009 */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See */ +/* the GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +/* */ +/******************************************************************************/ +/* + ftruncate-mmap: pages are lost after writing to mmaped file, + + We triggered the failure during some internal experiment with + ftruncate/mmap/write/read sequence. And we found that some pages are + "lost" after writing to the mmaped file. which in the following test + cases (count >= 0). + + First we deployed the test cases into group of machines and see about + >20% failure rate on average. Then, I did couple of experiment to try + to reproduce it on a single machine. what i found is that: + 1. add a fsync after write the file, i can not reproduce this issue. + 2. add memory pressure(mmap/mlock) while run the test in infinite + loop, the failure is reproduced quickly. ( background flushing ? ) + + The "bad pages" count differs each time from one digit to 4,5 digit + for 128M ftruncated file. and what i also found that the bad page + number are contiguous for each segment which total bad pages container + several segments. ext "1-4, 9-20, 48-50" ( batch flushing ? ) + + (The failure is reproduced based on 2.6.29-rc8, also happened on + 2.6.18 kernel. . Here is the simple test case to reproduce it with + memory pressure. ) +*/ + +#include <sys/mman.h> +#include <sys/types.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <signal.h> + +/* Harness Specific Include Files. */ +#include "test.h" +#include "usctest.h" + +/* Extern Global Variables */ +extern int Tst_count; /* counter for tst_xxx routines. */ +extern char *TESTDIR; /* temporary dir created by tst_tmpdir() */ + +/* Global Variables */ +char *TCID = "mmap-corruption01"; /* test program identifier. 
*/
+int TST_TOTAL = 1;	/* total number of tests in this file. */
+
+
+long kMemSize = 128 << 20;	/* size of the mapped test file, in bytes */
+int kPageSize = 4096;	/* page granularity used when counting bad pages */
+
+char *usage="-h hours -m minutes -s secs\n";
+
+/* Run time, in seconds, used when the options add up to zero. */
+enum { DEFAULT_RUN_SECS = 5 };
+
+/* Largest total run time accepted from the command line, in seconds. */
+enum { MAX_RUN_SECS = 0x7fffffff };
+
+/* Set by finish() when a handled signal arrives; polled by main(). */
+static volatile sig_atomic_t stop_requested;
+
+void finish(int sig);
+
+/*
+ * Report a test failure through the harness, remove the temporary
+ * directory and leave via tst_exit().
+ *
+ * The trailing return only satisfies the declared int return type.
+ * NOTE(review): assumes tst_exit() does not return -- confirm against
+ * the harness.
+ */
+int anyfail()
+{
+	tst_resm(TFAIL, "Test failed\n");
+	tst_rmdir();
+	tst_exit();
+	return 1;
+}
+
+/*
+ * Parse a non-negative decimal count from arg, multiply it by unit
+ * (seconds per count) and add the product to *total.
+ *
+ * Returns 0 on success.  Returns -1, leaving *total untouched, when
+ * arg is empty, has trailing characters, is negative, or would push
+ * the total past MAX_RUN_SECS.
+ */
+static int add_duration(const char *arg, unsigned long unit,
+			unsigned long *total)
+{
+	char *end;
+	long value = strtol(arg, &end, 10);
+
+	if (end == arg || *end != '\0' || value < 0)
+		return -1;
+	/*
+	 * Bounding the result keeps the multiplication from wrapping and
+	 * keeps the total small enough to hand to alarm().
+	 */
+	if ((unsigned long)value > (MAX_RUN_SECS - *total) / unit)
+		return -1;
+	*total += (unsigned long)value * unit;
+	return 0;
+}
+
+/*
+ * Count the pages of mem[0..size) whose first byte is zero.
+ *
+ * main() fills the mapping with 1s before calling this, so a zero
+ * byte is one that was lost.  The loop visits every byte, but only a
+ * zero at the start of a page is counted, giving at most one count
+ * per page.
+ */
+static int count_bad_pages(const char *mem, long size, int page_size)
+{
+	int bad = 0;
+	long i;
+
+	for (i = 0; i < size; i++) {
+		if (mem[i] == 0 && (i % page_size) == 0)
+			bad++;
+	}
+	return bad;
+}
+
+/*
+ * Abort after a failed system call.
+ *
+ * Prints the failure with perror() first, so that errno still belongs
+ * to the failed call, then closes fd (unless it is -1), unlinks the
+ * test file, removes the temporary directory and exits with status 1.
+ */
+static void die_cleanup(const char *what, int fd, const char *fname)
+{
+	perror(what);
+	if (fd != -1)
+		close(fd);
+	unlink(fname);
+	tst_rmdir();
+	exit(1);
+}
+
+/*
+ * Repeatedly create a kMemSize-byte file, map it shared, fill the
+ * mapping with 1s and scan it for pages that read back as zero.
+ *
+ * Options: -h hours, -m minutes, -s seconds; the values are summed
+ * into the run time.  With a total of zero the test runs for
+ * DEFAULT_RUN_SECS seconds.
+ *
+ * Exit status: 0 when the run ends on SIGALRM, SIGINT, SIGQUIT or
+ * SIGTERM without a bad page being seen, 1 when bad pages are found
+ * or a system call fails.
+ */
+int main(int argc, char **argv)
+{
+	/* Signals that end the run, with the message printed on failure. */
+	static const struct {
+		int signo;
+		const char *errmsg;
+	} handled[] = {
+		{ SIGINT,  "sigaction error SIGINT" },
+		{ SIGALRM, "sigaction error" },
+		{ SIGQUIT, "sigaction error SIGQUIT" },
+		{ SIGTERM, "sigaction error SIGTERM" },
+	};
+	char *progname = *argv;
+	char *fname = "test.mmap-corruption";
+	unsigned long alarmtime = 0;
+	struct sigaction sa;
+	size_t s;
+	int c;
+
+	while ((c = getopt(argc, argv, ":h:m:s:")) != -1) {
+		int bad_arg;
+
+		switch (c) {
+		case 'h':
+			bad_arg = add_duration(optarg, 60 * 60, &alarmtime);
+			break;
+		case 'm':
+			bad_arg = add_duration(optarg, 60, &alarmtime);
+			break;
+		case 's':
+			bad_arg = add_duration(optarg, 1, &alarmtime);
+			break;
+		default:
+			bad_arg = 1;
+			break;
+		}
+		if (bad_arg) {
+			(void)fprintf(stderr, "usage: %s %s\n", progname,
+				      usage);
+			anyfail();
+		}
+	}
+
+	/*
+	 * Plan for death by signal.  The handler only raises a flag, so
+	 * SA_RESTART is set to keep an in-flight system call from failing
+	 * with EINTR; the loop below notices the flag on its next pass.
+	 */
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_handler = finish;
+	sa.sa_flags = SA_RESTART;
+	if (sigemptyset(&sa.sa_mask)) {
+		perror("sigempty error");
+		exit(1);
+	}
+	for (s = 0; s < sizeof(handled) / sizeof(handled[0]); s++) {
+		if (sigaction(handled[s].signo, &sa, NULL) == -1) {
+			perror(handled[s].errmsg);
+			exit(1);
+		}
+	}
+
+	if (alarmtime == 0)
+		alarmtime = DEFAULT_RUN_SECS;
+	(void)alarm((unsigned int)alarmtime);
+	printf("mmap-corruption will run for=> %lu, seconds\n", alarmtime);
+
+	tst_tmpdir();
+	while (!stop_requested) {
+		char *mem;
+		int count;
+		int fd;
+
+		unlink(fname);
+		fd = open(fname, O_CREAT | O_EXCL | O_RDWR, 0600);
+		if (fd == -1)
+			die_cleanup("open", -1, fname);
+		if (ftruncate(fd, kMemSize) == -1)
+			die_cleanup("ftruncate", fd, fname);
+
+		mem = mmap(NULL, kMemSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+			   fd, 0);
+		if (mem == MAP_FAILED)
+			die_cleanup("mmap", fd, fname);
+
+		/* Fill the memory with 1s, then look for bytes that are 0. */
+		memset(mem, 1, kMemSize);
+		count = count_bad_pages(mem, kMemSize, kPageSize);
+
+		munmap(mem, kMemSize);
+		close(fd);
+		unlink(fname);
+		if (count > 0) {
+			printf("Running %d bad page\n", count);
+			tst_rmdir();
+			return 1;
+		}
+	}
+
+	printf("mmap-corruption PASSED\n");
+	tst_rmdir();
+	return 0;
+}
+
+/*
+ * Signal handler for SIGINT, SIGALRM, SIGQUIT and SIGTERM.
+ *
+ * Only records that a stop was requested; main() sees the flag
+ * between iterations, cleans up and reports the result.  Limiting the
+ * handler to one flag store avoids calling printf() and exit() from
+ * signal context, where they are not async-signal-safe.
+ */
+void finish(int sig)
+{
+	(void)sig;
+	stop_requested = 1;
+}
+
--- Regards-- Subrata ------------------------------------------------------------------------------ _______________________________________________ Ltp-list mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/ltp-list
