On 16.11.2006 [10:50:13 +1100], David Gibson wrote: > On Wed, Nov 15, 2006 at 01:09:07PM -0800, Nishanth Aravamudan wrote: > > On 15.11.2006 [10:41:19 +1100], David Gibson wrote: > > > On Tue, Nov 14, 2006 at 02:33:59PM -0800, Nishanth Aravamudan wrote: > > > > On 14.11.2006 [12:21:36 -0800], Nishanth Aravamudan wrote: > > > > > Hi all, > > > > > > > > > > I'm hitting a brick wall debugging the linkshare segfaults I'm seeing. > > > > > > > > > > (These logs are from my 2-way x86_64, but I'm seeing similar issues > > > > > on a G5 > > > > > (ppc64): > > > > > > > > > > HUGETLB_SHARE=2 xB.linkshare 2 (32): PASS > > > > > HUGETLB_SHARE=2 xB.linkshare 2 (64): PASS > > > > > HUGETLB_SHARE=1 xB.linkshare 2 (32): FAIL 2 of 2 children > > > > > exited abnormally > > > > > HUGETLB_SHARE=1 xB.linkshare 2 (64): FAIL 2 of 2 children > > > > > exited abnormally > > > > > HUGETLB_SHARE=2 xBDT.linkshare 2 (32): PASS > > > > > HUGETLB_SHARE=2 xBDT.linkshare 2 (64): PASS > > > > > HUGETLB_SHARE=1 xBDT.linkshare 2 (32): FAIL 2 of 2 children > > > > > exited abnormally > > > > > HUGETLB_SHARE=1 xBDT.linkshare 2 (64): FAIL 2 of 2 children > > > > > exited abnormally > > > > > > > > > > With all 4 failures being segmentation faults we caught. > > > > > > > > /me hangs head in shame. > > > > > > > > This is all probably just a stupid programming error on my part. I'll > > > > have a fix, I think, once I return from class. > > > > > > Btw, some of the existing testcases (e.g. alloc-instantiate-race) use > > > strsignal() and WTERMSIG() to give a more informative message when a > > > child is killed by a signal. It's probably a good idea to use that > > > here too, so you can see they died with a SEGV at first glance. > > > > Yes, this is done with a verbose test run. If I were to do it via a > > FAIL statement, we'd get 3 FAIL lines for every failing case. I > > suppose I could add a FAIL_CONT() for this... > > Um.. I don't really follow you.
If you look at the patch I sent previously, we do print out the signal information with strsignal, but via verbose_printf(). If I were to do so via a FAIL() line, we'd either only print out the signal for the first child (since the testcase would fail immediately), or we'd have to add a FAIL_CONT() or something to allow me to indicate failure without failing the testcase immediately. In any case, I've spent a good amount of time cleaning and fixing the linkshare testcase yesterday and today. Here is what I have so far. We are still getting segfaults on xBDT.linkshare 64-bit with HUGETLB_SHARE=2, but I went and checked and it's not a testcase issue, I don't think. We also will segfault, for instance, if xBDT.linkhuge 64-bit is run manually with HUGETLB_SHARE=2 two times in a row. I am now looking into the root cause of this failure. The patch is pretty much ready for inclusion, I think, but I'd like one more round of review. Signed-off-by: Nishanth Aravamudan <[EMAIL PROTECTED]> tests/linkshare.c | 315 +++++++++++++++++++++++++++++++----------------------- tests/testutils.c | 9 + 2 files changed, 192 insertions(+), 132 deletions(-) diff --git a/tests/linkshare.c b/tests/linkshare.c index 3461a7d..48e80fe 100644 --- a/tests/linkshare.c +++ b/tests/linkshare.c @@ -16,6 +16,7 @@ * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> @@ -24,6 +25,7 @@ #include <time.h> #include <errno.h> #include <limits.h> +#include <string.h> #include <sys/types.h> #include <sys/mman.h> #include <sys/shm.h> @@ -80,6 +82,11 @@ static struct test_entry { #define NUM_TESTS (sizeof(testtab) / sizeof(testtab[0])) +static int sharing; +static int elfmap_off; +static int shmid; +static ino_t *shm; + static char link_string[32]; static void get_link_string(const char *argv0) @@ -134,11 +141,151 @@ static ino_t do_test(struct test_entry * return get_addr_inode(te->data); } -int main(int argc, char *argv[], char *envp[]) +static int parse_env(int num_sharings) +{ + char *env; + + env = getenv("HUGETLB_ELFMAP"); + if (env && (strcasecmp(env, "no") == 0)) { + verbose_printf("Segment remapping disabled\n"); + elfmap_off = 1; + } else { + env = getenv("HUGETLB_SHARE"); + if (env) + sharing = atoi(env); + verbose_printf("Segment remapping enabled, " + "sharing = %d\n", sharing); + } + + if (num_sharings > 99999) + CONFIG("Too many sharings requested (max = 99999)"); + if (num_sharings <= 0) + CONFIG("Number of sharings requested must be greater " + "than or equal to 0"); + return num_sharings; +} + +static pid_t spawn_child(char *self, int index, int num_sharings) +{ + int ret; + char execarg1[5], execarg2[5]; + + ret = snprintf(execarg1, 5, "%d", num_sharings); + if (ret < 0) + FAIL("snprintf failed: %s", strerror(errno)); + + ret = snprintf(execarg2, 5, "%d", index); + if (ret < 0) + FAIL("snprintf failed: %s", strerror(errno)); + + ret = fork(); + if (ret) { + if (ret < 0) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("fork failed: %s", + strerror(errno)); + } + } else { + ret = execlp(self, self, execarg1, + execarg2, NULL); + if (ret) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("execl(%s, %s, %s, %s failed: %s", + self, self, execarg1, + execarg2, strerror(errno)); + } + } + + return ret; +} + +static int child_process(char *self, int num_sharings, int index) { int i; - int shmid; - ino_t *shm; + ino_t ino; + + get_link_string(self); + + shmid = shmget(SHM_KEY, num_sharings * NUM_TESTS * + sizeof(ino_t), 0666); + if (shmid < 0) + FAIL("Child's shmget failed: %s", strerror(errno)); + + shm = shmat(shmid, NULL, 0); + if (shm == (void *)-1) + FAIL("shmat failed: %s", strerror(errno)); + + for (i = 0; i < NUM_TESTS; i++) { + if (!test_addr_huge(testtab + i)) { + shm[index * NUM_TESTS + i] = 0; + } else { + ino = do_test(testtab + i); + if ((int)ino < 0) { + shmdt(shm); + exit(RC_FAIL); + } + shm[index * NUM_TESTS + i] = ino; + } + } + shmdt(shm); + return 0; +} + +static void verify_inodes(int num_sharings) +{ + int i, j; + + for (i = 0; i < NUM_TESTS; i++) { + ino_t base = shm[i]; + for (j = 1; j < num_sharings; j++) { + ino_t comp = shm[j * NUM_TESTS + i]; + if (base != comp) { + /* + * we care if we mismatch if + * a) sharing all segments or + * b) sharing only read-only + * segments and this is one + */ + if (sharing == 2 || + (sharing == 1 && testtab[i].writable == 0)) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("Inodes do not match " + "(%u != %u)", + (int)base, (int)comp); + } + } else { + /* + * we care if we match if + * a) not remapping or + * b) not sharing or + * c) sharing only read-only + * segments and this is not one + * BUT only if the inode is not + * 0 (don't care about the file) + */ + if (base == 0) + continue; + + if (elfmap_off == 1 || sharing == 0 || + (sharing == 1 && testtab[i].writable == 1)) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + if (sharing == 1 && testtab[i].writable == 1) + verbose_printf("Incorrectly sharing a writable segment...\n"); + FAIL("Inodes match, but we should not be " + "sharing this segment (%d == %d)", + (int)base, (int)comp); + } + } + } + } +} + +int main(int argc, char *argv[], char *envp[]) +{ int num_sharings; test_init(argc, argv); @@ -148,33 +295,16 @@ int main(int argc, char *argv[], char *e * first process * arg1 = num_sharings */ - char *env; - pid_t *children; - int ret, j; + pid_t *children_pids; + int ret, i; /* both default to 0 */ - int sharing = 0, elfmap_off = 0; + int child_failed = 0; + int status; - env = getenv("HUGETLB_ELFMAP"); - if (env && (strcasecmp(env, "no") == 0)) { - verbose_printf("Segment remapping disabled\n"); - elfmap_off = 1; - } else { - env = getenv("HUGETLB_SHARE"); - if (env) - sharing = atoi(env); - verbose_printf("Segment remapping enabled, " - "sharing = %d\n", sharing); - } + num_sharings = parse_env(atoi(argv[1])); - num_sharings = atoi(argv[1]); - if (num_sharings > 99999) - CONFIG("Too many sharings requested (max = 99999)"); - if (num_sharings <= 0) - CONFIG("Number of sharings requested must be greater " - "than or equal to 0"); - - children = (pid_t *)malloc(num_sharings * sizeof(pid_t)); - if (!children) + children_pids = (pid_t *)malloc(num_sharings * sizeof(pid_t)); + if (!children_pids) FAIL("malloc failed: %s", strerror(errno)); shmid = shmget(SHM_KEY, num_sharings * NUM_TESTS * @@ -187,92 +317,39 @@ int main(int argc, char *argv[], char *e if (shm == (void *)-1) FAIL("shmat failed: %s", strerror(errno)); + for (i = 0; i < num_sharings; i++) + children_pids[i] = spawn_child(argv[0], i, num_sharings); + for (i = 0; i < num_sharings; i++) { - char execarg1[5], execarg2[5]; - ret = snprintf(execarg1, 5, "%d", num_sharings); - if (ret < 0) - FAIL("snprintf failed: %s", strerror(errno)); - - ret = snprintf(execarg2, 5, "%d", i); - if (ret < 0) - FAIL("snprintf failed: %s", strerror(errno)); - - ret = fork(); - if (ret) { - if (ret < 0) - FAIL("fork failed: %s", - strerror(errno)); - children[i] = ret; - } else { - ret = execlp(argv[0], argv[0], execarg1, - execarg2, NULL); - if (ret) { - shmctl(shmid, IPC_RMID, NULL); - shmdt(shm); - FAIL("execl(%s, %s, %s, %s failed: %s", - argv[0], argv[0], execarg1, - execarg2, strerror(errno)); - } - } - } - for (i = 0; i < num_sharings; i++) { - ret = waitpid(children[i], NULL, 0); + ret = waitpid(children_pids[i], &status, 0); if (ret < 0) { shmctl(shmid, IPC_RMID, NULL); shmdt(shm); FAIL("waitpid failed: %s", strerror(errno)); } - } - for (i = 0; i < NUM_TESTS; i++) { - ino_t base = shm[i]; - for (j = 1; j < num_sharings; j++) { - ino_t comp = shm[j * NUM_TESTS + i]; - if (base != comp) { - /* - * we care if we mismatch if - * a) sharing all segments or - * b) sharing only read-only - * segments and this is one - */ - if ((sharing == 2) || - ((sharing == 1) && - (testtab[i].writable == 0))) { - shmctl(shmid, IPC_RMID, NULL); - shmdt(shm); - FAIL("Inodes do not match " - "(%u != %u)", - (int)base, (int)comp); - } - } else { - /* - * we care if we match if - * a) not remapping or - * b) not sharing or - * c) sharing only read-only - * segments and this is not one - */ - if ((elfmap_off == 1) || - (sharing == 0) || - ((sharing == 1) && - (testtab[i].writable == 1) && - (base == -1))) { - shmctl(shmid, IPC_RMID, NULL); - shmdt(shm); - if ((sharing == 1) && - (testtab[i].writable == 1)) - verbose_printf( - "sharing a " - "writable " - "segment...\n"); - FAIL("Inodes match, " - "but we should not be " - "sharing this segment " - "(%d == %d)", - (int)base, (int)comp); - } - } + + if (WIFEXITED(status) && WEXITSTATUS(status) != 0) { + child_failed++; + verbose_printf("Child %d exited with non-zero status: %d\n", + i + 1, WEXITSTATUS(status)); } + + if (WIFSIGNALED(status)) { + child_failed++; + verbose_printf("Child %d killed by signal: %s\n", i + 1, + strsignal(WTERMSIG(status))); + } + } + + if (child_failed) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("%d of %d children exited abnormally", + child_failed, num_sharings); } + + verify_inodes(num_sharings); + shmctl(shmid, IPC_RMID, NULL); shmdt(shm); PASS(); @@ -282,31 +359,9 @@ int main(int argc, char *argv[], char *e * arg1 = num_sharings * arg2 = index + 1 into shared memory array */ - ino_t i1; - int index; - - num_sharings = atoi(argv[1]); - index = atoi(argv[2]); - - get_link_string(argv[0]); - - shmid = shmget(SHM_KEY, num_sharings * NUM_TESTS * - sizeof(ino_t), 0666); - if (shmid < 0) - FAIL("Child's shmget failed: %s", strerror(errno)); - - shm = shmat(shmid, NULL, 0); - if (shm == (void *)-1) - FAIL("shmat failed: %s", strerror(errno)); - - for (i = 0; i < NUM_TESTS; i++) { - i1 = do_test(testtab + i); - if (((int)i1) < 0) - shmdt(shm); - shm[index * NUM_TESTS + i] = i1; - } - shmdt(shm); - exit(0); + child_process(argv[0], atoi(argv[1]), atoi(argv[2])); } else FAIL("Incorrect arguments\n"); + + return 0; } diff --git a/tests/testutils.c b/tests/testutils.c index 11316b6..b39aa75 100644 --- a/tests/testutils.c +++ b/tests/testutils.c @@ -171,15 +171,20 @@ ino_t get_addr_inode(void *p) return -1; } - /* looks like a filename? */ + /* Don't care about non-filenames */ if (name[0] != '/') return 0; /* Truncate the filename portion */ ret = stat(name, &sb); - if (ret < 0) + if (ret < 0) { + /* Don't care about unlinked files */ + if (errno == ENOENT) + return 0; + ERROR("stat failed: %s\n", strerror(errno)); return -1; + } return sb.st_ino; } -- Nishanth Aravamudan <[EMAIL PROTECTED]> IBM Linux Technology Center ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys - and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Libhugetlbfs-devel mailing list Libhugetlbfs-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel