http://linux.derkeiler.com/Mailing-Lists/Kernel/2008-01/msg01289.html Re: Improve hackbench
Ingo Molnar wrote: * Zhang, Yanmin <[EMAIL PROTECTED]> wrote: On x86-64 there's a bug [*], that causes hackbench to segfault when compiled with optimizations: in reap_worker(): int status; ... pthread_join(id, (void **)(void *)&status); That is not correct, sizeof(void*) > sizeof(int) on x86-64. Something gets overwritten on the stack, I tried with gcc -fstack-protector, but it doesn't detect it !? After applying the patch, it no longer segfaults. This patch fixes it: --- hackbench.c 2008-01-04 10:08:26.000000000 +0200 +++ ../hackbench.c 2008-01-04 13:45:22.000000000 +0200 @@ -241,8 +241,10 @@ wait(&status); if (!WIFEXITED(status)) exit(1); - } else - pthread_join(id, (void **)(void *)&status); + } else { + void* status; + pthread_join(id, (void **)&status); + } } /* One group of senders and receivers */ ---------------- I also notice that the thread version is slower, than process version: $ ./hackbench 5 thread Running with 5*40 (== 200) tasks. Time: 0.413 $ ./hackbench 5 thread Running with 5*40 (== 200) tasks. Time: 0.423 $ ./hackbench 5 thread 20 Running with 5*40 (== 200) tasks. Time: 0.093 $ ./hackbench 5 thread 200 Running with 5*40 (== 200) tasks. Time: 0.827 $ ./hackbench 5 thread 2000 Running with 5*40 (== 200) tasks. Time: 8.409 $ ./hackbench 5 process 2000 Running with 5*40 (== 200) tasks. Time: 7.669 $ ./hackbench -pipe 5 process 2000 Running with 5*40 (== 200) tasks. Time: 3.416 $ ./hackbench -pipe 5 thread 2000 Running with 5*40 (== 200) tasks. Time: 4.320 [*] $ uname -a Linux lightspeed2 2.6.24-rc6-ge697789d #3 Wed Jan 2 11:15:05 EET 2008 x86_64 GNU/Linux $ gcc -v Using built-in specs. 
Target: x86_64-linux-gnu Configured with: ../src/configure -v --enable-languages=c,c++,fortran,objc,obj-c++,treelang --prefix=/usr --enable-shared --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --enable-nls --with-gxx-include-dir=/usr/include/c++/4.2 --program-suffix=-4.2 --enable-clocale=gnu --enable-libstdcxx-debug --enable-mpfr --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu Thread model: posix gcc version 4.2.3 20071123 (prerelease) (Debian 4.2.2-4) $ wget http://redhat.com/~mingo/cfs-scheduler/tools/hackbench.c --13:40:53-- http://redhat.com/~mingo/cfs-scheduler/tools/hackbench.c => `hackbench.c' Resolving redhat.com... 209.132.177.50 Connecting to redhat.com|209.132.177.50|:80... connected. HTTP request sent, awaiting response... 302 Found Location: http://www.redhat.com/~mingo/cfs-scheduler/tools/hackbench.c [following] --13:40:54-- http://www.redhat.com/~mingo/cfs-scheduler/tools/hackbench.c => `hackbench.c' Resolving www.redhat.com... 209.132.177.50 Connecting to www.redhat.com|209.132.177.50|:80... connected. HTTP request sent, awaiting response... 301 Moved Permanently Location: http://people.redhat.com/mingo/cfs-scheduler/tools/hackbench.c [following] --13:40:54-- http://people.redhat.com/mingo/cfs-scheduler/tools/hackbench.c => `hackbench.c' Resolving people.redhat.com... 66.187.233.237 Connecting to people.redhat.com|66.187.233.237|:80... connected. HTTP request sent, awaiting response... 200 OK Length: 8,455 (8.3K) [text/plain] 100%[====================================================================================================================>] 8,455 --.--K/s 13:40:55 (61.93 KB/s) - `hackbench.c' saved [8455/8455] $ gcc -O2 -g -Wall -o hackbench hackbench.c -lpthread hackbench.c:32:66: warning: missing terminating ' character $ ./hackbench 1 thread Running with 1*40 (== 40) tasks. 
Segmentation fault $ valgrind --trace-children=yes ./hackbench 1 thread ==27332== Memcheck, a memory error detector. ==27332== Copyright (C) 2002-2007, and GNU GPL'd, by Julian Seward et al. ==27332== Using LibVEX rev 1804, a library for dynamic binary translation. ==27332== Copyright (C) 2004-2007, and GNU GPL'd, by OpenWorks LLP. ==27332== Using valgrind-3.3.0-Debian, a dynamic binary instrumentation framework. ==27332== Copyright (C) 2000-2007, and GNU GPL'd, by Julian Seward et al. ==27332== For more details, rerun with: -v ==27332== Running with 1*40 (== 40) tasks. ==27332== Thread 2: ==27332== Syscall param write(buf) points to uninitialised byte(s) ==27332== at 0x4C1854B: (within /usr/lib/debug/libpthread-2.7.so) ==27332== by 0x400C34: ready (hackbench.c:138) ==27332== by 0x400C97: receiver (hackbench.c:182) ==27332== by 0x4C113F6: start_thread (pthread_create.c:297) ==27332== by 0x4EFD91C: clone (in /usr/lib/debug/libc-2.7.so) ==27332== Address 0x558a09f is on thread 2's stack ==27332== ==27332== Thread 22: ==27332== Syscall param write(buf) points to uninitialised byte(s) ==27332== at 0x4C1854B: (within /usr/lib/debug/libpthread-2.7.so) ==27332== by 0x400C34: ready (hackbench.c:138) ==27332== by 0x400D33: sender (hackbench.c:152) ==27332== by 0x4C113F6: start_thread (pthread_create.c:297) ==27332== by 0x4EFD91C: clone (in /usr/lib/debug/libc-2.7.so) ==27332== Address 0x55da07f is on thread 22's stack ==27332== ==27332== Thread 40: ==27332== Syscall param write(buf) points to uninitialised byte(s) ==27332== at 0x4C1854B: (within /usr/lib/debug/libpthread-2.7.so) ==27332== by 0x400D6E: sender (hackbench.c:160) ==27332== by 0x4C113F6: start_thread (pthread_create.c:297) ==27332== by 0x4EFD91C: clone (in /usr/lib/debug/libc-2.7.so) ==27332== Address 0x56220a0 is on thread 40's stack ==27332== ==27332== Thread 1: ==27332== Jump to the invalid address stated on the next line ==27332== at 0x0: ??? ==27332== by 0x518702F: ??? ==27332== by 0xFFFFFFFF: ??? 
==27332== by 0x518702F: ??? ==27332== by 0x2800000000: ??? ==27332== Address 0x0 is not stack'd, malloc'd or (recently) free'd ==27332== ==27332== Process terminating with default action of signal 11 (SIGSEGV) ==27332== Bad permissions for mapped region at address 0x0 ==27332== at 0x0: ??? ==27332== by 0x518702F: ??? ==27332== by 0xFFFFFFFF: ??? ==27332== by 0x518702F: ??? ==27332== by 0x2800000000: ??? ==27332== ==27332== ERROR SUMMARY: 40041 errors from 4 contexts (suppressed: 8 from 1) ==27332== malloc/free: in use at exit: 11,420 bytes in 61 blocks. ==27332== malloc/free: 62 allocs, 1 frees, 11,692 bytes allocated. ==27332== For counts of detected errors, rerun with: -v ==27332== searching for pointers to 61 not-freed blocks. ==27332== checked 560,688 bytes. ==27332== ==27332== LEAK SUMMARY: ==27332== definitely lost: 20 bytes in 1 blocks. ==27332== possibly lost: 10,608 bytes in 39 blocks. ==27332== still reachable: 792 bytes in 21 blocks. ==27332== suppressed: 0 bytes in 0 blocks. ==27332== Rerun with --leak-check=full to see details of leaked memory. Segmentation fault Best regards, /* * This is the latest version of hackbench.c, that tests scheduler and * unix-socket (or pipe) performance. * * Usage: hackbench [-pipe] <num groups> [process|thread] [loops] * * Build it with: * gcc -g -Wall -O2 -o hackbench hackbench.c -lpthread */ #if 0 Date: Fri, 04 Jan 2008 14:06:26 +0800 From: "Zhang, Yanmin" <[EMAIL PROTECTED]> To: LKML <[EMAIL PROTECTED]> Subject: Improve hackbench Cc: Ingo Molnar <[EMAIL PROTECTED]>, Arjan van de Ven <[EMAIL PROTECTED]> hackbench tests the Linux scheduler. The original program is at http://devresources.linux-foundation.org/craiger/hackbench/src/hackbench.c Based on this multi-process version, a nice person created a multi-thread version. Pls. see http://www.bullopensource.org/posix/pi-futex/hackbench_pth.c When I integrated them into my automation testing system, I found a couple of issues and did some improvements. 
1) Merge hackbench: I integrated hackbench_pth.c into hackbench and added a new parameter which can be used to choose process mode or thread mode. The default mode is process. 2) It runs too fast and ends in a couple of seconds. Sometimes it's too hard to debug the issues. On my ia64 Montecito machines, the result looks weird when comparing process mode and thread mode. I want a stable result and hope the test can run for a longer, stable period, so that I can use performance tools to debug issues. I added another new parameter, `loops`, which can be used to change variable loops, so more messages will be passed from writers to receivers. Parameter 'loops' is equal to 100 by default. For example on my 8-core x86_64: [EMAIL PROTECTED] hackbench]$ uname -a Linux lkp-st01-x8664 2.6.24-rc6 #1 SMP Fri Dec 21 08:32:31 CST 2007 x86_64 x86_64 x86_64 GNU/Linux [EMAIL PROTECTED] hackbench]$ ./hackbench Usage: hackbench [-pipe] <num groups> [process|thread] [loops] [EMAIL PROTECTED] hackbench]$ ./hackbench 150 process 1000 Time: 151.533 [EMAIL PROTECTED] hackbench]$ ./hackbench 150 thread 1000 Time: 153.666 With the same new parameters, I captured the SLUB issue discussed on LKML recently. 3) hackbench_pth.c will fail on ia64 machines because pthread_attr_setstacksize always fails if the stack size is less than 196*1024. I moved this statement within a __ia64__ check. This new program could be compiled with command line: #gcc -g -Wall -o hackbench hackbench.c -lpthread Thanks to Ingo for his great comments! -yanmin --- * Nathan Lynch <[EMAIL PROTECTED]> wrote: > Here's a fixlet for the hackbench program found at > > http://people.redhat.com/mingo/cfs-scheduler/tools/hackbench.c > > When redirecting hackbench output I am seeing multiple copies of the > "Running with %d*40 (== %d) tasks" line. Need to flush the buffered > output before forking.
#endif /* Test groups of 20 processes spraying to 20 receivers */ #include <pthread.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include <unistd.h> #include <sys/types.h> #include <sys/socket.h> #include <sys/wait.h> #include <sys/time.h> #include <sys/poll.h> #include <limits.h> #define DATASIZE 100 static unsigned int loops = 100; /* * 0 means thread mode and others mean process (default) */ static unsigned int process_mode = 1; static int use_pipes = 0; struct sender_context { unsigned int num_fds; int ready_out; int wakefd; int out_fds[0]; }; struct receiver_context { unsigned int num_packets; int in_fds[2]; int ready_out; int wakefd; }; static void barf(const char *msg) { fprintf(stderr, "%s (error: %s)\n", msg, strerror(errno)); exit(1); } static void print_usage_exit() { printf("Usage: hackbench [-pipe] <num groups> [process|thread] [loops]\n"); exit(1); } static void fdpair(int fds[2]) { if (use_pipes) { if (pipe(fds) == 0) return; } else { if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == 0) return; } barf("Creating fdpair"); } /* Block until we're ready to go */ static void ready(int ready_out, int wakefd) { char dummy; struct pollfd pollfd = { .fd = wakefd, .events = POLLIN }; /* Tell them we're ready. */ if (write(ready_out, &dummy, 1) != 1) barf("CLIENT: ready write"); /* Wait for "GO" signal */ if (poll(&pollfd, 1, -1) != 1) barf("poll"); } /* Sender sprays loops messages down each file descriptor */ static void *sender(struct sender_context *ctx) { char data[DATASIZE]; unsigned int i, j; ready(ctx->ready_out, ctx->wakefd); /* Now pump to every receiver. 
*/ for (i = 0; i < loops; i++) { for (j = 0; j < ctx->num_fds; j++) { int ret, done = 0; again: ret = write(ctx->out_fds[j], data + done, sizeof(data)-done); if (ret < 0) barf("SENDER: write"); done += ret; if (done < sizeof(data)) goto again; } } return NULL; } /* One receiver per fd */ static void *receiver(struct receiver_context* ctx) { unsigned int i; if (process_mode) close(ctx->in_fds[1]); /* Wait for start... */ ready(ctx->ready_out, ctx->wakefd); /* Receive them all */ for (i = 0; i < ctx->num_packets; i++) { char data[DATASIZE]; int ret, done = 0; again: ret = read(ctx->in_fds[0], data + done, DATASIZE - done); if (ret < 0) barf("SERVER: read"); done += ret; if (done < DATASIZE) goto again; } return NULL; } pthread_t create_worker(void *ctx, void *(*func)(void *)) { pthread_attr_t attr; pthread_t childid; int err; if (process_mode) { /* process mode */ /* Fork the receiver. */ switch (fork()) { case -1: barf("fork()"); case 0: (*func) (ctx); exit(0); } return (pthread_t) 0; } if (pthread_attr_init(&attr) != 0) barf("pthread_attr_init:"); #ifndef __ia64__ if (pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN) != 0) barf("pthread_attr_setstacksize"); #endif if ((err=pthread_create(&childid, &attr, func, ctx)) != 0) { fprintf(stderr, "pthread_create failed: %s (%d)\n", strerror(err), err); exit(-1); } return (childid); } void reap_worker(pthread_t id) { int status; if (process_mode) { /* process mode */ wait(&status); if (!WIFEXITED(status)) exit(1); } else { void *status; pthread_join(id, &status); } } /* One group of senders and receivers */ static unsigned int group(pthread_t *pth, unsigned int num_fds, int ready_out, int wakefd) { unsigned int i; struct sender_context* snd_ctx = malloc (sizeof(struct sender_context) +num_fds*sizeof(int)); for (i = 0; i < num_fds; i++) { int fds[2]; struct receiver_context* ctx = malloc (sizeof(*ctx)); if (!ctx) barf("malloc()"); /* Create the pipe between client and server */ fdpair(fds); ctx->num_packets = 
num_fds*loops; ctx->in_fds[0] = fds[0]; ctx->in_fds[1] = fds[1]; ctx->ready_out = ready_out; ctx->wakefd = wakefd; pth[i] = create_worker(ctx, (void *)(void *)receiver); snd_ctx->out_fds[i] = fds[1]; if (process_mode) close(fds[0]); } /* Now we have all the fds, fork the senders */ for (i = 0; i < num_fds; i++) { snd_ctx->ready_out = ready_out; snd_ctx->wakefd = wakefd; snd_ctx->num_fds = num_fds; pth[num_fds+i] = create_worker(snd_ctx, (void *)(void *)sender); } /* Close the fds we have left */ if (process_mode) for (i = 0; i < num_fds; i++) close(snd_ctx->out_fds[i]); /* Return number of children to reap */ return num_fds * 2; } int main(int argc, char *argv[]) { unsigned int i, num_groups = 10, total_children; struct timeval start, stop, diff; unsigned int num_fds = 20; int readyfds[2], wakefds[2]; char dummy; pthread_t *pth_tab; if (argv[1] && strcmp(argv[1], "-pipe") == 0) { use_pipes = 1; argc--; argv++; } if (argc >= 2 && (num_groups = atoi(argv[1])) == 0) print_usage_exit(); printf("Running with %d*40 (== %d) tasks.\n", num_groups, num_groups*40); fflush(NULL); if (argc > 2) { if ( !strcmp(argv[2], "process") ) process_mode = 1; else if ( !strcmp(argv[2], "thread") ) process_mode = 0; else print_usage_exit(); } if (argc > 3) loops = atoi(argv[3]); pth_tab = malloc(num_fds * 2 * num_groups * sizeof(pthread_t)); if (!pth_tab) barf("main:malloc()"); fdpair(readyfds); fdpair(wakefds); total_children = 0; for (i = 0; i < num_groups; i++) total_children += group(pth_tab+total_children, num_fds, readyfds[1], wakefds[0]); /* Wait for everyone to be ready */ for (i = 0; i < total_children; i++) if (read(readyfds[0], &dummy, 1) != 1) barf("Reading for readyfds"); gettimeofday(&start, NULL); /* Kick them off */ if (write(wakefds[1], &dummy, 1) != 1) barf("Writing to start them"); /* Reap them all */ for (i = 0; i < total_children; i++) reap_worker(pth_tab[i]); gettimeofday(&stop, NULL); /* Print time... 
*/ timersub(&stop, &start, &diff); printf("Time: %lu.%03lu\n", diff.tv_sec, diff.tv_usec/1000); exit(0); } |
