Bug#528818: fdm log for debian bug 528818
Okay, I have committed some parts of the diff and made a couple of other tweaks which were sometimes causing it to hang. Please test the diff below instead. Index: fdm.c === RCS file: /cvsroot/fdm/fdm/fdm.c,v retrieving revision 1.183 diff -u -p -r1.183 fdm.c --- fdm.c 25 May 2009 21:47:23 - 1.183 +++ fdm.c 26 May 2009 06:08:03 - @@ -44,9 +44,11 @@ const char *malloc_options = AFGJPRX; voidsighandler(int); struct child *check_children(struct children *, u_int *); +int wait_children(struct children *, struct children *); struct conf conf; +volatile sig_atomic_t sigchld; volatile sig_atomic_t sigusr1; volatile sig_atomic_t sigint; volatile sig_atomic_t sigterm; @@ -67,6 +69,9 @@ sighandler(int sig) case SIGTERM: sigterm = 1; break; + case SIGCHLD: + sigchld = 1; + break; } } @@ -181,6 +186,79 @@ check_children(struct children *children return (NULL); } +/* Wait for a child and deal with its exit. */ +int +wait_children(struct children *children, struct children *dead_children) +{ + struct child*child, *child2; + pid_tpid; + int status, retcode = 0; + u_inti, j; + + for (;;) { + log_debug3(parent: waiting for children); + /* Wait for a child. */ + switch (pid = waitpid(WAIT_ANY, status, WNOHANG)) { + case 0: + return (0); + case -1: + if (errno == ECHILD) + return (0); + fatal(waitpid failed); + } + + /* Handle the exit status. */ + if (WIFSIGNALED(status)) { + retcode = 1; + log_debug2(parent: child %ld got signal %d, + (long) pid, WTERMSIG(status)); + } else if (!WIFEXITED(status)) { + retcode = 1; + log_debug2(parent: child %ld exited badly, + (long) pid); + } else { + if (WEXITSTATUS(status) != 0) + retcode = 1; + log_debug2(parent: child %ld returned %d, + (long) pid, WEXITSTATUS(status)); + } + + /* Find this child. */ + child = NULL; + for (i = 0; i ARRAY_LENGTH(children); i++) { + child = ARRAY_ITEM(children, i); + if (pid == child-pid) + break; + } + if (i == ARRAY_LENGTH(children)) { + log_debug2(parent: unidentified child %ld, + (long) pid); + continue; + } + + if (child-io != NULL) { + io_close(child-io); + io_free(child-io); + child-io = NULL; + } + ARRAY_REMOVE(children, i); + ARRAY_ADD(dead_children, child); + + /* If this child was the parent of any others, kill them too. */ + for (j = 0; j ARRAY_LENGTH(children); j++) { + child2 = ARRAY_ITEM(children, j); + if (child2-parent != child) + continue; + + log_debug2(parent: child %ld died: killing %ld, + (long) child-pid, (long) child2-pid); + kill(child2-pid, SIGTERM); + } + } + + return (retcode); +} + __dead void usage(void) { @@ -198,7 +276,6 @@ main(int argc, char **argv) enum fdmop op = FDMOP_NONE; const char *proxy = NULL, *s; char tmp[BUFSIZ], *ptr, *lock = NULL, *user, *home = NULL; - long n; struct utsname un; struct passwd *pw; struct stat sb; @@ -207,8 +284,8 @@ main(int argc, char **argv) TAILQ_HEAD(, account) actaq; /* active accounts */ pid_tpid; struct children children, dead_children; - struct child*child, *child2; - struct io *rio; + struct child*child; + struct io *dead_io; struct iolistiol; double tim; struct sigaction act; @@ -218,6 +295,7 @@ main(int argc, char **argv) struct strings macros; struct child_fetch_data *cfd; struct userdata *ud; + sigset_t sigset; #ifdef DEBUG struct rule *r; struct action *t; @@ -676,21 +754,29 @@
Bug#528818: fdm log for debian bug 528818
On Tue, May 26, 2009 at 07:10:05AM +0100, Nicholas Marriott wrote: Okay, I have committed some parts of the diff and made a couple of other tweaks which were sometimes causing it to hang. Please test the diff below instead. This is better, please try this: Index: fdm.c === RCS file: /cvsroot/fdm/fdm/fdm.c,v retrieving revision 1.183 diff -u -p -r1.183 fdm.c --- fdm.c 25 May 2009 21:47:23 - 1.183 +++ fdm.c 26 May 2009 06:23:30 - @@ -44,9 +44,12 @@ const char *malloc_options = AFGJPRX; voidsighandler(int); struct child *check_children(struct children *, u_int *); +int wait_children( +struct children *, struct children *, int); struct conf conf; +volatile sig_atomic_t sigchld; volatile sig_atomic_t sigusr1; volatile sig_atomic_t sigint; volatile sig_atomic_t sigterm; @@ -67,6 +70,9 @@ sighandler(int sig) case SIGTERM: sigterm = 1; break; + case SIGCHLD: + sigchld = 1; + break; } } @@ -181,6 +187,81 @@ check_children(struct children *children return (NULL); } +/* Wait for a child and deal with its exit. */ +int +wait_children( +struct children *children, struct children *dead_children, int no_hang) +{ + struct child*child, *child2; + pid_tpid; + int status, flags, retcode = 0; + u_inti, j; + + flags = no_hang ? WNOHANG : 0; + for (;;) { + log_debug3(parent: waiting for children); + /* Wait for a child. */ + switch (pid = waitpid(WAIT_ANY, status, flags)) { + case 0: + return (0); + case -1: + if (errno == ECHILD) + return (0); + fatal(waitpid failed); + } + + /* Handle the exit status. */ + if (WIFSIGNALED(status)) { + retcode = 1; + log_debug2(parent: child %ld got signal %d, + (long) pid, WTERMSIG(status)); + } else if (!WIFEXITED(status)) { + retcode = 1; + log_debug2(parent: child %ld exited badly, + (long) pid); + } else { + if (WEXITSTATUS(status) != 0) + retcode = 1; + log_debug2(parent: child %ld returned %d, + (long) pid, WEXITSTATUS(status)); + } + + /* Find this child. */ + child = NULL; + for (i = 0; i ARRAY_LENGTH(children); i++) { + child = ARRAY_ITEM(children, i); + if (pid == child-pid) + break; + } + if (i == ARRAY_LENGTH(children)) { + log_debug2(parent: unidentified child %ld, + (long) pid); + continue; + } + + if (child-io != NULL) { + io_close(child-io); + io_free(child-io); + child-io = NULL; + } + ARRAY_REMOVE(children, i); + ARRAY_ADD(dead_children, child); + + /* If this child was the parent of any others, kill them too. */ + for (j = 0; j ARRAY_LENGTH(children); j++) { + child2 = ARRAY_ITEM(children, j); + if (child2-parent != child) + continue; + + log_debug2(parent: child %ld died: killing %ld, + (long) child-pid, (long) child2-pid); + kill(child2-pid, SIGTERM); + } + } + + return (retcode); +} + __dead void usage(void) { @@ -198,7 +279,6 @@ main(int argc, char **argv) enum fdmop op = FDMOP_NONE; const char *proxy = NULL, *s; char tmp[BUFSIZ], *ptr, *lock = NULL, *user, *home = NULL; - long n; struct utsname un; struct passwd *pw; struct stat sb; @@ -207,8 +287,8 @@ main(int argc, char **argv) TAILQ_HEAD(, account) actaq; /* active accounts */ pid_tpid; struct children children, dead_children; - struct child*child, *child2; - struct io *rio; + struct child*child; + struct io *dead_io; struct iolistiol; double tim; struct sigaction act; @@ -676,21 +756,29 @@ main(int argc, char **argv)
Bug#528818: fdm log for debian bug 528818
On Tuesday 26 May 2009 11:54:26 Nicholas Marriott wrote: Okay, I have committed some parts of the diff and made a couple of other tweaks which were sometimes causing it to hang. Please test the diff below instead. This is better, please try this: I've patched and built the test package. It is currently running. Will update your the results once it triggers. PS: I removed the cat related rules from my fdm.conf file. Ritesh -- Ritesh Raj Sarraf RESEARCHUT - http://www.researchut.com Necessity is the mother of invention. signature.asc Description: This is a digitally signed message part.
Bug#528818: fdm log for debian bug 528818
On Monday 25 May 2009 11:16:26 Ritesh Raj Sarraf wrote: On Sunday 24 May 2009 20:31:18 Nicholas Marriott wrote: This diff makes several changes which hopefully should make it a bit more robust and correctly handle the various child processes on exit. Please test. Hi Nicholas, Can you please send it in patch format ? That way I can easily apply it on the Debian package and test it. I copy/pasted your diff and it fails to apply. Looks like the CVS version is too different from the latest one in Debian. Will try to build the cvs version. Ritesh -- Ritesh Raj Sarraf RESEARCHUT - http://www.researchut.com Necessity is the mother of invention. signature.asc Description: This is a digitally signed message part.
Bug#528818: fdm log for debian bug 528818
Hi Nicholas, On Monday 25 May 2009 13:20:40 Ritesh Raj Sarraf wrote: I copy/pasted your diff and it fails to apply. Looks like the CVS version is too different from the latest one in Debian. Will try to build the cvs version. I'm re-running fdm with the patch you provided. Will provide you the logs once it is triggered again. Thanks, Ritesh -- Ritesh Raj Sarraf RESEARCHUT - http://www.researchut.com Necessity is the mother of invention. signature.asc Description: This is a digitally signed message part.
Bug#528818: fdm log for debian bug 528818
Hi Nicholas, On Monday 25 May 2009 14:07:47 Ritesh Raj Sarraf wrote: On Monday 25 May 2009 13:20:40 Ritesh Raj Sarraf wrote: I copy/pasted your diff and it fails to apply. Looks like the CVS version is too different from the latest one in Debian. Will try to build the cvs version. I'm re-running fdm with the patch you provided. Will provide you the logs once it is triggered again. Looks like the bug is found. But the stale lock was not present. fdm is dying while fetching mails. Looks like a child process creation problem. gmail-researchut: cat: out: = gmail-researchut: cat: out: gmail-researchut: cat: out: if [ $1 =3D status ] ; then gmail-researchut: cat: out: # Display a status report. gmail-researchut: cat: out: - log MSG Mounts: gmail-researchut: cat: out: + log STATUS Mounts: gmail-researchut: cat: out: mount | sed s/^/ / gmail-researchut: cat: out: -log MSG gmail-researchut: cat: out: - log MSG Drive power status: gmail-researchut: cat: out: +log STATUS gmail-researchut: cat: out: + log STATUS Drive power status: gmail-researchut: cat: out: for disk in $HD; do gmail-researchut: cat: out: if [ -r $disk ]; then gmail-researchut: cat: out: hdparm -C $disk 2/dev/null | sed s/^/ / gmail-researchut: cat: out: else gmail-researchut: cat: out: - log MSGCannot read $disk, permission denied - $0 needs to be run= gmail-researchut: cat: out: as root gmail-researchut: cat: out: + log STATUSCannot read $disk, permission denied - $0 needs to be = gmail-researchut: cat: out: run as root gmail-researchut: cat: out: fi gmail-researchut: cat: out: done gmail-researchut: cat: out: -log MSG gmail-researchut: cat: out: - log MSG (NOTE: drive settings affected by Laptop Mode cannot be retrie= gmail-researchut: cat: out: ved.) gmail-researchut: cat: out: +log STATUS gmail-researchut: cat: out: + log STATUS (NOTE: drive settings affected by Laptop Mode cannot be ret= gmail-researchut: cat: out: rieved.) gmail-researchut: cat: out: = gmail-researchut: cat: out: gmail-researchut: cat: out: -log MSG gmail-researchut: cat: out: - log MSG Readahead states: gmail-researchut: cat: out: +log STATUS gmail-researchut: cat: out: + log STATUS Readahead states: gmail-researchut: cat: out: cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do gmail-researchut: cat: out: # skip funny stuff gmail-researchut: cat: out: case $FST in = gmail-researchut: cat: out: gmail-researchut: cat: out: @@ -381,29 +381,29 @@ gmail-researchut: cat: out: esac gmail-researchut: cat: out: if [ -b $DEV ] ; then gmail-researchut: cat: out: if [ -r $DEV ] ; then gmail-researchut: cat: out: - log MSG$DEV: $((`blockdev --getra $DEV` / 2)) kB gmail-researchut: cat: out: + log STATUS$DEV: $((`blockdev --getra $DEV` / 2)) kB gmail-researchut: cat: out: else gmail-researchut:
Bug#528818: fdm log for debian bug 528818
Okay, this is the problem: gmail-researchut: spamprobe receive: io: poll: Connection timed out Your pipe process does not return in the timeout, so fdm aborts the fetch. Unfortunately this then causes various things which sometimes end up with fdm not exiting correctly. fdm's management of child processes isn't always very good, particularly when an error occurs - sometimes stuff doesn't die in the right order and it ends up with fatal errors. The whole multiprocess idea is to allow fdm to run lots of things in parallel and to drop privileges when running as root, but I'm not sure it is worth it now, I should really simplify it a lot and probably forget about using it as root. You can see similar problems with a configuration file such as: set timeout 2 account 'stdin' stdin match exec 'sleep 10' returns (0, ) action drop And then eg echo|fdm - f This diff makes several changes which hopefully should make it a bit more robust and correctly handle the various child processes on exit. Please test. Index: child-deliver.c === RCS file: /cvsroot/fdm/fdm/child-deliver.c,v retrieving revision 1.21 diff -u -p -r1.21 child-deliver.c --- child-deliver.c 17 May 2009 19:20:08 - 1.21 +++ child-deliver.c 24 May 2009 14:56:51 - @@ -61,10 +61,10 @@ child_deliver(struct child *child, struc if (privsep_send(pio, msg, msgbuf) != 0) fatalx(privsep_send error); - if (privsep_recv(pio, msg, NULL) != 0) - fatalx(privsep_recv error); - if (msg.type != MSG_EXIT) - fatalx(unexpected message); + do { + if (privsep_recv(pio, msg, NULL) != 0) + fatalx(privsep_recv error); + } while (msg.type != MSG_EXIT); #ifdef DEBUG COUNTFDS(a-name); Index: child-fetch.c === RCS file: /cvsroot/fdm/fdm/child-fetch.c,v retrieving revision 1.72 diff -u -p -r1.72 child-fetch.c --- child-fetch.c 17 May 2009 18:23:45 - 1.72 +++ child-fetch.c 24 May 2009 14:56:51 - @@ -127,11 +127,11 @@ child_fetch(struct child *child, struct log_debug3(%s: sending exit message to parent, a-name); if (privsep_send(pio, msg, NULL) != 0) fatalx(privsep_send error); - log_debug3(%s: waiting for exit message from parent, a-name); - if (privsep_recv(pio, msg, NULL) != 0) - fatalx(privsep_recv error); - if (msg.type != MSG_EXIT) - fatalx(unexpected message); + do { + log_debug3(%s: waiting for exit message from parent, a-name); + if (privsep_recv(pio, msg, NULL) != 0) + fatalx(privsep_recv error); + } while (msg.type != MSG_EXIT); #ifdef DEBUG COUNTFDS(a-name); Index: child.c === RCS file: /cvsroot/fdm/fdm/child.c,v retrieving revision 1.147 diff -u -p -r1.147 child.c --- child.c 17 May 2009 19:20:08 - 1.147 +++ child.c 24 May 2009 14:56:51 - @@ -18,6 +18,7 @@ #include sys/types.h #include sys/socket.h +#include sys/wait.h #include unistd.h @@ -35,6 +36,9 @@ child_sighandler(int sig) case SIGUSR1: sigusr1 = 1; break; + case SIGCHLD: + waitpid(WAIT_ANY, NULL, WNOHANG); + break; case SIGTERM: cleanup_purge(); _exit(1); @@ -60,6 +64,7 @@ child_fork(void) sigaddset(act.sa_mask, SIGUSR1); sigaddset(act.sa_mask, SIGINT); sigaddset(act.sa_mask, SIGTERM); + sigaddset(act.sa_mask, SIGCHLD); act.sa_flags = SA_RESTART; act.sa_handler = SIG_IGN; @@ -75,6 +80,8 @@ child_fork(void) fatal(sigaction failed); if (sigaction(SIGTERM, act, NULL) 0) fatal(sigaction failed); + if (sigaction(SIGCHLD, act, NULL) 0) + fatal(sigaction failed); return (0); default: Index: command.c === RCS file: /cvsroot/fdm/fdm/command.c,v retrieving revision 1.54 diff -u -p -r1.54 command.c --- command.c 17 May 2009 18:23:45 - 1.54 +++ command.c 24 May 2009 14:56:51 - @@ -132,6 +132,8 @@ cmd_start(const char *s, int flags, cons fatal(signal failed); if (signal(SIGUSR2, SIG_DFL) == SIG_ERR) fatal(signal failed); +if (signal(SIGCHLD, SIG_DFL) == SIG_ERR) + fatal(signal failed); execl(_PATH_BSHELL, sh, -c, s, (char *) NULL); fatal(execl failed); @@ -271,10 +273,14 @@ cmd_poll(struct cmd *cmd, char
Bug#528818: fdm log for debian bug 528818
On Sunday 24 May 2009 20:31:18 Nicholas Marriott wrote: This diff makes several changes which hopefully should make it a bit more robust and correctly handle the various child processes on exit. Please test. Hi Nicholas, Can you please send it in patch format ? That way I can easily apply it on the Debian package and test it. Ritesh -- Ritesh Raj Sarraf RESEARCHUT - http://www.researchut.com Necessity is the mother of invention. signature.asc Description: This is a digitally signed message part.