Bug#528818: fdm log for debian bug 528818

2009-05-26 Thread Nicholas Marriott
Okay, I have committed some parts of the diff and made a couple of other tweaks
which were sometimes causing it to hang. Please test the diff below instead.


Index: fdm.c
===
RCS file: /cvsroot/fdm/fdm/fdm.c,v
retrieving revision 1.183
diff -u -p -r1.183 fdm.c
--- fdm.c   25 May 2009 21:47:23 -  1.183
+++ fdm.c   26 May 2009 06:08:03 -
@@ -44,9 +44,11 @@ const char   *malloc_options = AFGJPRX;
 
 voidsighandler(int);
 struct child   *check_children(struct children *, u_int *);
+int wait_children(struct children *, struct children *);
 
 struct conf conf;
 
+volatile sig_atomic_t   sigchld;
 volatile sig_atomic_t   sigusr1;
 volatile sig_atomic_t   sigint;
 volatile sig_atomic_t   sigterm;
@@ -67,6 +69,9 @@ sighandler(int sig)
case SIGTERM:
sigterm = 1;
break;
+   case SIGCHLD:
+   sigchld = 1;
+   break;
}
 }
 
@@ -181,6 +186,79 @@ check_children(struct children *children
return (NULL);
 }
 
+/* Wait for a child and deal with its exit. */
+int
+wait_children(struct children *children, struct children *dead_children)
+{
+   struct child*child, *child2;
+   pid_tpid;
+   int  status, retcode = 0;
+   u_inti, j;
+
+   for (;;) {
+   log_debug3(parent: waiting for children);
+   /* Wait for a child. */
+   switch (pid = waitpid(WAIT_ANY, status, WNOHANG)) {
+   case 0:
+   return (0);
+   case -1:
+   if (errno == ECHILD)
+   return (0);
+   fatal(waitpid failed);
+   }
+
+   /* Handle the exit status. */
+   if (WIFSIGNALED(status)) {
+   retcode = 1;
+   log_debug2(parent: child %ld got signal %d,
+   (long) pid, WTERMSIG(status));
+   } else if (!WIFEXITED(status)) {
+   retcode = 1;
+   log_debug2(parent: child %ld exited badly,
+   (long) pid);
+   } else {
+   if (WEXITSTATUS(status) != 0)
+   retcode = 1;
+   log_debug2(parent: child %ld returned %d,
+   (long) pid, WEXITSTATUS(status));
+   }
+   
+   /* Find this child. */
+   child = NULL;
+   for (i = 0; i  ARRAY_LENGTH(children); i++) {
+   child = ARRAY_ITEM(children, i);
+   if (pid == child-pid)
+   break;
+   }
+   if (i == ARRAY_LENGTH(children)) {
+   log_debug2(parent: unidentified child %ld,
+   (long) pid);
+   continue;
+   }
+
+   if (child-io != NULL) {
+   io_close(child-io);
+   io_free(child-io);
+   child-io = NULL;
+   }
+   ARRAY_REMOVE(children, i);
+   ARRAY_ADD(dead_children, child);
+   
+   /* If this child was the parent of any others, kill them too. */
+   for (j = 0; j  ARRAY_LENGTH(children); j++) {
+   child2 = ARRAY_ITEM(children, j);
+   if (child2-parent != child)
+   continue;
+   
+   log_debug2(parent: child %ld died: killing %ld,
+   (long) child-pid, (long) child2-pid);
+   kill(child2-pid, SIGTERM);
+   }
+   }
+
+   return (retcode);
+}
+
 __dead void
 usage(void)
 {
@@ -198,7 +276,6 @@ main(int argc, char **argv)
enum fdmop   op = FDMOP_NONE;
const char  *proxy = NULL, *s;
char tmp[BUFSIZ], *ptr, *lock = NULL, *user, *home = NULL;
-   long n;
struct utsname   un;
struct passwd   *pw;
struct stat  sb;
@@ -207,8 +284,8 @@ main(int argc, char **argv)
TAILQ_HEAD(, account) actaq; /* active accounts */
pid_tpid;
struct children  children, dead_children;
-   struct child*child, *child2;
-   struct io   *rio;
+   struct child*child;
+   struct io   *dead_io;
struct iolistiol;
double   tim;
struct sigaction act;
@@ -218,6 +295,7 @@ main(int argc, char **argv)
struct strings   macros;
struct child_fetch_data *cfd;
struct userdata *ud;
+   sigset_t sigset;
 #ifdef DEBUG
struct rule *r;
struct action   *t;
@@ -676,21 +754,29 @@ 

Bug#528818: fdm log for debian bug 528818

2009-05-26 Thread Nicholas Marriott
On Tue, May 26, 2009 at 07:10:05AM +0100, Nicholas Marriott wrote:
 Okay, I have committed some parts of the diff and made a couple of other 
 tweaks
 which were sometimes causing it to hang. Please test the diff below instead.

This is better, please try this:

Index: fdm.c
===
RCS file: /cvsroot/fdm/fdm/fdm.c,v
retrieving revision 1.183
diff -u -p -r1.183 fdm.c
--- fdm.c   25 May 2009 21:47:23 -  1.183
+++ fdm.c   26 May 2009 06:23:30 -
@@ -44,9 +44,12 @@ const char   *malloc_options = AFGJPRX;
 
 voidsighandler(int);
 struct child   *check_children(struct children *, u_int *);
+int wait_children(
+struct children *, struct children *, int);
 
 struct conf conf;
 
+volatile sig_atomic_t   sigchld;
 volatile sig_atomic_t   sigusr1;
 volatile sig_atomic_t   sigint;
 volatile sig_atomic_t   sigterm;
@@ -67,6 +70,9 @@ sighandler(int sig)
case SIGTERM:
sigterm = 1;
break;
+   case SIGCHLD:
+   sigchld = 1;
+   break;
}
 }
 
@@ -181,6 +187,81 @@ check_children(struct children *children
return (NULL);
 }
 
+/* Wait for a child and deal with its exit. */
+int
+wait_children(
+struct children *children, struct children *dead_children, int no_hang)
+{
+   struct child*child, *child2;
+   pid_tpid;
+   int  status, flags, retcode = 0;
+   u_inti, j;
+
+   flags = no_hang ? WNOHANG : 0;
+   for (;;) {
+   log_debug3(parent: waiting for children);
+   /* Wait for a child. */
+   switch (pid = waitpid(WAIT_ANY, status, flags)) {
+   case 0:
+   return (0);
+   case -1:
+   if (errno == ECHILD)
+   return (0);
+   fatal(waitpid failed);
+   }
+
+   /* Handle the exit status. */
+   if (WIFSIGNALED(status)) {
+   retcode = 1;
+   log_debug2(parent: child %ld got signal %d,
+   (long) pid, WTERMSIG(status));
+   } else if (!WIFEXITED(status)) {
+   retcode = 1;
+   log_debug2(parent: child %ld exited badly,
+   (long) pid);
+   } else {
+   if (WEXITSTATUS(status) != 0)
+   retcode = 1;
+   log_debug2(parent: child %ld returned %d,
+   (long) pid, WEXITSTATUS(status));
+   }
+   
+   /* Find this child. */
+   child = NULL;
+   for (i = 0; i  ARRAY_LENGTH(children); i++) {
+   child = ARRAY_ITEM(children, i);
+   if (pid == child-pid)
+   break;
+   }
+   if (i == ARRAY_LENGTH(children)) {
+   log_debug2(parent: unidentified child %ld,
+   (long) pid);
+   continue;
+   }
+
+   if (child-io != NULL) {
+   io_close(child-io);
+   io_free(child-io);
+   child-io = NULL;
+   }
+   ARRAY_REMOVE(children, i);
+   ARRAY_ADD(dead_children, child);
+   
+   /* If this child was the parent of any others, kill them too. */
+   for (j = 0; j  ARRAY_LENGTH(children); j++) {
+   child2 = ARRAY_ITEM(children, j);
+   if (child2-parent != child)
+   continue;
+   
+   log_debug2(parent: child %ld died: killing %ld,
+   (long) child-pid, (long) child2-pid);
+   kill(child2-pid, SIGTERM);
+   }
+   }
+
+   return (retcode);
+}
+
 __dead void
 usage(void)
 {
@@ -198,7 +279,6 @@ main(int argc, char **argv)
enum fdmop   op = FDMOP_NONE;
const char  *proxy = NULL, *s;
char tmp[BUFSIZ], *ptr, *lock = NULL, *user, *home = NULL;
-   long n;
struct utsname   un;
struct passwd   *pw;
struct stat  sb;
@@ -207,8 +287,8 @@ main(int argc, char **argv)
TAILQ_HEAD(, account) actaq; /* active accounts */
pid_tpid;
struct children  children, dead_children;
-   struct child*child, *child2;
-   struct io   *rio;
+   struct child*child;
+   struct io   *dead_io;
struct iolistiol;
double   tim;
struct sigaction act;
@@ -676,21 +756,29 @@ main(int argc, char **argv)
  

Bug#528818: fdm log for debian bug 528818

2009-05-26 Thread Ritesh Raj Sarraf
On Tuesday 26 May 2009 11:54:26 Nicholas Marriott wrote:
  Okay, I have committed some parts of the diff and made a couple of other
  tweaks which were sometimes causing it to hang. Please test the diff
  below instead.

 This is better, please try this:

I've patched and built the test package. It is currently running. Will update 
your the results once it triggers.


PS: I removed the cat related rules from my fdm.conf file.


Ritesh
-- 
Ritesh Raj Sarraf
RESEARCHUT - http://www.researchut.com
Necessity is the mother of invention.



signature.asc
Description: This is a digitally signed message part.


Bug#528818: fdm log for debian bug 528818

2009-05-25 Thread Ritesh Raj Sarraf
On Monday 25 May 2009 11:16:26 Ritesh Raj Sarraf wrote:
 On Sunday 24 May 2009 20:31:18 Nicholas Marriott wrote:
  This diff makes several changes which hopefully should make it a bit more
  robust and correctly handle the various child processes on exit.
 
  Please test.

 Hi Nicholas,

 Can you please send it in patch format ?
 That way I can easily apply it on the Debian package and test it.


I copy/pasted your diff and it fails to apply. Looks like the CVS version is 
too different from the latest one in Debian.
Will try to build the cvs version.

Ritesh
-- 
Ritesh Raj Sarraf
RESEARCHUT - http://www.researchut.com
Necessity is the mother of invention.



signature.asc
Description: This is a digitally signed message part.


Bug#528818: fdm log for debian bug 528818

2009-05-25 Thread Ritesh Raj Sarraf
Hi Nicholas,

On Monday 25 May 2009 13:20:40 Ritesh Raj Sarraf wrote:
 I copy/pasted your diff and it fails to apply. Looks like the CVS version
 is too different from the latest one in Debian.
 Will try to build the cvs version.

I'm re-running fdm with the patch you provided. Will provide you the logs once 
it is triggered again.

Thanks,
Ritesh
-- 
Ritesh Raj Sarraf
RESEARCHUT - http://www.researchut.com
Necessity is the mother of invention.



signature.asc
Description: This is a digitally signed message part.


Bug#528818: fdm log for debian bug 528818

2009-05-25 Thread Ritesh Raj Sarraf
Hi Nicholas,

On Monday 25 May 2009 14:07:47 Ritesh Raj Sarraf wrote:
 On Monday 25 May 2009 13:20:40 Ritesh Raj Sarraf wrote:
  I copy/pasted your diff and it fails to apply. Looks like the CVS version
  is too different from the latest one in Debian.
  Will try to build the cvs version.

 I'm re-running fdm with the patch you provided. Will provide you the logs
 once it is triggered again.

Looks like the bug is found. But the stale lock was not present. fdm is dying 
while fetching mails. Looks like a child process creation problem.


gmail-researchut: cat: out:  =  
 
gmail-researchut: cat: out: 
 
gmail-researchut: cat: out:  if [ $1 =3D status ] ; then
 
gmail-researchut: cat: out: # Display a status report.  
 
gmail-researchut: cat: out: -   log MSG Mounts: 
 
gmail-researchut: cat: out: +   log STATUS Mounts:  
 
gmail-researchut: cat: out: mount | sed s/^/   /  
 
gmail-researchut: cat: out: -log MSG
 
gmail-researchut: cat: out: -   log MSG Drive power status: 
 
gmail-researchut: cat: out: +log STATUS 
 
gmail-researchut: cat: out: +   log STATUS Drive power status:  
 
gmail-researchut: cat: out: for disk in $HD; do 
 
gmail-researchut: cat: out: if [ -r $disk ]; then   
 
gmail-researchut: cat: out: hdparm -C $disk 2/dev/null | 
sed s/^/   /  

gmail-researchut: cat: out: else
 
gmail-researchut: cat: out: -   log MSGCannot read 
$disk, permission denied - $0 needs to be run=  

gmail-researchut: cat: out:  as root   
 
gmail-researchut: cat: out: +   log STATUSCannot read 
$disk, permission denied - $0 needs to be = 
 
gmail-researchut: cat: out: run as root
 
gmail-researchut: cat: out: fi  
 
gmail-researchut: cat: out: done
 
gmail-researchut: cat: out: -log MSG
 
gmail-researchut: cat: out: -   log MSG (NOTE: drive settings affected by 
Laptop Mode cannot be retrie=   
 
gmail-researchut: cat: out: ved.)  
 
gmail-researchut: cat: out: +log STATUS 
 
gmail-researchut: cat: out: +   log STATUS (NOTE: drive settings affected by 
Laptop Mode cannot be ret=  
  
gmail-researchut: cat: out: rieved.)   
 
gmail-researchut: cat: out:  =  
 
gmail-researchut: cat: out: 
 
gmail-researchut: cat: out: -log MSG
 
gmail-researchut: cat: out: -   log MSG Readahead states:   
 
gmail-researchut: cat: out: +log STATUS 
 
gmail-researchut: cat: out: +   log STATUS Readahead states:
 
gmail-researchut: cat: out: cat /etc/mtab | while read DEV MP FST OPTS 
DUMP PASS ; do  
   
gmail-researchut: cat: out: # skip funny stuff  
 
gmail-researchut: cat: out: case $FST in =
 
gmail-researchut: cat: out: 
 
gmail-researchut: cat: out: @@ -381,29 +381,29 @@   
 
gmail-researchut: cat: out: esac
 
gmail-researchut: cat: out: if [ -b $DEV ] ; then   
 
gmail-researchut: cat: out: if [ -r $DEV ] ; then   
 
gmail-researchut: cat: out: -   log MSG$DEV: 
$((`blockdev --getra $DEV` / 2)) kB
  
gmail-researchut: cat: out: +   log STATUS$DEV: 
$((`blockdev --getra $DEV` / 2)) kB
   
gmail-researchut: cat: out: else
 
gmail-researchut: 

Bug#528818: fdm log for debian bug 528818

2009-05-24 Thread Nicholas Marriott
Okay, this is the problem:

gmail-researchut: spamprobe receive: io: poll: Connection timed out

Your pipe process does not return in the timeout, so fdm aborts the
fetch. Unfortunately this then causes various things which sometimes end up
with fdm not exiting correctly.

fdm's management of child processes isn't always very good, particularly when
an error occurs - sometimes stuff doesn't die in the right order and it ends up
with fatal errors. The whole multiprocess idea is to allow fdm to run lots of
things in parallel and to drop privileges when running as root, but I'm not
sure it is worth it now, I should really simplify it a lot and probably forget
about using it as root.

You can see similar problems with a configuration file such as:

set timeout 2
account 'stdin' stdin
match exec 'sleep 10' returns (0, ) action drop

And then eg echo|fdm - f

This diff makes several changes which hopefully should make it a bit more
robust and correctly handle the various child processes on exit.

Please test.

Index: child-deliver.c
===
RCS file: /cvsroot/fdm/fdm/child-deliver.c,v
retrieving revision 1.21
diff -u -p -r1.21 child-deliver.c
--- child-deliver.c 17 May 2009 19:20:08 -  1.21
+++ child-deliver.c 24 May 2009 14:56:51 -
@@ -61,10 +61,10 @@ child_deliver(struct child *child, struc
 
if (privsep_send(pio, msg, msgbuf) != 0)
fatalx(privsep_send error);
-   if (privsep_recv(pio, msg, NULL) != 0)
-   fatalx(privsep_recv error);
-   if (msg.type != MSG_EXIT)
-   fatalx(unexpected message);
+   do {
+   if (privsep_recv(pio, msg, NULL) != 0)
+   fatalx(privsep_recv error);
+   } while (msg.type != MSG_EXIT);
 
 #ifdef DEBUG
COUNTFDS(a-name);
Index: child-fetch.c
===
RCS file: /cvsroot/fdm/fdm/child-fetch.c,v
retrieving revision 1.72
diff -u -p -r1.72 child-fetch.c
--- child-fetch.c   17 May 2009 18:23:45 -  1.72
+++ child-fetch.c   24 May 2009 14:56:51 -
@@ -127,11 +127,11 @@ child_fetch(struct child *child, struct 
log_debug3(%s: sending exit message to parent, a-name);
if (privsep_send(pio, msg, NULL) != 0)
fatalx(privsep_send error);
-   log_debug3(%s: waiting for exit message from parent, a-name);
-   if (privsep_recv(pio, msg, NULL) != 0)
-   fatalx(privsep_recv error);
-   if (msg.type != MSG_EXIT)
-   fatalx(unexpected message);
+   do {
+   log_debug3(%s: waiting for exit message from parent, a-name);
+   if (privsep_recv(pio, msg, NULL) != 0)
+   fatalx(privsep_recv error);   
+   } while (msg.type != MSG_EXIT);
 
 #ifdef DEBUG
COUNTFDS(a-name);
Index: child.c
===
RCS file: /cvsroot/fdm/fdm/child.c,v
retrieving revision 1.147
diff -u -p -r1.147 child.c
--- child.c 17 May 2009 19:20:08 -  1.147
+++ child.c 24 May 2009 14:56:51 -
@@ -18,6 +18,7 @@
 
 #include sys/types.h
 #include sys/socket.h
+#include sys/wait.h
 
 #include unistd.h
 
@@ -35,6 +36,9 @@ child_sighandler(int sig)
case SIGUSR1:
sigusr1 = 1;
break;
+   case SIGCHLD:
+   waitpid(WAIT_ANY, NULL, WNOHANG);
+   break;
case SIGTERM:
cleanup_purge();
_exit(1);
@@ -60,6 +64,7 @@ child_fork(void)
sigaddset(act.sa_mask, SIGUSR1);
sigaddset(act.sa_mask, SIGINT);
sigaddset(act.sa_mask, SIGTERM);
+   sigaddset(act.sa_mask, SIGCHLD);
act.sa_flags = SA_RESTART;
 
act.sa_handler = SIG_IGN;
@@ -75,6 +80,8 @@ child_fork(void)
fatal(sigaction failed);
if (sigaction(SIGTERM, act, NULL)  0)
fatal(sigaction failed);
+   if (sigaction(SIGCHLD, act, NULL)  0)
+   fatal(sigaction failed);
 
return (0);
default:
Index: command.c
===
RCS file: /cvsroot/fdm/fdm/command.c,v
retrieving revision 1.54
diff -u -p -r1.54 command.c
--- command.c   17 May 2009 18:23:45 -  1.54
+++ command.c   24 May 2009 14:56:51 -
@@ -132,6 +132,8 @@ cmd_start(const char *s, int flags, cons
fatal(signal failed);
 if (signal(SIGUSR2, SIG_DFL) == SIG_ERR)
fatal(signal failed);
+if (signal(SIGCHLD, SIG_DFL) == SIG_ERR)
+   fatal(signal failed);
 
execl(_PATH_BSHELL, sh, -c, s, (char *) NULL);
fatal(execl failed);
@@ -271,10 +273,14 @@ cmd_poll(struct cmd *cmd, char 

Bug#528818: fdm log for debian bug 528818

2009-05-24 Thread Ritesh Raj Sarraf
On Sunday 24 May 2009 20:31:18 Nicholas Marriott wrote:
 This diff makes several changes which hopefully should make it a bit more
 robust and correctly handle the various child processes on exit.

 Please test.

Hi Nicholas,

Can you please send it in patch format ?
That way I can easily apply it on the Debian package and test it.

Ritesh
-- 
Ritesh Raj Sarraf
RESEARCHUT - http://www.researchut.com
Necessity is the mother of invention.



signature.asc
Description: This is a digitally signed message part.