-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Hi!

I recently upgraded one machine from Amanda v2.5.2p1 to
v2.6.1p2 and noticed that with the new version amandad
no longer collects the exit status of its forked child
processes, so all terminated children are left as zombies
until the main amandad process terminates.

This creates a situation with many defunct processes during
a backup run, like this:

r...@file:~ {530} $ ps awwx -o user,pid,ppid,start,stat,wchan,command | grep amanda
amanda   28390 28387 20:00:01 Ss   wait4  /bin/sh /usr/sbin/amdump workdays
amanda   28423 28390 20:00:02 S    wait4  /usr/bin/perl - 
/usr/libexec/amanda/planner /usr/libexec/amanda/driver workdays 20100617200002
amanda   28425 28423 20:00:02 S    poll   /usr/libexec/amanda/driver workdays
amanda   28426 28425 20:00:03 S    ?      taper workdays
amanda   28427 28425 20:00:03 S    poll   dumper0 workdays
amanda   28428 28425 20:00:03 R    -      dumper1 workdays
amanda   28429 28425 20:00:03 S    ?      dumper2 workdays
amanda   28430 28425 20:00:03 S    ?      dumper3 workdays
amanda   28431 28425 20:00:03 S    ?      dumper4 workdays
amanda   28432 28426 20:00:03 S    poll   taper workdays
amanda   28434  1120 20:00:03 S    poll   amandad -auth=bsd amdump amindexd 
amidxtaped
amanda   28436 28434 20:00:03 Z    exit   [amandad] <defunct>
amanda   28438 28434 20:00:04 Z    exit   [amandad] <defunct>
amanda   29575 28425 20:27:33 S    tcp_da chunker0 workdays
amanda   29576 28425 20:27:33 S    tcp_da chunker1 workdays
amanda   29578 28428 20:27:34 S    pipe_w /bin/gzip --best
amanda   29579 28434 20:27:34 S    pipe_w /usr/libexec/amanda/sendbackup 
amandad bsd
amanda   29580 28434 20:27:34 Z    exit   [amandad] <defunct>
amanda   29581 29579 20:27:34 R    -      /bin/gzip --fast
amanda   29582 29579 20:27:34 S    pipe_w /usr/libexec/amanda/sendbackup 
amandad bsd
root     29583 29579 20:27:34 S    pipe_w /bin/tar --create --file - 
--directory /home/gruppen/alle/Projekte --one-file-system --listed-incremental 
/var/db/amanda/gnutar-lists/file.local_home_gruppen_alle_Projekte_0.new 
--sparse --ignore-failed-read --totals --exclude-from 
/tmp/amanda/sendbackup._home_gruppen_alle_Projekte.20100617202734.exclude .
amanda   29584 29582 20:27:34 S    wait4  sh -c /bin/tar -tf - 2>/dev/null | 
sed -e 's/^\.//'
amanda   29585 29584 20:27:34 S    -      /bin/tar -tf -
amanda   29586 29584 20:27:34 S    pipe_w sed -e s/^\.//
amanda   29587 28427 20:27:34 S    pipe_w /bin/gzip --best
amanda   29588 29583 20:27:34 Z    exit   [sh] <defunct>
amanda   29602 28434 20:27:49 Z    exit   [amandad] <defunct>
amanda   29620 28434 20:28:03 Z    exit   [amandad] <defunct>
amanda   29636 28434 20:28:18 Z    exit   [amandad] <defunct>
amanda   29695 28434 20:30:18 Z    exit   [amandad] <defunct>
amanda   29845 28434 20:41:03 Z    exit   [amandad] <defunct>
amanda   29857 28434 20:41:19 Z    exit   [amandad] <defunct>
amanda   29871 28434 20:41:33 Z    exit   [amandad] <defunct>
amanda   29886 28434 20:41:48 Z    exit   [amandad] <defunct>
amanda   29902 28434 20:42:04 Z    exit   [amandad] <defunct>
amanda   29920 28434 20:42:19 Z    exit   [amandad] <defunct>
amanda   29936 28434 20:42:33 Z    exit   [amandad] <defunct>
amanda   29951 28434 20:42:48 Z    exit   [amandad] <defunct>
amanda   30071 28434 20:47:29 Z    exit   [amandad] <defunct>
amanda   30091 28434 20:47:52 Z    exit   [amandad] <defunct>
amanda   30108 28434 20:48:08 Z    exit   [amandad] <defunct>
amanda   30121 28434 20:48:22 Z    exit   [amandad] <defunct>
amanda   30136 28434 20:48:38 Z    exit   [amandad] <defunct>
amanda   30150 28434 20:48:53 Z    exit   [amandad] <defunct>
amanda   30164 28434 20:49:07 Z    exit   [amandad] <defunct>
amanda   30180 28434 20:49:31 Z    exit   [amandad] <defunct>
amanda   30221 28434 20:50:23 Z    exit   [amandad] <defunct>
amanda   30267 28434 20:51:10 Z    exit   [amandad] <defunct>
amanda   30284 28434 20:51:36 Z    exit   [amandad] <defunct>
amanda   30301 28434 20:52:02 Z    exit   [amandad] <defunct>
amanda   30319 28434 20:52:38 Z    exit   [amandad] <defunct>
amanda   30348 28434 20:53:18 Z    exit   [amandad] <defunct>
amanda   30374 28434 20:53:59 Z    exit   [amandad] <defunct>
amanda   30390 28434 20:54:14 Z    exit   [amandad] <defunct>
amanda   30406 28434 20:54:29 Z    exit   [amandad] <defunct>
amanda   30422 28434 20:54:43 Z    exit   [amandad] <defunct>
amanda   30442 28434 20:55:00 Z    exit   [amandad] <defunct>
amanda   30470 28434 20:55:15 Z    exit   [amandad] <defunct>
amanda   30486 28434 20:55:31 Z    exit   [amandad] <defunct>
amanda   30502 28434 20:55:45 Z    exit   [amandad] <defunct>
amanda   30533 28434 20:56:01 Z    exit   [amandad] <defunct>
amanda   30549 28434 20:56:16 Z    exit   [amandad] <defunct>
amanda   30562 28434 20:56:31 Z    exit   [amandad] <defunct>
amanda   30591 28434 20:56:46 Z    exit   [amandad] <defunct>
amanda   30607 28434 20:57:00 Z    exit   [amandad] <defunct>
amanda   30621 28434 20:57:15 Z    exit   [amandad] <defunct>
amanda   30651 28434 20:57:30 Z    exit   [amandad] <defunct>
amanda   30679 28434 20:57:45 Z    exit   [amandad] <defunct>
amanda   30694 28434 20:58:01 Z    exit   [amandad] <defunct>
amanda   30722 28434 20:58:15 Z    exit   [amandad] <defunct>
amanda   30737 28434 20:58:31 Z    exit   [amandad] <defunct>
amanda   30754 28434 20:58:45 Z    exit   [amandad] <defunct>
amanda   30769 28434 20:59:01 Z    exit   [amandad] <defunct>
amanda   30785 28434 20:59:15 Z    exit   [amandad] <defunct>
amanda   30800 28434 20:59:30 Z    exit   [amandad] <defunct>
amanda   30817 28434 20:59:46 Z    exit   [amandad] <defunct>
amanda   30837 28434 21:00:01 Z    exit   [amandad] <defunct>
amanda   30868 28434 21:00:15 Z    exit   [amandad] <defunct>
amanda   30896 28434 21:00:30 Z    exit   [amandad] <defunct>
amanda   30914 28434 21:00:46 Z    exit   [amandad] <defunct>
amanda   30929 28434 21:01:00 Z    exit   [amandad] <defunct>
amanda   30948 28434 21:01:15 Z    exit   [amandad] <defunct>
amanda   30964 28434 21:01:30 Z    exit   [amandad] <defunct>
amanda   30981 28434 21:01:46 Z    exit   [amandad] <defunct>
amanda   30996 28434 21:02:00 Z    exit   [amandad] <defunct>
amanda   31035 28434 21:02:15 Z    exit   [amandad] <defunct>
amanda   31051 28434 21:02:31 Z    exit   [amandad] <defunct>
amanda   31068 28434 21:02:46 Z    exit   [amandad] <defunct>
amanda   31083 28434 21:03:00 Z    exit   [amandad] <defunct>
amanda   31098 28434 21:03:15 Z    exit   [amandad] <defunct>
amanda   31114 28434 21:03:31 Z    exit   [amandad] <defunct>
amanda   31130 28434 21:03:45 Z    exit   [amandad] <defunct>
amanda   31145 28434 21:04:00 Z    exit   [amandad] <defunct>
amanda   31160 28434 21:04:16 Z    exit   [amandad] <defunct>
amanda   31176 28434 21:04:31 Z    exit   [amandad] <defunct>
amanda   31193 28434 21:04:45 Z    exit   [amandad] <defunct>


This is from a backup server with about 75 DLEs distributed
across 3 servers with a total of about 2TB of data. All servers run
Linux.

Please note that the backup itself finishes successfully
and a restore of files from backup works fine. So it is not
a showstopper. But during backup time Nagios reports those
zombie processes and this fills our mailbox, so I really
would like to solve the problem.

I noticed the following changes between 2.5.2p1 and 2.6.1p2 which
might be related to the problem:

- --- amandad.c   2 May 2007 11:54:59 -0000       1.1.1.2
+++ amandad.c   24 Jul 2009 18:42:48 -0000      1.1.1.4

@@ -162,40 +175,21 @@
 static action_t s_ackwait(struct active_service *, action_t, pkt_t *);

 static void repfd_recv(void *);
+static void errfd_recv(void *);
 static void timeout_repfd(void *);
 static void protocol_recv(void *, pkt_t *, security_status_t);
 static void process_readnetfd(void *);
 static void process_writenetfd(void *, void *, ssize_t);
 static struct active_service *service_new(security_handle_t *,
- -    const char *, const char *);
+    const char *, service_t, const char *);
 static void service_delete(struct active_service *);
 static int writebuf(struct active_service *, const void *, size_t);
 static ssize_t do_sendpkt(security_handle_t *handle, pkt_t *pkt);
- -
- -static void child_signal(int signal);
+static char *amandad_get_security_conf (char *, void *);

 static const char *state2str(state_t);
 static const char *action2str(action_t);

- -/*
- - * Harvests defunct processes...
- - */
- -
- -static void
- -child_signal(
- -    int                signal)
- -{
- -    pid_t      rp;
- -
- -    (void)signal;      /* Quite compiler warning */
- -    /*
- -     * Reap and child status and promptly ignore since we don't care...
- -     */
- -    do {
- -       rp = waitpid(-1, NULL, WNOHANG);
- -    } while (rp > 0);
- -}
- -
 int
 main(
     int                argc,


and

@@ -232,49 +233,28 @@
     dbopen(DBG_SUBDIR_AMANDAD);

     if(argv == NULL) {
- -       error("argv == NULL\n");
+       error(_("argv == NULL\n"));
        /*NOTREACHED*/
     }

     /* Don't die when child closes pipe */
     signal(SIGPIPE, SIG_IGN);

- -    /* Tell me when a child exits or dies... */
- -    act.sa_handler = child_signal;
- -    sigemptyset(&act.sa_mask);
- -    act.sa_flags = 0;
- -    if(sigaction(SIGCHLD, &act, &oact) != 0) {
- -       error("error setting SIGCHLD handler: %s", strerror(errno));
- -       /*NOTREACHED*/
- -    }
+    /* Parse the configuration; we'll handle errors later */
+    config_init(CONFIG_INIT_CLIENT, NULL);

- -    conffile = vstralloc(CONFIG_DIR, "/", "amanda-client.conf", NULL);
- -    if (read_clientconf(conffile) > 0) {
- -       error("error reading conffile: %s", conffile);
- -       /*NOTREACHED*/
+    if (geteuid() == 0) {
+       check_running_as(RUNNING_AS_ROOT);
+       initgroups(CLIENT_LOGIN, get_client_gid());
+       setgid(get_client_gid());
+       setegid(get_client_gid());
+       seteuid(get_client_uid());
+    } else {
+       check_running_as(RUNNING_AS_CLIENT_LOGIN);
     }
- -    amfree(conffile);
- -
- -#ifdef USE_DBMALLOC
- -    dbmalloc_info.start.size = malloc_inuse(&dbmalloc_info.start.hist);
- -#endif

     erroutput_type = (ERR_INTERACTIVE|ERR_SYSLOG);

- -#ifdef FORCE_USERID
- -    /* we'd rather not run as root */
- -    if (geteuid() == 0) {
- -       if(client_uid == (uid_t) -1) {
- -           error("error [cannot find user %s in passwd file]\n", CLIENT_LOGIN);
- -           /*NOTREACHED*/
- -       }
- -       initgroups(CLIENT_LOGIN, client_gid);
- -       setgid(client_gid);
- -       setegid(client_gid);
- -       seteuid(client_uid);
- -    }
- -#endif /* FORCE_USERID */
- -
     /*
      * ad-hoc argument parsing
      *

Is my diagnosis correct?
Why was the SIGCHLD handler removed in Amanda 2.6?

Is this a known problem?
Am I the only one seeing it (I found some older similar reports,
but no recent ones)?

- - andreas

- --
Andreas Haumer                     | mailto:[email protected]
*x Software + Systeme              | http://www.xss.co.at/
Karmarschgasse 51/2/20             | Tel: +43-1-6060114-0
A-1100 Vienna, Austria             | Fax: +43-1-6060114-71
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.10 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/

iD8DBQFMG1vsxJmyeGcXPhERAsP+AJ4qeUQ1TDMvZ4QgqY6J9uw82+3lTQCePGVB
Om7VkKTg+JOW3Dy0TwvnlNM=
=gZ8P
-----END PGP SIGNATURE-----

Reply via email to