I think I have a fix for the MAXFD issue. Or at least one cause of it. What appears to be happening is:

cfexecd calls MailResult() to decide whether to send an email after fork()ing cfagent. Control reaches FileChecksum() at line 714 of cfexecd.c.

-----BEGIN SNIPPET-----
  switch (type)
     {
     case 's': md = EVP_get_digestbyname("sha");
          break;
     case 'm': md = EVP_get_digestbyname("md5");
          break;
     default: FatalError("Software failure in ChecksumFile");
     }

  if (!md)
     {
     return 0;
     }
-----END SNIPPET-----

Under the debugger I saw EVP_get_digestbyname() return NULL and ChecksumFile() return 0. Quite why this is I don't know. The upshot is that the filehandle opened to perform the digest calculation is not closed.

This results in file descriptor leakage. With lsof I could see cfexecd holding open these log files until eventually (after sixteen or so scheduled cfagent runs) the number of open filedescriptors exceeded MAXFD (20).

On the next run LocalExec() calls cfpopen() to spawn cfagent. cfpopen() fork()s a child, which exec()s cfagent. The parent then reaches line 191 of popen.c.

-----BEGIN SNIPPET-----
   if (fileno(pp) >= MAXFD)
      {
snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child %d higher than MAXFD, check for defunct children", fileno(pp), pid);
      CfLog(cferror,OUTPUT,"");
      return NULL;
      }
   else
      {
      CHILD[fileno(pp)] = pid;
      return pp;
      }
-----END SNIPPET-----

cfopen() has returned a new file pointer to LocalExec(), which handles this gracefully, aborting the LocalExec() run. But it therefore never gets to cfpclose() and hence doesn't wait() for the cfagent child, which zombies.

The attached patch fixes the problem by being more careful when MAXFD is exceeded and also by eliminating the particular cause of leakage that I saw.

Firstly FileChecksum() closes the new filehandle before returning 0 in the case where EVP_get_digestbyname() fails.

In popen.c I've moved the wait()/waitpid() code to a new function, cfpwait(pid_t) and called it not only from the original location in cfpclose() but also before returning NULL from cfpopen*() if the fork() has already completed.

Even if MAXFD did still end up being exceeded, for example because there is another cause of leakage I didn't see, cfpclose() will now wait for "pid" -1, ie any child, and so not leave a zombie. It can't look up the specific pid from CHILD[fd] as fd is necessarily outside the array bounds.

However, there may be more to this. A cursory grep of the source tree shows that the return value of cfpclose() is thrown away everywhere except for within package.c. While moving stuff into cfpwait() I noticed something a bit odd. The code looks like this:

-----BEGIN SNIPPET-----
#ifdef HAVE_WAITPID

while (waitpid(pid, &status, 0) < 0
...
return status;

#else

while ((wait_result = wait(&status)) != pid)
...
if (WIFSIGNALED(status))
...
if (! WIFEXITED(status))
...
return (WEXITSTATUS(status));
#endif
-----END SNIPPET-----

I suspect the #endif should come above the WIFSIGNALED() conditional. I haven't moved it, however, as the return code is checked in, eg FreeBSDPackageList(). Nonetheless it may be, given that people were seeing funky behaviour with RPM packages, that the return value of cfpopen() was causing trouble.
--- cfengine-2.2.7/src/cfexecd.c.orig   2008-07-14 17:08:01.997279000 +0100
+++ cfengine-2.2.7/src/cfexecd.c        2008-07-15 10:17:02.008138000 +0100
@@ -722,6 +722,8 @@
 
    if (!md)
       {
+      /* Don't leak file descriptors. */
+      fclose(file);
       return 0;
       }
    
--- cfengine-2.2.7/src/popen.c.orig     2008-06-10 18:40:26.000000000 +0100
+++ cfengine-2.2.7/src/popen.c  2008-07-15 14:10:34.550638000 +0100
@@ -174,6 +174,8 @@
           
           if ((pp = fdopen(pd[0],type)) == NULL)
              {
+             /* Don't leave zombies. */
+             cfpwait(pid);
              return NULL;
              }
           break;
@@ -184,6 +186,8 @@
           
           if ((pp = fdopen(pd[1],type)) == NULL)
              {
+             /* Don't leave zombies. */
+             cfpwait(pid);
              return NULL;
              }
       }
@@ -192,6 +196,8 @@
       {
       snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child %d higher than 
MAXFD, check for defunct children", fileno(pp), pid);
       CfLog(cferror,OUTPUT,"");
+      /* Don't leave zombies. */
+      cfpwait(pid);
       return NULL;
       }
    else
@@ -340,6 +346,8 @@
           
           if ((pp = fdopen(pd[0],type)) == NULL)
              {
+             /* Don't leave zombies. */
+             cfpwait(pid);
              return NULL;
              }
           break;
@@ -350,6 +358,8 @@
           
           if ((pp = fdopen(pd[1],type)) == NULL)
              {
+             /* Don't leave zombies. */
+             cfpwait(pid);
              return NULL;
              }
       }
@@ -358,6 +368,8 @@
       {
       snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child %d higher than 
MAXFD, check for defunct children", fileno(pp), pid);
       CfLog(cferror,OUTPUT,"");
+      /* Don't leave zombies. */
+      cfpwait(pid);
       return NULL;
       }
    else
@@ -456,6 +468,8 @@
           
           if ((pp = fdopen(pd[0],type)) == NULL)
              {
+             /* Don't leave zombies. */
+             cfpwait(pid);
              return NULL;
              }
           break;
@@ -466,6 +480,8 @@
           
           if ((pp = fdopen(pd[1],type)) == NULL)
              {
+             /* Don't leave zombies. */
+             cfpwait(pid);
              return NULL;
              }
       }
@@ -474,6 +490,8 @@
       {
       snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child %d higher than 
MAXFD, check for defunct children", fileno(pp), pid);
       CfLog(cferror,OUTPUT,"");
+      /* Don't leave zombies. */
+      cfpwait(pid);
       return NULL;
       }
    else
@@ -596,6 +614,8 @@
           
           if ((pp = fdopen(pd[0],type)) == NULL)
              {
+             /* Don't leave zombies. */
+             cfpwait(pid);
              return NULL;
              }
           break;
@@ -606,6 +626,8 @@
           
           if ((pp = fdopen(pd[1],type)) == NULL)
              {
+             /* Don't leave zombies. */
+             cfpwait(pid);
              return NULL;
              }
       }
@@ -614,6 +636,8 @@
       {
       snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child %d higher than 
MAXFD, check for defunct children", fileno(pp), pid);
       CfLog(cferror,OUTPUT,"");
+      /* Don't leave zombies. */
+      cfpwait(pid);
       return NULL;
       }
    else
@@ -631,43 +655,11 @@
 /* Close commands                                                             
*/
 
/******************************************************************************/
 
-int cfpclose(FILE *pp)
+int cfpwait(pid_t *pid)
 
-{ int fd, status, wait_result;
-  pid_t pid;
+{ int status, wait_result;
 
-Debug("cfpclose(pp)\n");
-
-if (CHILD == NULL)  /* popen hasn't been called */
-   {
-   return -1;
-   }
-
-ALARM_PID = -1;
-fd = fileno(pp);
-
-if (fd >= MAXFD)
-   {
-   snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child higher than "
-      "MAXFD, check for defunct children", fd);
-   CfLog(cferror,OUTPUT,"");
-   fclose(pp);
-   return -1;
-   }
-
-if ((pid = CHILD[fd]) == 0)
-   {
-   return -1;
-   }
-
-CHILD[fd] = 0;
-
-if (fclose(pp) == EOF)
-   {
-   return -1;
-   }
-
-Debug("cfpopen - Waiting for process %d\n",pid); 
+Debug("cfpwait - Waiting for process %d\n",pid); 
 
 #ifdef HAVE_WAITPID
 
@@ -707,6 +699,48 @@
 #endif
 }
 
+int cfpclose(FILE *pp)
+
+{ int fd;
+  pid_t pid;
+
+Debug("cfpclose(pp)\n");
+
+if (CHILD == NULL)  /* popen hasn't been called */
+   {
+   return -1;
+   }
+
+ALARM_PID = -1;
+fd = fileno(pp);
+
+if (fd >= MAXFD)
+   {
+   snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child higher than "
+      "MAXFD, check for defunct children", fd);
+   CfLog(cferror,OUTPUT,"");
+   /* We can't wait for the specific pid as it isn't stored in CHILD. */
+   cfpwait(-1);
+   return -1;
+   }
+else
+   {
+   if ((pid = CHILD[fd]) == 0)
+      {
+      return -1;
+      }
+
+   CHILD[fd] = 0;
+   }
+
+if (fclose(pp) == EOF)
+   {
+   return -1;
+   }
+
+return cfpwait(pid);
+}
+
 /*******************************************************************/
 
 int CfSetUid(uid_t uid,gid_t gid)
--- cfengine-2.2.7/src/prototypes.h.orig        2008-07-15 09:51:45.676513000 
+0100
+++ cfengine-2.2.7/src/prototypes.h     2008-07-15 09:50:26.306308000 +0100
@@ -918,6 +918,7 @@
 FILE *cfpopen (char *command, char *type);
 FILE *cfpopen_sh (char *command, char *type);
 FILE *cfpopen_shsetuid (char *command, char *type, uid_t uid, gid_t gid, char 
*chdirv, char *chrootv);
+int cfpwait (pid_t *pid);
 int cfpclose (FILE *pp);
 int cfpclose_def (FILE *pp, char *defines, char *elsedef);
 int CfSetUid(uid_t uid,gid_t gid);
_______________________________________________
Bug-cfengine mailing list
[email protected]
https://cfengine.org/mailman/listinfo/bug-cfengine

Reply via email to