I think I have a fix for the MAXFD issue. Or at least one cause of
it. What appears to be happening is:
cfexecd calls MailResult() to decide whether to send an email after
fork()ing cfagent. Control reaches FileChecksum() at line 714 of
cfexecd.c.
-----BEGIN SNIPPET-----
switch (type)
{
case 's': md = EVP_get_digestbyname("sha");
break;
case 'm': md = EVP_get_digestbyname("md5");
break;
default: FatalError("Software failure in ChecksumFile");
}
if (!md)
{
return 0;
}
-----END SNIPPET-----
Under the debugger I saw EVP_get_digestbyname() return NULL and
ChecksumFile() return 0. Quite why this is I don't know. The upshot
is that the filehandle opened to perform the digest calculation is not
closed.
This results in file descriptor leakage. With lsof I could see
cfexecd holding open these log files until eventually (after sixteen
or so scheduled cfagent runs) the number of open filedescriptors
exceeded MAXFD (20).
On the next run LocalExec() calls cfpopen() to spawn cfagent.
cfpopen() fork()s a child, which exec()s cfagent. The parent then
reaches line 191 of popen.c.
-----BEGIN SNIPPET-----
if (fileno(pp) >= MAXFD)
{
snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child %d
higher than MAXFD, check for defunct children", fileno(pp), pid);
CfLog(cferror,OUTPUT,"");
return NULL;
}
else
{
CHILD[fileno(pp)] = pid;
return pp;
}
-----END SNIPPET-----
cfopen() has returned a new file pointer to LocalExec(), which
handles this gracefully, aborting the LocalExec() run. But it
therefore never gets to cfpclose() and hence doesn't wait() for the
cfagent child, which zombies.
The attached patch fixes the problem by being more careful when
MAXFD is exceeded and also by eliminating the particular cause of
leakage that I saw.
Firstly FileChecksum() closes the new filehandle before returning 0
in the case where EVP_get_digestbyname() fails.
In popen.c I've moved the wait()/waitpid() code to a new function,
cfpwait(pid_t) and called it not only from the original location in
cfpclose() but also before returning NULL from cfpopen*() if the
fork() has already completed.
Even if MAXFD did still end up being exceeded, for example because
there is another cause of leakage I didn't see, cfpclose() will now
wait for "pid" -1, ie any child, and so not leave a zombie. It can't
look up the specific pid from CHILD[fd] as fd is necessarily outside
the array bounds.
However, there may be more to this. A cursory grep of the source
tree shows that the return value of cfpclose() is thrown away
everywhere except for within package.c. While moving stuff into
cfpwait() I noticed something a bit odd. The code looks like this:
-----BEGIN SNIPPET-----
#ifdef HAVE_WAITPID
while (waitpid(pid, &status, 0) < 0
...
return status;
#else
while ((wait_result = wait(&status)) != pid)
...
if (WIFSIGNALED(status))
...
if (! WIFEXITED(status))
...
return (WEXITSTATUS(status));
#endif
-----END SNIPPET-----
I suspect the #endif should come above the WIFSIGNALED()
conditional. I haven't moved it, however, as the return code is
checked in, eg FreeBSDPackageList(). Nonetheless it may be, given
that people were seeing funky behaviour with RPM packages, that the
return value of cfpopen() was causing trouble.
--- cfengine-2.2.7/src/cfexecd.c.orig 2008-07-14 17:08:01.997279000 +0100
+++ cfengine-2.2.7/src/cfexecd.c 2008-07-15 10:17:02.008138000 +0100
@@ -722,6 +722,8 @@
if (!md)
{
+ /* Don't leak file descriptors. */
+ fclose(file);
return 0;
}
--- cfengine-2.2.7/src/popen.c.orig 2008-06-10 18:40:26.000000000 +0100
+++ cfengine-2.2.7/src/popen.c 2008-07-15 14:10:34.550638000 +0100
@@ -174,6 +174,8 @@
if ((pp = fdopen(pd[0],type)) == NULL)
{
+ /* Don't leave zombies. */
+ cfpwait(pid);
return NULL;
}
break;
@@ -184,6 +186,8 @@
if ((pp = fdopen(pd[1],type)) == NULL)
{
+ /* Don't leave zombies. */
+ cfpwait(pid);
return NULL;
}
}
@@ -192,6 +196,8 @@
{
snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child %d higher than
MAXFD, check for defunct children", fileno(pp), pid);
CfLog(cferror,OUTPUT,"");
+ /* Don't leave zombies. */
+ cfpwait(pid);
return NULL;
}
else
@@ -340,6 +346,8 @@
if ((pp = fdopen(pd[0],type)) == NULL)
{
+ /* Don't leave zombies. */
+ cfpwait(pid);
return NULL;
}
break;
@@ -350,6 +358,8 @@
if ((pp = fdopen(pd[1],type)) == NULL)
{
+ /* Don't leave zombies. */
+ cfpwait(pid);
return NULL;
}
}
@@ -358,6 +368,8 @@
{
snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child %d higher than
MAXFD, check for defunct children", fileno(pp), pid);
CfLog(cferror,OUTPUT,"");
+ /* Don't leave zombies. */
+ cfpwait(pid);
return NULL;
}
else
@@ -456,6 +468,8 @@
if ((pp = fdopen(pd[0],type)) == NULL)
{
+ /* Don't leave zombies. */
+ cfpwait(pid);
return NULL;
}
break;
@@ -466,6 +480,8 @@
if ((pp = fdopen(pd[1],type)) == NULL)
{
+ /* Don't leave zombies. */
+ cfpwait(pid);
return NULL;
}
}
@@ -474,6 +490,8 @@
{
snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child %d higher than
MAXFD, check for defunct children", fileno(pp), pid);
CfLog(cferror,OUTPUT,"");
+ /* Don't leave zombies. */
+ cfpwait(pid);
return NULL;
}
else
@@ -596,6 +614,8 @@
if ((pp = fdopen(pd[0],type)) == NULL)
{
+ /* Don't leave zombies. */
+ cfpwait(pid);
return NULL;
}
break;
@@ -606,6 +626,8 @@
if ((pp = fdopen(pd[1],type)) == NULL)
{
+ /* Don't leave zombies. */
+ cfpwait(pid);
return NULL;
}
}
@@ -614,6 +636,8 @@
{
snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child %d higher than
MAXFD, check for defunct children", fileno(pp), pid);
CfLog(cferror,OUTPUT,"");
+ /* Don't leave zombies. */
+ cfpwait(pid);
return NULL;
}
else
@@ -631,43 +655,11 @@
/* Close commands
*/
/******************************************************************************/
-int cfpclose(FILE *pp)
+int cfpwait(pid_t *pid)
-{ int fd, status, wait_result;
- pid_t pid;
+{ int status, wait_result;
-Debug("cfpclose(pp)\n");
-
-if (CHILD == NULL) /* popen hasn't been called */
- {
- return -1;
- }
-
-ALARM_PID = -1;
-fd = fileno(pp);
-
-if (fd >= MAXFD)
- {
- snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child higher than "
- "MAXFD, check for defunct children", fd);
- CfLog(cferror,OUTPUT,"");
- fclose(pp);
- return -1;
- }
-
-if ((pid = CHILD[fd]) == 0)
- {
- return -1;
- }
-
-CHILD[fd] = 0;
-
-if (fclose(pp) == EOF)
- {
- return -1;
- }
-
-Debug("cfpopen - Waiting for process %d\n",pid);
+Debug("cfpwait - Waiting for process %d\n",pid);
#ifdef HAVE_WAITPID
@@ -707,6 +699,48 @@
#endif
}
+int cfpclose(FILE *pp)
+
+{ int fd;
+ pid_t pid;
+
+Debug("cfpclose(pp)\n");
+
+if (CHILD == NULL) /* popen hasn't been called */
+ {
+ return -1;
+ }
+
+ALARM_PID = -1;
+fd = fileno(pp);
+
+if (fd >= MAXFD)
+ {
+ snprintf(OUTPUT,CF_BUFSIZE,"File descriptor %d of child higher than "
+ "MAXFD, check for defunct children", fd);
+ CfLog(cferror,OUTPUT,"");
+ /* We can't wait for the specific pid as it isn't stored in CHILD. */
+ cfpwait(-1);
+ return -1;
+ }
+else
+ {
+ if ((pid = CHILD[fd]) == 0)
+ {
+ return -1;
+ }
+
+ CHILD[fd] = 0;
+ }
+
+if (fclose(pp) == EOF)
+ {
+ return -1;
+ }
+
+return cfpwait(pid);
+}
+
/*******************************************************************/
int CfSetUid(uid_t uid,gid_t gid)
--- cfengine-2.2.7/src/prototypes.h.orig 2008-07-15 09:51:45.676513000
+0100
+++ cfengine-2.2.7/src/prototypes.h 2008-07-15 09:50:26.306308000 +0100
@@ -918,6 +918,7 @@
FILE *cfpopen (char *command, char *type);
FILE *cfpopen_sh (char *command, char *type);
FILE *cfpopen_shsetuid (char *command, char *type, uid_t uid, gid_t gid, char
*chdirv, char *chrootv);
+int cfpwait (pid_t *pid);
int cfpclose (FILE *pp);
int cfpclose_def (FILE *pp, char *defines, char *elsedef);
int CfSetUid(uid_t uid,gid_t gid);
_______________________________________________
Bug-cfengine mailing list
[email protected]
https://cfengine.org/mailman/listinfo/bug-cfengine