This is another debian patch with lots of changes (see attached). I'm
running out of time for tracing the validity of these and am hoping that
someone else will take a look. This seems to apply to a rather important
part of the whole system, so is probably worth a look.
(Yes, I can submit this to bugzilla, too, if it makes sense to do so.)
Note one minor bug fix in the patch:
+int read_msg(int fd, struct notify_message *msg)
+{
+ ssize_t r;
+ size_t off = 0;
+ int s = sizeof(struct notify_message);
should be
------ --
+int read_msg(int fd, struct notify_message *msg)
+{
+ ssize_t r;
+ size_t off = 0;
+ size_t s = sizeof(struct notify_message);
#! /bin/sh /usr/share/dpatch/dpatch-run
## 13-master_process_handling.dpatch by Sven Mueller <[email protected]>
##
## All lines beginning with `## DP:' are a description of the patch.
## DP: Fixes process (child) handling in master process
@DPATCH@
diff -urNad git~/lib/util.c git/lib/util.c
--- git~/lib/util.c 2010-01-16 19:21:09.000000000 -0200
+++ git/lib/util.c 2010-01-16 19:28:14.288091659 -0200
@@ -365,9 +365,9 @@
int become_cyrus(void)
{
struct passwd *p;
- int newuid, newgid;
+ uid_t newuid, newgid;
int result;
- static int uid = 0;
+ static uid_t uid = 0;
if (uid) return setuid(uid);
diff -urNad git~/master/master.c git/master/master.c
--- git~/master/master.c 2010-01-16 19:28:14.145091738 -0200
+++ git/master/master.c 2010-01-16 19:28:14.289091714 -0200
@@ -174,6 +174,8 @@
static struct centry *ctable[child_table_size];
static struct centry *cfreelist;
+static int child_mourning_time = 2; /* Time in seconds to remember child
+ after processing SIGCHLD */
static int janitor_frequency = 1; /* Janitor sweeps per second */
static int janitor_position; /* Entry to begin at in next sweep */
static struct timeval janitor_mark; /* Last time janitor did a sweep */
@@ -908,7 +910,7 @@
}
}
c->service_state = SERVICE_STATE_DEAD;
- c->janitor_deadline = time(NULL) + 2;
+ c->janitor_deadline = time(NULL) + child_mourning_time;
} else {
/* weird. Are we multithreaded now? we don't know this child */
syslog(LOG_WARNING,
@@ -917,7 +919,7 @@
c = get_centry();
c->pid = pid;
c->service_state = SERVICE_STATE_DEAD;
- c->janitor_deadline = time(NULL) + 2;
+ c->janitor_deadline = time(NULL) + child_mourning_time;
c->si = SERVICE_NONE;
c->next = ctable[pid % child_table_size];
ctable[pid % child_table_size] = c;
@@ -1073,6 +1075,36 @@
}
}
+/*
+ * Receives a message from a service.
+ *
+ * Returns zero if all goes well
+ * 1 if no msg available
+ * 2 if bad message received (incorrectly sized)
+ * -1 on error (errno set)
+ */
+int read_msg(int fd, struct notify_message *msg)
+{
+ ssize_t r;
+ size_t off = 0;
+ int s = sizeof(struct notify_message);
+
+ while (s > 0) {
+ do
+ r = read(fd, msg + off, s);
+ while ((r == -1) && (errno == EINTR));
+ if (r <= 0) break;
+ s -= r;
+ off += r;
+ }
+ if ( ((r == 0) && (off == 0)) ||
+ ((r == -1) && (errno == EAGAIN)) )
+ return 1;
+ if (r == -1) return -1;
+ if (s != 0) return 2;
+ return 0;
+}
+
void process_msg(const int si, struct notify_message *msg)
{
struct centry *c;
@@ -1398,8 +1430,9 @@
snprintf(buf, sizeof(buf),
"cannot find executable for service '%s'", name);
- /* if it is not, we're misconfigured, die. */
- fatal(buf, EX_CONFIG);
+ /* if it is not, we just skip it */
+ syslog(LOG_WARNING, "WARNING: %s -- ignored", buf);
+ return;
}
Services[i].maxforkrate = maxforkrate;
@@ -1411,7 +1444,7 @@
Services[i].desired_workers = prefork;
Services[i].babysit = babysit;
Services[i].max_workers = atoi(max);
- if (Services[i].max_workers == -1) {
+ if (Services[i].max_workers < 0) {
Services[i].max_workers = INT_MAX;
}
} else {
@@ -1419,6 +1452,7 @@
if (prefork > 1) prefork = 1;
Services[i].desired_workers = prefork;
Services[i].max_workers = 1;
+ Services[i].babysit = 0;
}
free(max);
@@ -1458,7 +1492,7 @@
if (!strcmp(cmd,"")) {
char buf[256];
snprintf(buf, sizeof(buf),
- "unable to find command or port for event '%s'", name);
+ "unable to find command for event '%s'", name);
if (ignore_err) {
syslog(LOG_WARNING, "WARNING: %s -- ignored", buf);
@@ -1514,7 +1548,7 @@
rl.rlim_cur = x;
rl.rlim_max = x;
- if (setrlimit(RLIMIT_NUMFDS, &rl) < 0) {
+ if (setrlimit(RLIMIT_NUMFDS, &rl) < 0 && x != RLIM_INFINITY) {
syslog(LOG_ERR, "setrlimit: Unable to set file descriptors limit to
%ld: %m", x);
#ifdef HAVE_GETRLIMIT
@@ -1529,11 +1563,9 @@
}
- if (verbose > 1) {
- r = getrlimit(RLIMIT_NUMFDS, &rl);
- syslog(LOG_DEBUG, "set maximum file descriptors to %ld/%ld",
rl.rlim_cur,
- rl.rlim_max);
- }
+ if (verbose > 1 && getrlimit(RLIMIT_NUMFDS, &rl) >=0)
+ syslog(LOG_DEBUG, "set maximum file descriptors to %ld/%ld",
+ rl.rlim_cur, rl.rlim_max);
#else
}
#endif /* HAVE_GETRLIMIT */
@@ -1567,13 +1599,18 @@
Services[i].stat[0], Services[i].stat[1]);
/* Only free the service info on the primary */
- if(Services[i].associate == 0) {
+ if (Services[i].associate == 0) {
+ free(Services[i].name);
free(Services[i].listen);
free(Services[i].proto);
}
+ Services[i].name = NULL;
Services[i].listen = NULL;
Services[i].proto = NULL;
Services[i].desired_workers = 0;
+ Services[i].nforks = 0;
+ Services[i].nactive = 0;
+ Services[i].nconnections = 0;
/* send SIGHUP to all children */
for (j = 0 ; j < child_table_size ; j++ ) {
@@ -1656,9 +1693,9 @@
p = getenv("CYRUS_VERBOSE");
if (p) verbose = atoi(p) + 1;
#ifdef HAVE_NETSNMP
- while ((opt = getopt(argc, argv, "C:M:p:l:Ddj:P:x:")) != EOF) {
+ while ((opt = getopt(argc, argv, "C:M:p:l:DdjJ:P:x:")) != EOF) {
#else
- while ((opt = getopt(argc, argv, "C:M:p:l:Ddj:")) != EOF) {
+ while ((opt = getopt(argc, argv, "C:M:p:l:DdjJ:")) != EOF) {
#endif
switch (opt) {
case 'C': /* alt imapd.conf file */
@@ -1691,8 +1728,15 @@
/* Janitor frequency */
janitor_frequency = atoi(optarg);
if(janitor_frequency < 1)
- fatal("The janitor period must be at least 1 second",
EX_CONFIG);
+ fatal("The janitor frequency must be at least once per second",
EX_CONFIG);
break;
+ case 'J':
+ /* Janitor delay before cleanup of a child */
+ child_mourning_time = atoi(optarg);
+ if(child_mourning_time < 1)
+ fatal("The janitor's mourning time interval must be at least 1
second",
+ EX_CONFIG);
+ break;
#ifdef HAVE_NETSNMP
case 'P': /* snmp AgentXPingInterval */
agentxpinginterval = atoi(optarg);
@@ -2068,13 +2112,19 @@
int j;
if (FD_ISSET(x, &rfds)) {
- r = read(x, &msg, sizeof(msg));
- if (r != sizeof(msg)) {
- syslog(LOG_ERR, "got incorrectly sized response from child:
%x", i);
+ while ((r = read_msg(x, &msg)) == 0)
+ process_msg(i, &msg);
+
+ if (r == 2) {
+ syslog(LOG_ERR,
+ "got incorrectly sized response from child: %x", i);
+ continue;
+ }
+ if (r < 0) {
+ syslog(LOG_ERR,
+ "error while receiving message from child %x: %m", i);
continue;
}
-
- process_msg(i, &msg);
}
if (Services[i].exec &&
diff -urNad git~/master/master.h git/master/master.h
--- git~/master/master.h 2010-01-16 19:21:09.000000000 -0200
+++ git/master/master.h 2010-01-16 19:28:14.289091714 -0200
@@ -45,6 +45,7 @@
extern struct service *Services;
extern int allocservices;
extern int nservices;
+void sighandler_setup(void);
/*
* Description of multiple address family support from