(some background is in the thread starting with
http://mail-archives.apache.org/mod_mbox/httpd-dev/201011.mbox/%[email protected]%3E)
end result of this patch:
a module can implement the end-generation hook to find out when the
last mpm child of a particular generation has exited
a pool seems to be the natural feature to help implement resources
with the lifetime of a generation, but as a generation lasts beyond
the DSO load/unload cycle during graceful restart, I fear an endless
stream of module bugs caused by generation pool cleanup functions
disappearing with the DSO unload
implementation:
each MPM runs a mpm-child-status hook when an MPM child is created or exits
core implements that hook and keeps counters of processes for each
active generation; when the count goes to zero, it runs the
end-generation hook with the generation number as a parameter
(that hook is also available to modules which might want a simple way
to track child process counts for other reasons)
cgid is also in the patch as a proof of concept of the end-generation
hook; the generation is part of the socket name, and a generation's
daemon process isn't terminated until the last child of that
generation exits; cgid needs a bit more work to cleanly handle this
though, and I haven't looked at other mods with graceful restart
issues
what's next?
- perhaps someone will confirm that the end-generation hook seems
usable, or otherwise comment on the patch
- extend the use of the mpm-child-status hook to cover all paths in
all bundled MPMs
- get motivation/time to really fix cgid
Index: server/mpm/prefork/prefork.c
===================================================================
--- server/mpm/prefork/prefork.c (revision 1075522)
+++ server/mpm/prefork/prefork.c (working copy)
@@ -766,6 +766,7 @@
}
ap_scoreboard_image->parent[slot].pid = pid;
+ ap_run_mpm_child_status(ap_server_conf, pid, my_generation, slot,
MPM_CHILD_STARTED);
return 0;
}
@@ -998,6 +999,9 @@
/* non-fatal death... note that it's gone in the scoreboard. */
child_slot = ap_find_child_by_pid(&pid);
if (child_slot >= 0) {
+ ap_run_mpm_child_status(ap_server_conf,
ap_scoreboard_image->parent[child_slot].pid,
+
ap_scoreboard_image->parent[child_slot].generation,
+ child_slot, MPM_CHILD_EXITED);
(void) ap_update_child_status_from_indexes(child_slot, 0,
SERVER_DEAD,
(request_rec *)
NULL);
if (processed_status == APEXIT_CHILDSICK) {
Index: server/core.c
===================================================================
--- server/core.c (revision 1075522)
+++ server/core.c (working copy)
@@ -4386,6 +4386,7 @@
APR_OPTIONAL_HOOK(proxy, create_req, core_create_proxy_req, NULL, NULL,
APR_HOOK_MIDDLE);
ap_hook_pre_mpm(ap_create_scoreboard, NULL, NULL, APR_HOOK_MIDDLE);
+ ap_hook_mpm_child_status(ap_core_mpm_child_status, NULL, NULL,
APR_HOOK_MIDDLE);
/* register the core's insert_filter hook and register core-provided
* filters
Index: server/mpm_common.c
===================================================================
--- server/mpm_common.c (revision 1075522)
+++ server/mpm_common.c (working copy)
@@ -68,6 +68,8 @@
APR_HOOK_LINK(mpm_note_child_killed)
APR_HOOK_LINK(mpm_register_timed_callback)
APR_HOOK_LINK(mpm_get_name)
+ APR_HOOK_LINK(end_generation)
+ APR_HOOK_LINK(mpm_child_status)
)
AP_IMPLEMENT_HOOK_RUN_ALL(int, fatal_exception,
(ap_exception_info_t *ei), (ei), OK, DECLINED)
@@ -80,6 +82,8 @@
APR_HOOK_LINK(mpm_note_child_killed)
APR_HOOK_LINK(mpm_register_timed_callback)
APR_HOOK_LINK(mpm_get_name)
+ APR_HOOK_LINK(end_generation)
+ APR_HOOK_LINK(mpm_child_status)
)
#endif
AP_IMPLEMENT_HOOK_RUN_ALL(int, monitor,
@@ -102,7 +106,23 @@
AP_IMPLEMENT_HOOK_RUN_FIRST(const char *, mpm_get_name,
(void),
(), NULL)
+AP_IMPLEMENT_HOOK_VOID(end_generation,
+ (server_rec *s, ap_generation_t gen),
+ (s, gen))
+AP_IMPLEMENT_HOOK_VOID(mpm_child_status,
+ (server_rec *s, pid_t pid, ap_generation_t gen, int
slot, mpm_child_status status),
+ (s,pid,gen,slot,status))
+typedef struct mpm_gen_info_t {
+ APR_RING_ENTRY(mpm_gen_info_t) link;
+ int gen; /* which gen? */
+ int active; /* number of active processes */
+} mpm_gen_info_t;
+
+APR_RING_HEAD(mpm_gen_info_head_t, mpm_gen_info_t);
+static struct mpm_gen_info_head_t geninfo, unused_geninfo;
+static int gen_head_init; /* yuck */
+
/* number of calls to wait_or_timeout between writable probes */
#ifndef INTERVAL_OF_WRITABLE_PROBES
#define INTERVAL_OF_WRITABLE_PROBES 10
@@ -366,6 +386,63 @@
return ap_run_mpm_note_child_killed(childnum);
}
+AP_DECLARE(void) ap_core_mpm_child_status(server_rec *s, pid_t pid,
ap_generation_t gen, int slot, mpm_child_status status)
+{
+ mpm_gen_info_t *cur;
+
+ ap_log_error(APLOG_MARK, APLOG_TRACE4, 0, s,
+ "mpm child %" APR_PID_T_FMT " (gen %d), status %d",
+ pid, gen, status);
+
+ if (!gen_head_init) { /* where to run this? */
+ gen_head_init = 1;
+ APR_RING_INIT(&geninfo, mpm_gen_info_t, link);
+ APR_RING_INIT(&unused_geninfo, mpm_gen_info_t, link);
+ }
+
+ cur = APR_RING_FIRST(&geninfo);
+ while (cur != APR_RING_SENTINEL(&geninfo, mpm_gen_info_t, link) &&
+ cur->gen != gen) {
+ cur = APR_RING_NEXT(cur, link);
+ }
+
+ switch(status) {
+ case MPM_CHILD_STARTED:
+ if (cur == APR_RING_SENTINEL(&geninfo, mpm_gen_info_t, link)) {
+ /* first child for this generation */
+ if (!APR_RING_EMPTY(&unused_geninfo, mpm_gen_info_t, link)) {
+ cur = APR_RING_FIRST(&unused_geninfo);
+ APR_RING_REMOVE(cur, link);
+ }
+ else {
+ cur = apr_pcalloc(s->process->pool, sizeof *cur);
+ }
+ cur->gen = gen;
+ APR_RING_ELEM_INIT(cur, link);
+ APR_RING_INSERT_HEAD(&geninfo, cur, mpm_gen_info_t, link);
+ }
+ ++cur->active;
+ break;
+ case MPM_CHILD_EXITED:
+ if (cur == APR_RING_SENTINEL(&geninfo, mpm_gen_info_t, link)) {
+ ap_log_error(APLOG_MARK, APLOG_ERR, 0, s,
+ "no record of generation %d of exiting child %"
APR_PID_T_FMT,
+ gen, pid);
+ }
+ else {
+ --cur->active;
+ if (!cur->active) {
+ ap_log_error(APLOG_MARK, APLOG_TRACE3, 0, ap_server_conf,
+ "running end-generation-%d hooks", gen);
+ ap_run_end_generation(ap_server_conf, gen);
+ APR_RING_REMOVE(cur, link);
+ APR_RING_INSERT_HEAD(&unused_geninfo, cur, mpm_gen_info_t,
link);
+ }
+ }
+ break;
+ }
+}
+
AP_DECLARE(apr_status_t) ap_mpm_register_timed_callback(apr_time_t t,
ap_mpm_callback_fn_t *cbfn, void *baton)
{
return ap_run_mpm_register_timed_callback(t, cbfn, baton);
Index: modules/generators/mod_cgid.c
===================================================================
--- modules/generators/mod_cgid.c (revision 1075522)
+++ modules/generators/mod_cgid.c (working copy)
@@ -91,8 +91,23 @@
static struct sockaddr_un *server_addr;
static apr_socklen_t server_addr_len;
static pid_t parent_pid;
+static ap_generation_t generation;
static ap_unix_identity_t empty_ugid = { (uid_t)-1, (gid_t)-1, -1 };
+typedef struct cgid_geninfo_t {
+ int in_use;
+ pid_t pid;
+ ap_generation_t gen;
+ char sockname[1024];
+} cgid_geninfo_t;
+
+typedef struct cgid_retained_t {
+ apr_pool_t *p;
+ cgid_geninfo_t geninfo[20];
+} cgid_retained_t;
+
+static cgid_retained_t *retained;
+
/* The APR other-child API doesn't tell us how the daemon exited
* (SIGSEGV vs. exit(1)). The other-child maintenance function
* needs to decide whether to restart the daemon after a failure
@@ -263,13 +278,92 @@
return av;
}
+static void cleanup_daemon(pid_t pid, ap_generation_t gen, const char
*sockname)
+{
+ ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
+ "terminating daemon process %" APR_PID_T_FMT
+ " of gen %d", pid, gen);
+
+ /* send signal to daemon telling it to die */
+ if (kill(pid, SIGHUP) < 0) {
+ ap_log_error(APLOG_MARK, APLOG_INFO, errno, ap_server_conf,
+ "couldn't kill daemon process %" APR_PID_T_FMT
+ " of gen %d", pid, gen);
+ }
+
+ /* Remove the cgi socket, we must do it here in order to try and
+ * guarantee the same permissions as when the socket was created.
+ */
+ if (unlink(sockname) < 0 && errno != ENOENT) {
+ ap_log_error(APLOG_MARK, APLOG_ERR, errno, ap_server_conf,
+ "Couldn't unlink unix domain socket %s",
+ sockname);
+ }
+}
+
+static void defer_cleanup(pid_t pid, const char *sockname)
+{
+ int i;
+
+ ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
+ "deferring cleanup of daemon %" APR_PID_T_FMT
+ " during graceful restart for gen %d", pid,
+ generation);
+ for (i = 0; i < 20 && retained->geninfo[i].in_use; i++) {
+ }
+
+ if (i < 20) {
+ cgid_geninfo_t *geninfo = & retained->geninfo[i];
+
+ geninfo->in_use = 1;
+ geninfo->pid = pid;
+ geninfo->gen = generation;
+ apr_cpystrn(geninfo->sockname, sockname,
+ sizeof geninfo->sockname);
+ }
+ else {
+ ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf,
+ "cleaning up daemon %" APR_PID_T_FMT
+ " of gen %d prematurely", pid, generation);
+ cleanup_daemon(pid, generation, sockname);
+ }
+}
+
+static void cgid_end_generation(server_rec *s, ap_generation_t gen)
+{
+ int i;
+
+ ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, s, "cgid_end_generation(%d)",
gen);
+
+ for (i = 0; i < 20 &&
+ !(retained->geninfo[i].in_use && retained->geninfo[i].gen ==
gen); i++) {
+ }
+
+ if (i < 20) {
+ cgid_geninfo_t *geninfo = & retained->geninfo[i];
+
+ geninfo->in_use = 0;
+
+ cleanup_daemon(geninfo->pid, geninfo->gen, geninfo->sockname);
+ }
+ else {
+ ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf,
+ "couldn't find cgid daemon for generation %d", gen);
+ }
+}
+
#if APR_HAS_OTHER_CHILD
static void cgid_maint(int reason, void *data, apr_wait_t status)
{
apr_proc_t *proc = data;
+ apr_status_t rv;
int mpm_state;
int stopping;
+ ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
+ "cgid_maint(%d)",
+ reason);
+
switch (reason) {
case APR_OC_REASON_DEATH:
apr_proc_other_child_unregister(data);
@@ -308,16 +402,23 @@
/* we get here when pcgi is cleaned up; pcgi gets cleaned
* up when pconf gets cleaned up
*/
- kill(proc->pid, SIGHUP); /* send signal to daemon telling it to
die */
-
- /* Remove the cgi socket, we must do it here in order to try and
- * guarantee the same permissions as when the socket was created.
- */
- if (unlink(sockname) < 0 && errno != ENOENT) {
- ap_log_error(APLOG_MARK, APLOG_ERR, errno, ap_server_conf,
- "Couldn't unlink unix domain socket %s",
- sockname);
+ rv = ap_mpm_query(AP_MPMQ_MPM_STATE, &mpm_state);
+ ap_log_error(APLOG_MARK, APLOG_TRACE5, rv, ap_server_conf,
+ "cgid_maint(UNREGISTER), mpm state %d",
+ rv == APR_SUCCESS ? mpm_state : -1);
+ if (rv == APR_SUCCESS) { /* stop or regular restart */
+ cleanup_daemon(proc->pid, generation, sockname);
}
+ else {
+ defer_cleanup(proc->pid, sockname);
+ /* whether or not you SHOULD be able to kill a pool/proc
association,
+ * APR currently doesn't allow it; Jeff has an APR patch which
+ * searches the list and removes the association for
+ * APR_KILL_NEVER (which otherwise seems to be a useless
+ * parameter to this function)
+ */
+ apr_pool_note_subprocess(retained->p, proc, APR_KILL_NEVER);
+ }
break;
}
}
@@ -852,6 +953,8 @@
apr_hash_set(script_hash, key, sizeof(cgid_req.conn_id),
(void *)((long)procnew->pid));
}
+ ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
+ "cgid daemon pid %" APR_PID_T_FMT " exiting", getpid());
return -1; /* should be <= 0 to distinguish from startup errors */
}
@@ -874,9 +977,12 @@
procnew->pid = daemon_pid;
procnew->err = procnew->in = procnew->out = NULL;
apr_pool_note_subprocess(p, procnew, APR_KILL_AFTER_TIMEOUT);
+ retained->p = p;
#if APR_HAS_OTHER_CHILD
apr_proc_other_child_register(procnew, cgid_maint, procnew, NULL, p);
#endif
+ ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, main_server,
+ "cgid daemon process %" APR_PID_T_FMT " started", daemon_pid);
return OK;
}
@@ -906,9 +1012,12 @@
procnew->err = procnew->in = procnew->out = NULL;
apr_pool_userdata_set((const void *)procnew, userdata_key,
apr_pool_cleanup_null, main_server->process->pool);
+
+ retained = ap_retained_data_create("cgid-retained", sizeof(*retained));
}
else {
procnew = data;
+ retained = ap_retained_data_get("cgid-retained");
}
if (ap_state_query(AP_SQ_MAIN_STATE) != AP_SQ_MS_CREATE_PRE_CONFIG) {
@@ -927,6 +1036,11 @@
}
sockname = tmp_sockname;
+ /* hackola */
+ generation = 999;
+ ap_mpm_query(AP_MPMQ_GENERATION, &generation);
+ sockname = apr_psprintf(p, "%s.%d", sockname, generation); /* doesn't
check for truncation */
+
server_addr_len = APR_OFFSETOF(struct sockaddr_un, sun_path) +
strlen(sockname);
server_addr = (struct sockaddr_un *)apr_palloc(p, server_addr_len + 1);
server_addr->sun_family = AF_UNIX;
@@ -1876,6 +1990,7 @@
ap_hook_pre_config(cgid_pre_config, NULL, NULL, APR_HOOK_MIDDLE);
ap_hook_post_config(cgid_init, aszPre, NULL, APR_HOOK_MIDDLE);
ap_hook_handler(cgid_handler, NULL, NULL, APR_HOOK_MIDDLE);
+ ap_hook_end_generation(cgid_end_generation, NULL, NULL, APR_HOOK_MIDDLE);
}
AP_DECLARE_MODULE(cgid) = {
Index: include/mpm_common.h
===================================================================
--- include/mpm_common.h (revision 1075522)
+++ include/mpm_common.h (working copy)
@@ -39,6 +39,7 @@
#include "ap_config.h"
#include "ap_mpm.h"
+#include "scoreboard.h"
#if APR_HAVE_NETINET_TCP_H
#include <netinet/tcp.h> /* for TCP_NODELAY */
@@ -313,6 +314,19 @@
AP_DECLARE_HOOK(int,monitor,(apr_pool_t *p, server_rec *s))
+typedef enum mpm_child_state {
+ MPM_CHILD_STARTED,
+ MPM_CHILD_EXITED
+} mpm_child_status;
+
+AP_DECLARE_HOOK(void,mpm_child_status,(server_rec *s, pid_t pid,
ap_generation_t gen, int slot,
+ mpm_child_status status))
+/* core's implementation of mpm_child_status hook */
+AP_DECLARE(void) ap_core_mpm_child_status(server_rec *s, pid_t pid,
ap_generation_t gen,
+ int slot, mpm_child_status status);
+
+AP_DECLARE_HOOK(void,end_generation,(server_rec *s, ap_generation_t gen))
+
/* register modules that undertake to manage system security */
AP_DECLARE(int) ap_sys_privileges_handlers(int inc);
AP_DECLARE_HOOK(int, drop_privileges, (apr_pool_t * pchild, server_rec * s))