(some background is in the thread starting with
http://mail-archives.apache.org/mod_mbox/httpd-dev/201011.mbox/%[email protected]%3E)

end result of this patch:

a module can implement the end-generation hook to find out when the
last mpm child of a particular generation has exited

a pool seems to be the natural feature to help implement resources
with the lifetime of a generation, but as a generation lasts beyond
the DSO load/unload cycle during graceful restart, I fear an endless
stream of module bugs caused by generation pool cleanup functions
disappearing with the DSO unload

implementation:

each MPM runs a mpm-child-status hook when an MPM child is created or exits

core implements that hook and keeps counters of processes for each
active generation; when the count goes to zero, it runs the
end-generation hook with the generation number as a parameter
(that hook is also available to modules which might want a simple way
to track child process counts for other reasons)

cgid is also in the patch as a proof of concept of the end-generation
hook; the generation is part of the socket name, and a generation's
daemon process isn't terminated until the last child of that
generation exits; cgid needs a bit more work to cleanly handle this
though, and I haven't looked at other mods with graceful restart
issues

what's next?

- perhaps someone will confirm that the end-generation hook seems
usable, or otherwise comment on the patch
- extend the use of the mpm-child-status hook to cover all paths in
all bundled MPMs
- get motivation/time to really fix cgid
Index: server/mpm/prefork/prefork.c
===================================================================
--- server/mpm/prefork/prefork.c        (revision 1075522)
+++ server/mpm/prefork/prefork.c        (working copy)
@@ -766,6 +766,7 @@
     }
 
     ap_scoreboard_image->parent[slot].pid = pid;
+    ap_run_mpm_child_status(ap_server_conf, pid, my_generation, slot, 
MPM_CHILD_STARTED);
 
     return 0;
 }
@@ -998,6 +999,9 @@
             /* non-fatal death... note that it's gone in the scoreboard. */
             child_slot = ap_find_child_by_pid(&pid);
             if (child_slot >= 0) {
+                ap_run_mpm_child_status(ap_server_conf, 
ap_scoreboard_image->parent[child_slot].pid,
+                                        
ap_scoreboard_image->parent[child_slot].generation,
+                                        child_slot, MPM_CHILD_EXITED);
                 (void) ap_update_child_status_from_indexes(child_slot, 0, 
SERVER_DEAD,
                                                            (request_rec *) 
NULL);
                 if (processed_status == APEXIT_CHILDSICK) {
Index: server/core.c
===================================================================
--- server/core.c       (revision 1075522)
+++ server/core.c       (working copy)
@@ -4386,6 +4386,7 @@
     APR_OPTIONAL_HOOK(proxy, create_req, core_create_proxy_req, NULL, NULL,
                       APR_HOOK_MIDDLE);
     ap_hook_pre_mpm(ap_create_scoreboard, NULL, NULL, APR_HOOK_MIDDLE);
+    ap_hook_mpm_child_status(ap_core_mpm_child_status, NULL, NULL, 
APR_HOOK_MIDDLE);
 
     /* register the core's insert_filter hook and register core-provided
      * filters
Index: server/mpm_common.c
===================================================================
--- server/mpm_common.c (revision 1075522)
+++ server/mpm_common.c (working copy)
@@ -68,6 +68,8 @@
     APR_HOOK_LINK(mpm_note_child_killed)
     APR_HOOK_LINK(mpm_register_timed_callback)
     APR_HOOK_LINK(mpm_get_name)
+    APR_HOOK_LINK(end_generation)
+    APR_HOOK_LINK(mpm_child_status)
 )
 AP_IMPLEMENT_HOOK_RUN_ALL(int, fatal_exception,
                           (ap_exception_info_t *ei), (ei), OK, DECLINED)
@@ -80,6 +82,8 @@
     APR_HOOK_LINK(mpm_note_child_killed)
     APR_HOOK_LINK(mpm_register_timed_callback)
     APR_HOOK_LINK(mpm_get_name)
+    APR_HOOK_LINK(end_generation)
+    APR_HOOK_LINK(mpm_child_status)
 )
 #endif
 AP_IMPLEMENT_HOOK_RUN_ALL(int, monitor,
@@ -102,7 +106,23 @@
 AP_IMPLEMENT_HOOK_RUN_FIRST(const char *, mpm_get_name,
                             (void),
                             (), NULL)
+AP_IMPLEMENT_HOOK_VOID(end_generation,
+                       (server_rec *s, ap_generation_t gen),
+                       (s, gen))
+AP_IMPLEMENT_HOOK_VOID(mpm_child_status,
+                       (server_rec *s, pid_t pid, ap_generation_t gen, int 
slot, mpm_child_status status),
+                       (s,pid,gen,slot,status))
 
+typedef struct mpm_gen_info_t {
+    APR_RING_ENTRY(mpm_gen_info_t) link;
+    int gen;          /* which gen? */
+    int active;       /* number of active processes */
+} mpm_gen_info_t;
+
+APR_RING_HEAD(mpm_gen_info_head_t, mpm_gen_info_t);
+static struct mpm_gen_info_head_t geninfo, unused_geninfo;
+static int gen_head_init; /* yuck */
+
 /* number of calls to wait_or_timeout between writable probes */
 #ifndef INTERVAL_OF_WRITABLE_PROBES
 #define INTERVAL_OF_WRITABLE_PROBES 10
@@ -366,6 +386,63 @@
     return ap_run_mpm_note_child_killed(childnum);
 }
 
+AP_DECLARE(void) ap_core_mpm_child_status(server_rec *s, pid_t pid, 
ap_generation_t gen, int slot, mpm_child_status status)
+{
+    mpm_gen_info_t *cur;
+
+    ap_log_error(APLOG_MARK, APLOG_TRACE4, 0, s,
+                 "mpm child %" APR_PID_T_FMT " (gen %d), status %d",
+                 pid, gen, status);
+
+    if (!gen_head_init) { /* where to run this? */
+        gen_head_init = 1;
+        APR_RING_INIT(&geninfo, mpm_gen_info_t, link);
+        APR_RING_INIT(&unused_geninfo, mpm_gen_info_t, link);
+    }
+
+    cur = APR_RING_FIRST(&geninfo);
+    while (cur != APR_RING_SENTINEL(&geninfo, mpm_gen_info_t, link) &&
+           cur->gen != gen) {
+        cur = APR_RING_NEXT(cur, link);
+    }
+
+    switch(status) {
+    case MPM_CHILD_STARTED:
+        if (cur == APR_RING_SENTINEL(&geninfo, mpm_gen_info_t, link)) {
+            /* first child for this generation */
+            if (!APR_RING_EMPTY(&unused_geninfo, mpm_gen_info_t, link)) {
+                cur = APR_RING_FIRST(&unused_geninfo);
+                APR_RING_REMOVE(cur, link);
+            }
+            else {
+                cur = apr_pcalloc(s->process->pool, sizeof *cur);
+            }
+            cur->gen = gen;
+            APR_RING_ELEM_INIT(cur, link);
+            APR_RING_INSERT_HEAD(&geninfo, cur, mpm_gen_info_t, link);
+        }
+        ++cur->active;
+        break;
+    case MPM_CHILD_EXITED:
+        if (cur == APR_RING_SENTINEL(&geninfo, mpm_gen_info_t, link)) {
+            ap_log_error(APLOG_MARK, APLOG_ERR, 0, s,
+                         "no record of generation %d of exiting child %" 
APR_PID_T_FMT,
+                         gen, pid);
+        }
+        else {
+            --cur->active;
+            if (!cur->active) {
+                ap_log_error(APLOG_MARK, APLOG_TRACE3, 0, ap_server_conf,
+                             "running end-generation-%d hooks", gen);
+                ap_run_end_generation(ap_server_conf, gen);
+                APR_RING_REMOVE(cur, link);
+                APR_RING_INSERT_HEAD(&unused_geninfo, cur, mpm_gen_info_t, 
link);
+            }
+        }
+        break;
+    }
+}
+
 AP_DECLARE(apr_status_t) ap_mpm_register_timed_callback(apr_time_t t, 
ap_mpm_callback_fn_t *cbfn, void *baton)
 {
     return ap_run_mpm_register_timed_callback(t, cbfn, baton);
Index: modules/generators/mod_cgid.c
===================================================================
--- modules/generators/mod_cgid.c       (revision 1075522)
+++ modules/generators/mod_cgid.c       (working copy)
@@ -91,8 +91,23 @@
 static struct sockaddr_un *server_addr;
 static apr_socklen_t server_addr_len;
 static pid_t parent_pid;
+static ap_generation_t generation;
 static ap_unix_identity_t empty_ugid = { (uid_t)-1, (gid_t)-1, -1 };
 
+typedef struct cgid_geninfo_t {
+    int in_use;
+    pid_t pid;
+    ap_generation_t gen;
+    char sockname[1024];
+} cgid_geninfo_t;
+
+typedef struct cgid_retained_t {
+    apr_pool_t *p;
+    cgid_geninfo_t geninfo[20];
+} cgid_retained_t;
+
+static cgid_retained_t *retained;
+
 /* The APR other-child API doesn't tell us how the daemon exited
  * (SIGSEGV vs. exit(1)).  The other-child maintenance function
  * needs to decide whether to restart the daemon after a failure
@@ -263,13 +278,92 @@
     return av;
 }
 
+static void cleanup_daemon(pid_t pid, ap_generation_t gen, const char 
*sockname)
+{
+    ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
+                 "terminating daemon process %" APR_PID_T_FMT
+                 " of gen %d", pid, gen);
+
+    /* send signal to daemon telling it to die */
+    if (kill(pid, SIGHUP) < 0) {
+        ap_log_error(APLOG_MARK, APLOG_INFO, errno, ap_server_conf,
+                     "couldn't kill daemon process %" APR_PID_T_FMT
+                     " of gen %d", pid, gen);
+    }
+
+    /* Remove the cgi socket, we must do it here in order to try and
+     * guarantee the same permissions as when the socket was created.
+     */
+    if (unlink(sockname) < 0 && errno != ENOENT) {
+        ap_log_error(APLOG_MARK, APLOG_ERR, errno, ap_server_conf,
+                     "Couldn't unlink unix domain socket %s",
+                     sockname);
+    }
+}
+
+static void defer_cleanup(pid_t pid, const char *sockname)
+{
+    int i;
+    
+    ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
+                 "deferring cleanup of daemon %" APR_PID_T_FMT
+                 " during graceful restart for gen %d", pid,
+                 generation);
+    for (i = 0; i < 20 && retained->geninfo[i].in_use; i++) {
+    }
+
+    if (i < 20) {
+        cgid_geninfo_t *geninfo = & retained->geninfo[i];
+
+        geninfo->in_use = 1;
+        geninfo->pid = pid;
+        geninfo->gen = generation;
+        apr_cpystrn(geninfo->sockname, sockname,
+                    sizeof geninfo->sockname);
+    }
+    else {
+        ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf,
+                     "cleaning up daemon %" APR_PID_T_FMT
+                     " of gen %d prematurely", pid, generation);
+        cleanup_daemon(pid, generation, sockname);
+    }
+}
+
+static void cgid_end_generation(server_rec *s, ap_generation_t gen)
+{
+    int i;
+
+    ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, s, "cgid_end_generation(%d)", 
gen);
+
+    for (i = 0; i < 20 &&
+             !(retained->geninfo[i].in_use && retained->geninfo[i].gen == 
gen); i++) {
+    }
+
+    if (i < 20) {
+        cgid_geninfo_t *geninfo = & retained->geninfo[i];
+
+        geninfo->in_use = 0;
+        
+        cleanup_daemon(geninfo->pid, geninfo->gen, geninfo->sockname);
+    }
+    else {
+        ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf,
+                     "couldn't find cgid daemon for generation %d", gen);
+    }
+}
+
 #if APR_HAS_OTHER_CHILD
 static void cgid_maint(int reason, void *data, apr_wait_t status)
 {
     apr_proc_t *proc = data;
+    apr_status_t rv;
     int mpm_state;
     int stopping;
 
+    ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
+                 "cgid_maint(%d)",
+                 reason);
+
     switch (reason) {
         case APR_OC_REASON_DEATH:
             apr_proc_other_child_unregister(data);
@@ -308,16 +402,23 @@
             /* we get here when pcgi is cleaned up; pcgi gets cleaned
              * up when pconf gets cleaned up
              */
-            kill(proc->pid, SIGHUP); /* send signal to daemon telling it to 
die */
-
-            /* Remove the cgi socket, we must do it here in order to try and
-             * guarantee the same permissions as when the socket was created.
-             */
-            if (unlink(sockname) < 0 && errno != ENOENT) {
-                ap_log_error(APLOG_MARK, APLOG_ERR, errno, ap_server_conf,
-                             "Couldn't unlink unix domain socket %s",
-                             sockname);
+            rv = ap_mpm_query(AP_MPMQ_MPM_STATE, &mpm_state);
+            ap_log_error(APLOG_MARK, APLOG_TRACE5, rv, ap_server_conf,
+                         "cgid_maint(UNREGISTER), mpm state %d",
+                         rv == APR_SUCCESS ? mpm_state : -1);
+            if (rv == APR_SUCCESS) { /* stop or regular restart */
+                cleanup_daemon(proc->pid, generation, sockname);
             }
+            else {
+                defer_cleanup(proc->pid, sockname);
+                /* whether or not you SHOULD be able to kill a pool/proc 
association,
+                 * APR currently doesn't allow it; Jeff has an APR patch which
+                 * searches the list and removes the association for
+                 * APR_KILL_NEVER (which otherwise seems to be a useless
+                 * parameter to this function)
+                 */
+                apr_pool_note_subprocess(retained->p, proc, APR_KILL_NEVER);
+            }
             break;
     }
 }
@@ -852,6 +953,8 @@
         apr_hash_set(script_hash, key, sizeof(cgid_req.conn_id),
                      (void *)((long)procnew->pid));
     }
+    ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
+                 "cgid daemon pid %" APR_PID_T_FMT " exiting", getpid());
     return -1; /* should be <= 0 to distinguish from startup errors */
 }
 
@@ -874,9 +977,12 @@
     procnew->pid = daemon_pid;
     procnew->err = procnew->in = procnew->out = NULL;
     apr_pool_note_subprocess(p, procnew, APR_KILL_AFTER_TIMEOUT);
+    retained->p = p;
 #if APR_HAS_OTHER_CHILD
     apr_proc_other_child_register(procnew, cgid_maint, procnew, NULL, p);
 #endif
+    ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, main_server,
+                 "cgid daemon process %" APR_PID_T_FMT " started", daemon_pid);
     return OK;
 }
 
@@ -906,9 +1012,12 @@
         procnew->err = procnew->in = procnew->out = NULL;
         apr_pool_userdata_set((const void *)procnew, userdata_key,
                      apr_pool_cleanup_null, main_server->process->pool);
+
+        retained = ap_retained_data_create("cgid-retained", sizeof(*retained));
     }
     else {
         procnew = data;
+        retained = ap_retained_data_get("cgid-retained");
     }
 
     if (ap_state_query(AP_SQ_MAIN_STATE) != AP_SQ_MS_CREATE_PRE_CONFIG) {
@@ -927,6 +1036,11 @@
         }
         sockname = tmp_sockname;
 
+        /* hackola */
+        generation = 999;
+        ap_mpm_query(AP_MPMQ_GENERATION, &generation);
+        sockname = apr_psprintf(p, "%s.%d", sockname, generation); /* doesn't 
check for truncation */
+
         server_addr_len = APR_OFFSETOF(struct sockaddr_un, sun_path) + 
strlen(sockname);
         server_addr = (struct sockaddr_un *)apr_palloc(p, server_addr_len + 1);
         server_addr->sun_family = AF_UNIX;
@@ -1876,6 +1990,7 @@
     ap_hook_pre_config(cgid_pre_config, NULL, NULL, APR_HOOK_MIDDLE);
     ap_hook_post_config(cgid_init, aszPre, NULL, APR_HOOK_MIDDLE);
     ap_hook_handler(cgid_handler, NULL, NULL, APR_HOOK_MIDDLE);
+    ap_hook_end_generation(cgid_end_generation, NULL, NULL, APR_HOOK_MIDDLE);
 }
 
 AP_DECLARE_MODULE(cgid) = {
Index: include/mpm_common.h
===================================================================
--- include/mpm_common.h        (revision 1075522)
+++ include/mpm_common.h        (working copy)
@@ -39,6 +39,7 @@
 
 #include "ap_config.h"
 #include "ap_mpm.h"
+#include "scoreboard.h"
 
 #if APR_HAVE_NETINET_TCP_H
 #include <netinet/tcp.h>    /* for TCP_NODELAY */
@@ -313,6 +314,19 @@
 
 AP_DECLARE_HOOK(int,monitor,(apr_pool_t *p, server_rec *s))
 
+typedef enum mpm_child_state {
+    MPM_CHILD_STARTED,
+    MPM_CHILD_EXITED
+} mpm_child_status;
+
+AP_DECLARE_HOOK(void,mpm_child_status,(server_rec *s, pid_t pid, 
ap_generation_t gen, int slot,
+                                       mpm_child_status status))
+/* core's implementation of mpm_child_status hook */
+AP_DECLARE(void) ap_core_mpm_child_status(server_rec *s, pid_t pid, 
ap_generation_t gen,
+                                          int slot, mpm_child_status status);
+
+AP_DECLARE_HOOK(void,end_generation,(server_rec *s, ap_generation_t gen))
+
 /* register modules that undertake to manage system security */
 AP_DECLARE(int) ap_sys_privileges_handlers(int inc);
 AP_DECLARE_HOOK(int, drop_privileges, (apr_pool_t * pchild, server_rec * s))

Reply via email to