fielding 97/10/06 08:05:34
Modified: src/main http_main.c Log: Changes in revision 1.218 caused reclaim_child_processes() to be too quick in checking for child exit and sending additional SIGTERM and SIGKILL signals. This is not a problem for systems with fast process context switching, but Solaris complains bitterly. The fix is to insert a few quiet checks of the child status so that the exponential backoff on the wait time becomes significant before sending additional signals and complaining to the error log. Reviewed by: Dean Gaudet Revision Changes Path 1.233 +23 -16 apachen/src/main/http_main.c Index: http_main.c =================================================================== RCS file: /export/home/cvs/apachen/src/main/http_main.c,v retrieving revision 1.232 retrieving revision 1.233 diff -u -r1.232 -r1.233 --- http_main.c 1997/10/05 08:12:45 1.232 +++ http_main.c 1997/10/06 15:05:32 1.233 @@ -1710,11 +1710,11 @@ return -1; } -static void reclaim_child_processes(int start_tries) +static void reclaim_child_processes(int terminate) { #ifndef MULTITHREAD int i, status; - long int waittime = 4096; /* in usecs */ + long int waittime = 1024 * 16; /* in usecs */ struct timeval tv; int waitret, tries; int not_dead_yet; @@ -1724,17 +1724,14 @@ sync_scoreboard_image(); - tries = 0; - for (tries = start_tries; tries < 4; ++tries) { + for (tries = terminate ? 4 : 1; tries <= 9; ++tries) { /* don't want to hold up progress any more than * necessary, but we need to allow children a few moments to exit. - * delay with an exponential backoff. - * Currently set for a maximum wait of a bit over - * four seconds. + * Set delay with an exponential backoff. 
*/ tv.tv_sec = waittime / 1000000; tv.tv_usec = waittime % 1000000; - waittime = waittime * 2; + waittime = waittime * 4; ap_select(0, NULL, NULL, NULL, &tv); /* now see who is done */ @@ -1752,28 +1749,38 @@ } ++not_dead_yet; switch (tries) { - case 1: + case 1: /* 16ms */ + case 2: /* 82ms */ + break; + case 3: /* 344ms */ /* perhaps it missed the SIGHUP, lets try again */ - aplog_error(APLOG_MARK, APLOG_NOERRNO|APLOG_ERR, server_conf, + aplog_error(APLOG_MARK, APLOG_NOERRNO|APLOG_WARNING, + server_conf, "child process %d did not exit, sending another SIGHUP", pid); kill(pid, SIGHUP); + waittime = 1024 * 16; break; - case 2: + case 4: /* 16ms */ + case 5: /* 82ms */ + case 6: /* 344ms */ + break; + case 7: /* 1.4sec */ /* ok, now it's being annoying */ - aplog_error(APLOG_MARK, APLOG_NOERRNO|APLOG_ERR, server_conf, + aplog_error(APLOG_MARK, APLOG_NOERRNO|APLOG_WARNING, + server_conf, "child process %d still did not exit, sending a SIGTERM", pid); kill(pid, SIGTERM); break; - case 3: + case 8: /* 6 sec */ /* die child scum */ aplog_error(APLOG_MARK, APLOG_NOERRNO|APLOG_ERR, server_conf, "child process %d still did not exit, sending a SIGKILL", pid); kill(pid, SIGKILL); break; - case 4: + case 9: /* 14 sec */ /* gave it our best shot, but alas... 
If this really * is a child we are trying to kill and it really hasn't * exited, we will likely fail to bind to the port @@ -3497,7 +3504,7 @@ if (ap_killpg(pgrp, SIGTERM) < 0) { aplog_error(APLOG_MARK, APLOG_WARNING, server_conf, "killpg SIGTERM"); } - reclaim_child_processes(2); /* Start with SIGTERM */ + reclaim_child_processes(1); /* Start with SIGTERM */ aplog_error(APLOG_MARK, APLOG_NOERRNO|APLOG_NOTICE, server_conf, "httpd: caught SIGTERM, shutting down"); @@ -3554,7 +3561,7 @@ if (ap_killpg(pgrp, SIGHUP) < 0) { aplog_error(APLOG_MARK, APLOG_WARNING, server_conf, "killpg SIGHUP"); } - reclaim_child_processes(1); /* Not when just starting up */ + reclaim_child_processes(0); /* Not when just starting up */ aplog_error(APLOG_MARK, APLOG_NOERRNO|APLOG_NOTICE, server_conf, "SIGHUP received. Attempting to restart"); }