On Wed, 2014-12-10 at 14:59 -0500, Stephen Gallagher wrote: > There are actually two bugs here: > > 1) When either the kill(SIGTERM) or kill(SIGKILL) commands returned > failure (for any reason), we would talloc_free(svc) which removed it > from being eligible for restart, resulting in the service never > starting again without an SSSD service restart. > > 2) There is a fairly wide race condition where it's possible for a > SIGKILL timer to "catch up" to the child exit handler between us > noticing the termination and actually restarting it. The race > happens because we re-enter the mainloop and add a restart > timeout to avoid a quick failure if we keep restarting due to a > transitory issue (the mt_svc object, and therefore the SIGKILL > timer, were never freed until we got to the actual service > restart). > > We can minimize this race by recording the timer_event for the > SIGKILL timeout in the mt_svc object. This way, if the process > exits via SIGTERM, we will immediately remove the timer for the > SIGKILL. > > This patch also removes the incorrect talloc_free(svc) calls on the > kill() failures and replaces them with an attempt to just start up > the service again and hope for the best. > > Resolves: > https://fedorahosted.org/sssd/ticket/2525 Just after sending this, I noticed another enhancement I could make to basically eliminate the potential race condition. Updated patch attached.
From cc21a1fe1dcb39e0cf5287a83a26b5880f1a701d Mon Sep 17 00:00:00 2001 From: Stephen Gallagher <[email protected]> Date: Wed, 10 Dec 2014 14:16:49 -0500 Subject: [PATCH] monitor: Service restart fixes There are actually two bugs here: 1) When either the kill(SIGTERM) or kill(SIGKILL) commands returned failure (for any reason), we would talloc_free(svc) which removed it from being eligible for restart, resulting in the service never starting again without an SSSD service restart. 2) There is a fairly wide race condition where it's possible for a SIGKILL timer to "catch up" to the child exit handler between us noticing the termination and actually restarting it. The race happens because we re-enter the mainloop and add a restart timeout to avoid a quick failure if we keep restarting due to a transitory issue (the mt_svc object, and therefore the SIGKILL timer, were never freed until we got to the actual service restart). We can minimize this race by recording the timer_event for the SIGKILL timeout in the mt_svc object. This way, if the process exits via SIGTERM, we will immediately remove the timer for the SIGKILL. Additionally, we'll catch the special-case of an ESRCH response from the kill(SIGKILL) and assume that it means that the process has exited. The only other two possible errors are * EINVAL: (an invalid signal was specified) - This should be impossible, obviously. * EPERM: This process doesn't have permission to send signals to this PID. If this happens, it's either an SELinux bug or else the process has terminated and a new process that SSSD doesn't control has taken the ID over. So in the incredibly unlikely case that one of those occurs, we'll just go ahead and try to start a new process. This patch also removes the incorrect talloc_free(svc) calls on the kill() failures and replaces them with an attempt to just start up the service again and hope for the best. Resolves: https://fedorahosted.org/sssd/ticket/2525 --- src/monitor/monitor.c | 94 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 20 deletions(-) diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c index c6834a1159b393bedd7251d9c3d28d3326cdc547..9a6f58d67e7d5a05bc01803bf86bb959f6ede70e 100644 --- a/src/monitor/monitor.c +++ b/src/monitor/monitor.c @@ -117,10 +117,12 @@ struct mt_svc { pid_t pid; int ping_time; int kill_time; + struct tevent_timer *kill_timer; + bool svc_started; int restarts; time_t last_restart; int failed_pongs; @@ -586,49 +588,50 @@ static void set_tasks_checker(struct mt_svc *svc) /* FIXME: shutdown ? */ } svc->ping_ev = te; } +static void monitor_restart_service(struct mt_svc *svc); static void mt_svc_sigkill(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *ptr); static int monitor_kill_service (struct mt_svc *svc) { int ret; struct timeval tv; - struct tevent_timer *te; ret = kill(svc->pid, SIGTERM); if (ret == -1) { ret = errno; DEBUG(SSSDBG_FATAL_FAILURE, "Sending signal to child (%s:%d) failed: [%d]: %s! " "Ignore and pretend child is dead.\n", svc->name, svc->pid, ret, strerror(ret)); - goto done; + /* The only thing we can try here is to launch a new process + * and hope that it works. + */ + monitor_restart_service(svc); + return EOK; } /* Set up a timer to send SIGKILL if this process * doesn't exit within sixty seconds */ tv = tevent_timeval_current_ofs(svc->kill_time, 0); - te = tevent_add_timer(svc->mt_ctx->ev, svc, tv, mt_svc_sigkill, svc); - if (te == NULL) { + svc->kill_timer = tevent_add_timer(svc->mt_ctx->ev, + svc, + tv, + mt_svc_sigkill, + svc); + if (svc->kill_timer == NULL) { /* Nothing much we can do */ - ret = ENOMEM; DEBUG(SSSDBG_CRIT_FAILURE, "Failed to allocate timed event: mt_svc_sigkill.\n"); - goto done; + /* We'll just have to hope that the SIGTERM succeeds */ } - ret = EOK; - -done: - if (ret != EOK) { - talloc_free(svc); - } - return ret; + return EOK; } static void mt_svc_sigkill(struct tevent_context *ev, struct tevent_timer *te, struct timeval t, void *ptr) @@ -643,16 +646,39 @@ static void mt_svc_sigkill(struct tevent_context *ev, "[%s][%d] is not responding to SIGTERM. Sending SIGKILL.\n", svc->name, svc->pid); ret = kill(svc->pid, SIGKILL); if (ret != EOK) { + ret = errno; DEBUG(SSSDBG_FATAL_FAILURE, "Sending signal to child (%s:%d) failed! " "Ignore and pretend child is dead.\n", svc->name, svc->pid); - talloc_free(svc); + + if (ret == ESRCH) { + /* The process doesn't exist + * This most likely means we hit a race where + * the SIGTERM concluded just after the timer + * fired but before we called kill() here. + * We'll just do nothing, since the + * mt_svc_exit_handler() should be doing the + * necessary work. + */ + return; + } + + /* Something went really wrong. + * The only thing we can try here is to launch a new process + * and hope that it works. + */ + monitor_restart_service(svc); } + + /* The process should terminate immediately and then be + * restarted by the mt_svc_exit_handler() + */ + return; } static void reload_reply(DBusPendingCall *pending, void *data) { DBusMessage *reply; @@ -2697,15 +2723,11 @@ static void mt_svc_restart(struct tevent_context *ev, } static void mt_svc_exit_handler(int pid, int wait_status, void *pvt) { struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc); - struct mt_ctx *mt_ctx = svc->mt_ctx; - time_t now = time(NULL); - struct tevent_timer *te; - struct timeval tv; - int restart_delay; + if (WIFEXITED(wait_status)) { DEBUG(SSSDBG_OP_FAILURE, "Child [%s] exited with code [%d]\n", svc->name, WEXITSTATUS(wait_status)); @@ -2723,21 +2745,53 @@ static void mt_svc_exit_handler(int pid, int wait_status, void *pvt) * call to the SIGCHLD handler */ return; } + /* Clear the kill_timer so we don't try to SIGKILL it after it's + * already gone. + */ + talloc_zfree(svc->kill_timer); + + /* Check the number of restart tries and relaunch the service */ + monitor_restart_service(svc); + + return; +} + +static void monitor_restart_service(struct mt_svc *svc) +{ + struct mt_ctx *mt_ctx = svc->mt_ctx; + int restart_delay; + time_t now = time(NULL); + struct tevent_timer *te; + struct timeval tv; + + /* Handle the actual checks for how many times to restart this + * service before giving up. + */ if ((now - svc->last_restart) > MONITOR_RESTART_CNT_INTERVAL_RESET) { svc->restarts = 0; } /* Restart the service */ if (svc->restarts > MONITOR_MAX_SVC_RESTARTS) { DEBUG(SSSDBG_FATAL_FAILURE, "Process [%s], definitely stopped!\n", svc->name); + + sss_log(SSS_LOG_ERR, + "Exiting the SSSD. Could not restart critical service [%s].", + svc->name); + talloc_free(svc); - /* exit with error */ + /* exit the SSSD with an error, shutting down all + * services and domains. + * We do this because if one of the responders is down + * and can't come back up, this is the only way to + * guarantee admin intervention. + */ monitor_quit(mt_ctx, 1); return; } /* restarts are schedule after 0, 2, 4 seconds */ -- 2.1.0
signature.asc
Description: This is a digitally signed message part
_______________________________________________ sssd-devel mailing list [email protected] https://lists.fedorahosted.org/mailman/listinfo/sssd-devel
