On Tue, Aug 30, 2016 at 8:23 AM, Petr Cech <pc...@redhat.com> wrote: > On 08/15/2016 02:58 PM, Fabiano Fidêncio wrote: >> >> Those 3 patches are from Jakub and I've just done some minor >> adjustments and add myself as co-author of the first 2 patches. >> >> CI has passed: http://sssd-ci.duckdns.org/logs/job/51/55/summary.html >> >> Best Regards, >> -- >> Fabiano Fidêncio > > > Hello, > > CI passed: > http://sssd-ci.duckdns.org/logs/job/52/71/summary.html > >> 0001-MONITOR-Remove-the-no-longer-used-diag_cmd-command.patch >> >> >> From aa6204816cde0a7d75b9303916d038ed06e467ba Mon Sep 17 00:00:00 2001 >> From: Jakub Hrozek <jhro...@redhat.com> >> Date: Sun, 8 May 2016 14:41:35 +0200 >> Subject: [PATCH 1/3] MONITOR: Remove the no longer used diag_cmd command >> MIME-Version: 1.0 >> Content-Type: text/plain; charset=UTF-8 >> Content-Transfer-Encoding: 8bit >> >> After introducing the watchdog, the diag_cmd is longer used and makes no >> sense trying to make it usable by watchdog as the result of "pstack %p" >> seems next to useless in this context. >> >> Co-author: Fabiano Fidêncio <fiden...@redhat.com> >> >> Related: >> https://fedorahosted.org/sssd/ticket/3051 >> --- > > > ACK > > >> 0002-MONITOR-Remove-the-no-longer-used-kill_service-comma.patch >> >> >> From 7954e0254752d0a830a0501f23a6a93d0345e5ce Mon Sep 17 00:00:00 2001 >> From: Jakub Hrozek <jhro...@redhat.com> >> Date: Sun, 8 May 2016 14:46:25 +0200 >> Subject: [PATCH 2/3] MONITOR: Remove the no longer used kill_service >> command >> MIME-Version: 1.0 >> Content-Type: text/plain; charset=UTF-8 >> Content-Transfer-Encoding: 8bit >> >> After introducing the watchdog, the force_timeout option is no longer >> used. 
>> >> Co-author: Fabiano Fidêncio <fiden...@redhat.com> >> >> Resolves: >> https://fedorahosted.org/sssd/ticket/3052 >> --- > > > ACK > > >> 0003-WATCHDOG-define-and-use-_MAX_TICKS-as-3.patch >> >> >> From 1302c5a95ac36dd674c8795cda0082b84d30978d Mon Sep 17 00:00:00 2001 >> From: Jakub Hrozek <jhro...@redhat.com> >> Date: Mon, 15 Aug 2016 12:54:20 +0200 >> Subject: [PATCH 3/3] WATCHDOG: define and use _MAX_TICKS as 3 >> >> Instead of using the number 3 directly, let's introduce and use >> WATCHDOG_MAX_TICKS. >> -- > > > This patch is unfortunately inapplicable on top of master > (after two previous patches): > > pcech@albireo ~/sssd: (master) $ git am > ../patch/0003-WATCHDOG-define-and-use-_MAX_TICKS-as-3.patch > Applying: WATCHDOG: define and use _MAX_TICKS as 3 > error: patch failed: src/util/util_watchdog.c:38 > error: src/util/util_watchdog.c: patch does not apply > Patch failed at 0001 WATCHDOG: define and use _MAX_TICKS as 3 > > Regards
Rebase was quite simple. See the v2 attached (the only change in v2 was the rebase). > > --- > > Petr^4 Čech > _______________________________________________ > sssd-devel mailing list > sssd-devel@lists.fedorahosted.org > https://lists.fedorahosted.org/admin/lists/sssd-devel@lists.fedorahosted.org
From 7579cf9982c86978500e9249ad3e82124867fc90 Mon Sep 17 00:00:00 2001 From: Jakub Hrozek <jhro...@redhat.com> Date: Sun, 8 May 2016 14:41:35 +0200 Subject: [PATCH v2 1/3] MONITOR: Remove the no longer used diag_cmd command After introducing the watchdog, the diag_cmd is no longer used and makes no sense trying to make it usable by watchdog as the result of "pstack %p" seems next to useless in this context. Related: https://fedorahosted.org/sssd/ticket/3051 --- src/confdb/confdb.h | 1 - src/monitor/monitor.c | 163 -------------------------------------------------- 2 files changed, 164 deletions(-) diff --git a/src/confdb/confdb.h b/src/confdb/confdb.h index 72adbd8..58a085b 100644 --- a/src/confdb/confdb.h +++ b/src/confdb/confdb.h @@ -73,7 +73,6 @@ #define CONFDB_MONITOR_DEFAULT_DOMAIN "default_domain_suffix" #define CONFDB_MONITOR_OVERRIDE_SPACE "override_space" #define CONFDB_MONITOR_USER_RUNAS "user" -#define CONFDB_MONITOR_PRE_KILL_CMD "diag_cmd" #define CONFDB_MONITOR_CERT_VERIFICATION "certificate_verification" /* Both monitor and domains */ diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c index 7a9ef56..f97b2a9 100644 --- a/src/monitor/monitor.c +++ b/src/monitor/monitor.c @@ -112,7 +112,6 @@ struct mt_svc { char *identity; pid_t pid; - char *diag_cmd; int kill_time; struct tevent_timer *kill_timer; @@ -373,77 +372,6 @@ static int add_svc_conn_spy(struct mt_svc *svc) return EOK; } -static char *expand_diag_cmd(struct mt_svc *svc, - const char *template) -{ - TALLOC_CTX *tmp_ctx = NULL; - char *copy; - char *p_copy; - char *n; - char *result = NULL; - char action; - char *res = NULL; - - if (template == NULL) { - DEBUG(SSSDBG_CRIT_FAILURE, "Missing template.\n"); - return NULL; - } - - tmp_ctx = talloc_new(NULL); - if (!tmp_ctx) return NULL; - - copy = talloc_strdup(tmp_ctx, template); - if (copy == NULL) { - DEBUG(SSSDBG_CRIT_FAILURE, "talloc_strdup failed.\n"); - goto done; - } - - result = talloc_strdup(tmp_ctx, ""); - if (result == NULL) { - 
DEBUG(SSSDBG_CRIT_FAILURE, "talloc_strdup failed.\n"); - goto done; - } - - p_copy = copy; - while ((n = strchr(p_copy, '%')) != NULL) { - *n = '\0'; - n++; - if ( *n == '\0' ) { - DEBUG(SSSDBG_CRIT_FAILURE, - "format error, single %% at the end of the template.\n"); - goto done; - } - - action = *n; - switch (action) { - case 'p': - result = talloc_asprintf_append(result, "%s%d", p_copy, svc->pid); - break; - default: - DEBUG(SSSDBG_CRIT_FAILURE, - "format error, unknown template [%%%c].\n", *n); - goto done; - } - - if (result == NULL) { - DEBUG(SSSDBG_CRIT_FAILURE, "talloc_asprintf_append failed.\n"); - goto done; - } - - p_copy = n + 1; - } - - result = talloc_asprintf_append(result, "%s", p_copy); - if (result == NULL) { - DEBUG(SSSDBG_CRIT_FAILURE, "talloc_asprintf_append failed.\n"); - goto done; - } - - res = talloc_move(svc, &result); -done: - talloc_zfree(tmp_ctx); - return res; -} static void svc_child_info(struct mt_svc *svc, int wait_status) { @@ -467,82 +395,6 @@ static void svc_child_info(struct mt_svc *svc, int wait_status) } } -static void svc_diag_cmd_exit_handler(int pid, int wait_status, void *pvt) -{ - struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc); - - svc_child_info(svc, wait_status); -} - -static void svc_run_diag_cmd(struct mt_svc *svc) -{ - pid_t pkc_pid; - char **args; - int ret; - int debug_fd; - char *diag_cmd; - struct sss_child_ctx *diag_child_ctx; - - if (svc->diag_cmd == NULL) { - return; - } - - pkc_pid = fork(); - if (pkc_pid != 0) { - /* parent, schedule SIGKILL */ - - ret = sss_child_register(svc, - svc->mt_ctx->sigchld_ctx, - pkc_pid, - svc_diag_cmd_exit_handler, - svc, - &diag_child_ctx); - if (ret != EOK) { - DEBUG(SSSDBG_CRIT_FAILURE, "Cannot register child %d\n", pkc_pid); - /* Try to go on ... 
*/ - } - - return; - } - - /* child, execute diagnostics */ - diag_cmd = expand_diag_cmd(svc, svc->diag_cmd); - if (diag_cmd == NULL) { - DEBUG(SSSDBG_CRIT_FAILURE, - "Failed to expand [%s]\n", svc->diag_cmd); - _exit(1); - } - - if (debug_level >= SSSDBG_TRACE_LIBS) { - debug_fd = get_fd_from_debug_file(); - ret = dup2(debug_fd, STDERR_FILENO); - if (ret == -1) { - ret = errno; - DEBUG(SSSDBG_MINOR_FAILURE, - "dup2 failed for stderr [%d][%s].\n", ret, sss_strerror(ret)); - /* failure to redirect stderr is not fatal */ - } - - ret = dup2(debug_fd, STDOUT_FILENO); - if (ret == -1) { - ret = errno; - DEBUG(SSSDBG_MINOR_FAILURE, - "dup2 failed for stdout [%d][%s].\n", ret, sss_strerror(ret)); - /* failure to redirect stdout is not fatal */ - } - } - - args = parse_args(diag_cmd); - execvp(args[0], args); - - /* If we are here, exec() has failed - * Print errno and abort quickly */ - ret = errno; - DEBUG(SSSDBG_FATAL_FAILURE, - "Could not exec %s, reason: %s\n", svc->diag_cmd, strerror(ret)); - _exit(1); -} - static int mark_service_as_started(struct mt_svc *svc) { struct mt_ctx *ctx = svc->mt_ctx; @@ -712,8 +564,6 @@ static int monitor_kill_service (struct mt_svc *svc) return EOK; } - svc_run_diag_cmd(svc); - /* Set up a timer to send SIGKILL if this process * doesn't exit within the configured interval */ @@ -1147,19 +997,6 @@ static errno_t get_kill_config(struct mt_ctx *ctx, const char *path, { errno_t ret; - ret = confdb_get_string(ctx->cdb, svc, path, - CONFDB_MONITOR_PRE_KILL_CMD, - NULL, &svc->diag_cmd); - if (ret != EOK) { - DEBUG(SSSDBG_CRIT_FAILURE, - "Failed to get diagnostics command for %s\n", svc->name); - return ret; - } - if (svc->diag_cmd) { - DEBUG(SSSDBG_CONF_SETTINGS, - "Diagnostics command: [%s]\n", svc->diag_cmd); - } - ret = confdb_get_int(ctx->cdb, path, CONFDB_SERVICE_FORCE_TIMEOUT, MONITOR_DEF_FORCE_TIME, &svc->kill_time); -- 2.7.4
From ac35fe7430d628677f685abbb66c799e160d1527 Mon Sep 17 00:00:00 2001 From: Jakub Hrozek <jhro...@redhat.com> Date: Sun, 8 May 2016 14:46:25 +0200 Subject: [PATCH v2 2/3] MONITOR: Remove the no longer used kill_service command After introducing the watchdog, the force_timeout option is no longer used. Resolves: https://fedorahosted.org/sssd/ticket/3052 --- src/confdb/confdb.h | 1 - src/man/sssd.conf.5.xml | 33 ------------ src/monitor/monitor.c | 141 ------------------------------------------------ 3 files changed, 175 deletions(-) diff --git a/src/confdb/confdb.h b/src/confdb/confdb.h index 58a085b..401e5fb 100644 --- a/src/confdb/confdb.h +++ b/src/confdb/confdb.h @@ -58,7 +58,6 @@ #define CONFDB_SERVICE_DEBUG_TIMESTAMPS "debug_timestamps" #define CONFDB_SERVICE_DEBUG_MICROSECONDS "debug_microseconds" #define CONFDB_SERVICE_DEBUG_TO_FILES "debug_to_files" -#define CONFDB_SERVICE_FORCE_TIMEOUT "force_timeout" #define CONFDB_SERVICE_RECON_RETRIES "reconnection_retries" #define CONFDB_SERVICE_FD_LIMIT "fd_limit" #define CONFDB_SERVICE_ALLOWED_UIDS "allowed_uids" diff --git a/src/man/sssd.conf.5.xml b/src/man/sssd.conf.5.xml index e95a7e7..ae291e0 100644 --- a/src/man/sssd.conf.5.xml +++ b/src/man/sssd.conf.5.xml @@ -549,22 +549,6 @@ </listitem> </varlistentry> <varlistentry> - <term>force_timeout (integer)</term> - <listitem> - <para> - If a service is not responding to ping checks (see - the <quote>timeout</quote> option), it is first sent - the SIGTERM signal that instructs it to quit gracefully. - If the service does not terminate after <quote>force_timeout</quote> - seconds, the monitor will forcibly shut it down by - sending a SIGKILL signal. - </para> - <para> - Default: 60 - </para> - </listitem> - </varlistentry> - <varlistentry> <term>offline_timeout (integer)</term> <listitem> <para> @@ -1453,23 +1437,6 @@ pam_account_locked_message = Account locked, please contact help desk. 
</varlistentry> <varlistentry> - <term>force_timeout (integer)</term> - <listitem> - <para> - If a service is not responding to ping checks (see - the <quote>timeout</quote> option), it is first sent - the SIGTERM signal that instructs it to quit gracefully. - If the service does not terminate after <quote>force_timeout</quote> - seconds, the monitor will forcibly shut it down by - sending a SIGKILL signal. - </para> - <para> - Default: 60 - </para> - </listitem> - </varlistentry> - - <varlistentry> <term>entry_cache_timeout (integer)</term> <listitem> <para> diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c index f97b2a9..1f89c5a 100644 --- a/src/monitor/monitor.c +++ b/src/monitor/monitor.c @@ -114,8 +114,6 @@ struct mt_svc { int kill_time; - struct tevent_timer *kill_timer; - bool svc_started; int restarts; @@ -176,8 +174,6 @@ static int monitor_service_init(struct sbus_connection *conn, void *data); static int service_signal_reset_offline(struct mt_svc *svc); -static int monitor_kill_service (struct mt_svc *svc); - static int get_service_config(struct mt_ctx *ctx, const char *name, struct mt_svc **svc_cfg); static int get_provider_config(struct mt_ctx *ctx, const char *name, @@ -542,95 +538,6 @@ static int monitor_dbus_init(struct mt_ctx *ctx) } static void monitor_restart_service(struct mt_svc *svc); -static void mt_svc_sigkill(struct tevent_context *ev, - struct tevent_timer *te, - struct timeval t, void *ptr); -static int monitor_kill_service (struct mt_svc *svc) -{ - int ret; - struct timeval tv; - - ret = kill(svc->pid, SIGTERM); - if (ret == -1) { - ret = errno; - DEBUG(SSSDBG_FATAL_FAILURE, - "Sending signal to child (%s:%d) failed: [%d]: %s! " - "Ignore and pretend child is dead.\n", - svc->name, svc->pid, ret, strerror(ret)); - /* The only thing we can try here is to launch a new process - * and hope that it works. 
- */ - monitor_restart_service(svc); - return EOK; - } - - /* Set up a timer to send SIGKILL if this process - * doesn't exit within the configured interval - */ - tv = tevent_timeval_current_ofs(svc->kill_time, 0); - svc->kill_timer = tevent_add_timer(svc->mt_ctx->ev, - svc, - tv, - mt_svc_sigkill, - svc); - if (svc->kill_timer == NULL) { - /* Nothing much we can do */ - DEBUG(SSSDBG_CRIT_FAILURE, - "Failed to allocate timed event: mt_svc_sigkill.\n"); - /* We'll just have to hope that the SIGTERM succeeds */ - } - return EOK; -} - -static void mt_svc_sigkill(struct tevent_context *ev, - struct tevent_timer *te, - struct timeval t, void *ptr) -{ - int ret; - struct mt_svc *svc = talloc_get_type(ptr, struct mt_svc); - - DEBUG(SSSDBG_FATAL_FAILURE, - "[%s][%d] is not responding to SIGTERM. Sending SIGKILL.\n", - svc->name, svc->pid); - sss_log(SSS_LOG_ERR, - "[%s][%d] is not responding to SIGTERM. Sending SIGKILL.\n", - svc->name, svc->pid); - - /* timer was succesfully executed and it will be released by tevent */ - svc->kill_timer = NULL; - - ret = kill(svc->pid, SIGKILL); - if (ret != EOK) { - ret = errno; - DEBUG(SSSDBG_FATAL_FAILURE, - "Sending signal to child (%s:%d) failed! " - "Ignore and pretend child is dead.\n", - svc->name, svc->pid); - - if (ret == ESRCH) { - /* The process doesn't exist - * This most likely means we hit a race where - * the SIGTERM concluded just after the timer - * fired but before we called kill() here. - * We'll just do nothing, since the - * mt_svc_exit_handler() should be doing the - * necessary work. - */ - return; - } - - /* Something went really wrong. - * The only thing we can try here is to launch a new process - * and hope that it works. 
- */ - monitor_restart_service(svc); - } - - /* The process should terminate immediately and then be - * restarted by the mt_svc_exit_handler() - */ - return; -} static void reload_reply(DBusPendingCall *pending, void *data) { @@ -708,7 +615,6 @@ static int service_signal(struct mt_svc *svc, const char *svc_signal) DEBUG(SSSDBG_FATAL_FAILURE, "Out of memory trying to allocate memory to invoke: %s\n", svc_signal); - monitor_kill_service(svc); return ENOMEM; } @@ -992,32 +898,6 @@ static int get_monitor_config(struct mt_ctx *ctx) return EOK; } -static errno_t get_kill_config(struct mt_ctx *ctx, const char *path, - struct mt_svc *svc) -{ - errno_t ret; - - ret = confdb_get_int(ctx->cdb, path, - CONFDB_SERVICE_FORCE_TIMEOUT, - MONITOR_DEF_FORCE_TIME, &svc->kill_time); - if (ret != EOK) { - DEBUG(SSSDBG_CRIT_FAILURE, - "Failed to get kill timeout for %s\n", svc->name); - return ret; - } - - /* 'force_timeout = 0' should be translated to the default */ - if (svc->kill_time == 0) { - svc->kill_time = MONITOR_DEF_FORCE_TIME; - } - - DEBUG(SSSDBG_CONF_SETTINGS, - "Time between SIGTERM and SIGKILL for [%s]: [%d]\n", - svc->name, svc->kill_time); - - return EOK; -} - /* This is a temporary function that returns false if the service * being started was only tested when running as root. 
*/ @@ -1154,14 +1034,6 @@ static int get_service_config(struct mt_ctx *ctx, const char *name, } } - ret = get_kill_config(ctx, path, svc); - if (ret != EOK) { - DEBUG(SSSDBG_CRIT_FAILURE, - "Failed to get kill timeouts for %s\n", svc->name); - talloc_free(svc); - return ret; - } - svc->last_restart = now; *svc_cfg = svc; @@ -1249,14 +1121,6 @@ static int get_provider_config(struct mt_ctx *ctx, const char *name, return ret; } - ret = get_kill_config(ctx, path, svc); - if (ret != EOK) { - DEBUG(SSSDBG_CRIT_FAILURE, - "Failed to get kill timeouts for %s\n", svc->name); - talloc_free(svc); - return ret; - } - talloc_free(path); /* if no provider is present do not run the domain */ @@ -2540,11 +2404,6 @@ static void mt_svc_exit_handler(int pid, int wait_status, void *pvt) "SIGCHLD handler of service %s called\n", svc->name); svc_child_info(svc, wait_status); - /* Clear the kill_timer so we don't try to SIGKILL it after it's - * already gone. - */ - talloc_zfree(svc->kill_timer); - /* Check the number of restart tries and relaunch the service */ monitor_restart_service(svc); -- 2.7.4
From 31b90c52debcdba7d26544a7478a57e6289e2187 Mon Sep 17 00:00:00 2001 From: Jakub Hrozek <jhro...@redhat.com> Date: Mon, 15 Aug 2016 12:54:20 +0200 Subject: [PATCH v2 3/3] WATCHDOG: define and use _MAX_TICKS as 3 Instead of using the number 3 directly, let's introduce and use WATCHDOG_MAX_TICKS. --- src/util/util_watchdog.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/util/util_watchdog.c b/src/util/util_watchdog.c index 1c27d73..c184fbd 100644 --- a/src/util/util_watchdog.c +++ b/src/util/util_watchdog.c @@ -22,6 +22,7 @@ #include "util/util.h" #define WATCHDOG_DEF_INTERVAL 10 +#define WATCHDOG_MAX_TICKS 3 /* this is intentionally a global variable */ struct watchdog_ctx { @@ -75,9 +76,8 @@ static void watchdog_handler(int sig) return; } - /* if 3 ticks passed by kills itself */ - - if (__sync_add_and_fetch(&watchdog_ctx.ticks, 1) > 3) { + /* if a pre-defined number of ticks passed by kills itself */ + if (__sync_add_and_fetch(&watchdog_ctx.ticks, 1) > WATCHDOG_MAX_TICKS) { DEBUG(SSSDBG_FATAL_FAILURE, "Watchdog timer overflow, killing process!\n"); orderly_shutdown(1); -- 2.7.4
_______________________________________________ sssd-devel mailing list sssd-devel@lists.fedorahosted.org https://lists.fedorahosted.org/admin/lists/sssd-devel@lists.fedorahosted.org