On 11/11/2015 12:25 PM, Jakub Hrozek wrote:
On Wed, Nov 11, 2015 at 11:07:46AM +0100, Petr Cech wrote:
>On 11/11/2015 09:32 AM, Jakub Hrozek wrote:
> >>>Hi Jakub,
> >>>
> >>>I just sent the patch to the CI tests and they passed
> >>>http://sssd-ci.duckdns.org/logs/job/32/63/summary.html
> >>>
> >>>> >Then I would prefer undocumented. It matches how we (don't) document the
> >>>> >"command" option.
> >>>I agree with little exception. I think it could be more clear if we write
> >>>little documentation to commit message or to the code near the new option.
> >>>But, how everybody wrote, we could leave man page without documentation.
> >Good idea, I can move the documentation that was previously in the man
> >page to the commit message, would that work for you?
>
>Yes, it works for me.
>
>I am going to 1/2 PTO for now.
>
>The last two things are that I would like to run it due to your reproducer.
>And there is a remark from Stephen Gallagher which we need resolve.
>
>However, CI tests passed. So if you're in a hurry with code review, please
>ask someone else.
>
>I can continue with review tomorrow.
I'm not in hurry at all. Attached is a patch that adds a better commit
message. We can discuss any details related to testing over IRC if you
prefer.
Hi Jakub,

it works due to your reproducer. It is really need to have
setenforce == 1

CI tests passed:
http://sssd-ci.duckdns.org/logs/job/32/25/summary.html

Stephen Gallagher wrote (2015-09-11 11:32 AM):
There are problems inherent with passing the PID to the child process.
There's no guarantee that the process still exists. In the worst-case,
the PID could actually be reassigned to a new process and the output
you got back from something like pstack could be reading from a
different executable entirely.
---
I understand, it could be dangerous. But, this option is a little bit secret, we don't write about it in our man pages and so on. I hope it will be used only for debuging some hot cases.

There are only little remarks in patch.

=> ACK

Regrds

Petr

PS: I accepted that we have
# p = copy;
not something like:
# copy_ptr = copy;
How I suggested previous mail.

0001-SSSD-Add-a-new-command-diag_cmd.patch


 From ee4135adf6669221de575ebc92e7b3aabba55ba9 Mon Sep 17 00:00:00 2001
From: Jakub Hrozek<jhro...@redhat.com>
Date: Mon, 2 Nov 2015 11:41:31 +0100
Subject: [PATCH] SSSD: Add a new command diag_cmd

This command is an optional one that is run when a sbus ping times out
and before a SIGKILL commans is sent.
                          ---^--- s -> d

diag_cmd (string):
A command that should be run for diagnostic purpose when an sbus timeout
fails. The option value may contain %p which would be expanded for the
process ID of the process that timed out

Example:
         pstack %p
This setting would print the stackstrace of the command whose ping timed out.

Default: not set.
---
  src/confdb/confdb.h                  |   1 +
  src/config/SSSDConfig/__init__.py.in |   1 +
  src/config/SSSDConfigTest.py         |   1 +
  src/config/etc/sssd.api.conf         |   1 +
  src/monitor/monitor.c                | 215 +++++++++++++++++++++++++++++++----
  5 files changed, 197 insertions(+), 22 deletions(-)

diff --git a/src/confdb/confdb.h b/src/confdb/confdb.h
index 
37b5fd7c7629e2618a1699e3ffd58110171db605..0ef7268f9cdc2c18482bbf7b8dbe19d3ef6b7bbf
 100644
--- a/src/confdb/confdb.h
+++ b/src/confdb/confdb.h
@@ -71,6 +71,7 @@
  #define CONFDB_MONITOR_DEFAULT_DOMAIN "default_domain_suffix"
  #define CONFDB_MONITOR_OVERRIDE_SPACE "override_space"
  #define CONFDB_MONITOR_USER_RUNAS "user"
+#define CONFDB_MONITOR_PRE_KILL_CMD "diag_cmd"

  /* Both monitor and domains */
  #define CONFDB_NAME_REGEX   "re_expression"
diff --git a/src/config/SSSDConfig/__init__.py.in 
b/src/config/SSSDConfig/__init__.py.in
index 
bf61c402796122050fa43cf41128faec4771c5d2..60129e6e7fbc96d11c539323346c22a7db6d7f23
 100644
--- a/src/config/SSSDConfig/__init__.py.in
+++ b/src/config/SSSDConfig/__init__.py.in
@@ -50,6 +50,7 @@ option_strings = {
      'reconnection_retries' : _('Number of times to attempt connection to Data 
Providers'),
      'fd_limit' : _('The number of file descriptors that may be opened by this 
responder'),
      'client_idle_timeout' : _('Idle time before automatic disconnection of a 
client'),
+    'diag_cmd' : _('The command to run when a service ping times out'),

      # [sssd]
      'services' : _('SSSD Services to start'),
diff --git a/src/config/SSSDConfigTest.py b/src/config/SSSDConfigTest.py
index 
45562214da5d227b45914abbcb298e043048adf5..abd4a39258e060f27db62eb2352450b6c405930c
 100755
--- a/src/config/SSSDConfigTest.py
+++ b/src/config/SSSDConfigTest.py
@@ -307,6 +307,7 @@ class SSSDConfigTestSSSDService(unittest.TestCase):
              'reconnection_retries',
              'fd_limit',
              'client_idle_timeout',
+            'diag_cmd',
              'description']

          self.assertTrue(type(options) == dict,
diff --git a/src/config/etc/sssd.api.conf b/src/config/etc/sssd.api.conf
index 
72abb8b3f427e9bcf3cf3dcaa67aaf4e3e50226c..0c03625bd5f0d3fbf8948971b015d4e8255f0009
 100644
--- a/src/config/etc/sssd.api.conf
+++ b/src/config/etc/sssd.api.conf
@@ -13,6 +13,7 @@ fd_limit = int, None, false
  client_idle_timeout = int, None, false
  force_timeout = int, None, false
  description = str, None, false
+diag_cmd = str, None, false

  [sssd]
  # Monitor service
diff --git a/src/monitor/monitor.c b/src/monitor/monitor.c
index 
89ac882d38652e071fdc619495c3252807527da0..5ff5f85527a6aa91e220ddd07e8c8738dba0e829
 100644
--- a/src/monitor/monitor.c
+++ b/src/monitor/monitor.c
@@ -116,6 +116,7 @@ struct mt_svc {
      char *identity;
      pid_t pid;

+    char *diag_cmd;
      int ping_time;
      int kill_time;

@@ -383,6 +384,176 @@ static int add_svc_conn_spy(struct mt_svc *svc)
      return EOK;
  }

+static char *expand_diag_cmd(struct mt_svc *svc,
+                             const char *template)
+{
+    TALLOC_CTX *tmp_ctx = NULL;
+    char *copy;
+    char *p;
+    char *n;
+    char *result = NULL;
+    char action;
+    char *res = NULL;
+
+    if (template == NULL) {
+        DEBUG(SSSDBG_CRIT_FAILURE, "Missing template.\n");
+        return NULL;
+    }
+
+    tmp_ctx = talloc_new(NULL);
+    if (!tmp_ctx) return NULL;
+
+    copy = talloc_strdup(tmp_ctx, template);
+    if (copy == NULL) {
+        DEBUG(SSSDBG_CRIT_FAILURE, "talloc_strdup failed.\n");
+        goto done;
+    }
+
+    result = talloc_strdup(tmp_ctx, "");
+    if (result == NULL) {
+        DEBUG(SSSDBG_CRIT_FAILURE, "talloc_strdup failed.\n");
+        goto done;
+    }
+
+    p = copy;
+    while ((n = strchr(p, '%')) != NULL) {
+        *n = '\0';
+        n++;
+        if ( *n == '\0' ) {
+            DEBUG(SSSDBG_CRIT_FAILURE,
+                  "format error, single %% at the end of the template.\n");
+            goto done;
+        }
+
+        action = *n;
+        switch (action) {
+        case 'p':
+            result = talloc_asprintf_append(result, "%s%d", p, svc->pid);
+            break;
+        default:
+            DEBUG(SSSDBG_CRIT_FAILURE,
+                  "format error, unknown template [%%%c].\n", *n);
+            goto done;
+        }
+
+        if (result == NULL) {
+            DEBUG(SSSDBG_CRIT_FAILURE, "talloc_asprintf_append failed.\n");
+            goto done;
+        }
+
+        p = n + 1;
+    }
+
+    result = talloc_asprintf_append(result, "%s", p);
+    if (result == NULL) {
+        DEBUG(SSSDBG_CRIT_FAILURE, "talloc_asprintf_append failed.\n");
+        goto done;
+    }
+
+    res = talloc_move(svc, &result);
+done:
+    talloc_zfree(tmp_ctx);
+    return res;
+}
+
+static void svc_child_info(struct mt_svc *svc, int wait_status)
+{
+    if (WIFEXITED(wait_status)) {
+        DEBUG(SSSDBG_OP_FAILURE,
+              "Child [%d] exited with code [%d]\n",
+               svc->pid, WEXITSTATUS(wait_status));
+    } else if (WIFSIGNALED(wait_status)) {
+        DEBUG(SSSDBG_OP_FAILURE,
+              "Child [%d] terminated with signal [%d]\n",
+               svc->pid, WTERMSIG(wait_status));
+    } else {
+        DEBUG(SSSDBG_FATAL_FAILURE,
+              "Child [%d] did not exit cleanly\n", svc->pid);
+        /* Forcibly kill this child, just in case */
+        kill(svc->pid, SIGKILL);
+
+        /* Let us get caught by another
+         * call to the SIGCHLD handler
+         */
+    }
+}
+
+static void svc_diag_cmd_exit_handler(int pid, int wait_status, void *pvt)
+{
+    struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc);
+
+    svc_child_info(svc, wait_status);
+}
+
+static void svc_run_diag_cmd(struct mt_svc *svc)
+{
+    pid_t pkc_pid;
+    char **args;
+    int ret;
+    int debug_fd;
+    char *diag_cmd;
+    struct sss_child_ctx *diag_child_ctx;
+
+    if (svc->diag_cmd == NULL) {
+        return;
+    }
+
+    pkc_pid = fork();
+    if (pkc_pid != 0) {
+        /* parent, schedule SIGKILL */
+
+        ret = sss_child_register(svc,
+                                 svc->mt_ctx->sigchld_ctx,
+                                 pkc_pid,
+                                 svc_diag_cmd_exit_handler,
+                                 svc,
+                                 &diag_child_ctx);
+        if (ret != EOK) {
+            DEBUG(SSSDBG_CRIT_FAILURE, "Cannot register child %d\n", pkc_pid);
+            /* Try to go on ... */
+        }
+
+        return;
+    }
+
+    /* child, execute diagnostics */
+    diag_cmd = expand_diag_cmd(svc, svc->diag_cmd);
+    if (diag_cmd == NULL) {
+        DEBUG(SSSDBG_CRIT_FAILURE,
+              "Failed to expand [%s]\n", svc->diag_cmd);
+        _exit(1);
+    }
+
+    if (debug_level >= SSSDBG_TRACE_LIBS) {
+        debug_fd = get_fd_from_debug_file();
+        ret = dup2(debug_fd, STDERR_FILENO);
+        if (ret == -1) {
+            ret = errno;
+            DEBUG(SSSDBG_MINOR_FAILURE,
+                "dup2 failed for stderr [%d][%s].\n", ret, sss_strerror(ret));
+            /* failure to redirect stderr is not fatal */
+        }
+
+        ret = dup2(debug_fd, STDOUT_FILENO);
+        if (ret == -1) {
+            ret = errno;
+            DEBUG(SSSDBG_MINOR_FAILURE,
+                "dup2 failed for stdout [%d][%s].\n", ret, sss_strerror(ret));
+            /* failure to redirect stdout is not fatal */
+        }
+    }
+
+    args = parse_args(diag_cmd);
+    execvp(args[0], args);
+
+    /* If we are here, exec() has failed
+     * Print errno and abort quickly */
+    ret = errno;
+    DEBUG(SSSDBG_FATAL_FAILURE,
+          "Could not exec %s, reason: %s\n", svc->diag_cmd, strerror(ret));
+    _exit(1);
+}
+
  static int mark_service_as_started(struct mt_svc *svc)
  {
      struct mt_ctx *ctx = svc->mt_ctx;
@@ -613,8 +784,10 @@ static int monitor_kill_service (struct mt_svc *svc)
          return EOK;
      }

+    svc_run_diag_cmd(svc);
+
      /* Set up a timer to send SIGKILL if this process
-     * doesn't exit within sixty seconds
+     * doesn't exit within the configured interval
       */
      tv = tevent_timeval_current_ofs(svc->kill_time, 0);
      svc->kill_timer = tevent_add_timer(svc->mt_ctx->ev,
@@ -628,7 +801,6 @@ static int monitor_kill_service (struct mt_svc *svc)
                "Failed to allocate timed event: mt_svc_sigkill.\n");
          /* We'll just have to hope that the SIGTERM succeeds */
      }
-
      return EOK;
  }

@@ -1065,6 +1237,18 @@ static errno_t get_ping_config(struct mt_ctx *ctx, const 
char *path,
            "Time between service pings for [%s]: [%d]\n",
             svc->name, svc->ping_time);

+    ret = confdb_get_string(ctx->cdb, svc, path,
+                            CONFDB_MONITOR_PRE_KILL_CMD,
+                            NULL, &svc->diag_cmd);
+    if (ret != EOK) {
+        DEBUG(SSSDBG_CRIT_FAILURE,
+              "Failed to get diagnostics command for %s\n", svc->name);
+        return ret;
+    }
+    if (svc->diag_cmd) {
+        DEBUG(SSSDBG_CONF_SETTINGS, "Diagnostics command: [%s]\n", 
svc->diag_cmd);
There are 83 characters.

+    }
+
      ret = confdb_get_int(ctx->cdb, path,
                           CONFDB_SERVICE_FORCE_TIMEOUT,
                           MONITOR_DEF_FORCE_TIME, &svc->kill_time);
@@ -2615,6 +2799,10 @@ static void ping_check(DBusPendingCall *pending, void 
*data)
                     "Attempt [%d]\n",
                     svc->name, svc->failed_pongs);
              svc->failed_pongs++;
+
+            if (debug_level & SSSDBG_TRACE_LIBS) {
+                svc_run_diag_cmd(svc);
+            }
              break;
          }

@@ -2765,26 +2953,9 @@ static void mt_svc_exit_handler(int pid, int 
wait_status, void *pvt)
  {
      struct mt_svc *svc = talloc_get_type(pvt, struct mt_svc);

-
-    if (WIFEXITED(wait_status)) {
-        DEBUG(SSSDBG_OP_FAILURE,
-              "Child [%s] exited with code [%d]\n",
-               svc->name, WEXITSTATUS(wait_status));
-    } else if (WIFSIGNALED(wait_status)) {
-        DEBUG(SSSDBG_OP_FAILURE,
-              "Child [%s] terminated with signal [%d]\n",
-               svc->name, WTERMSIG(wait_status));
-    } else {
-        DEBUG(SSSDBG_FATAL_FAILURE,
-              "Child [%s] did not exit cleanly\n", svc->name);
-        /* Forcibly kill this child, just in case */
-        kill(svc->pid, SIGKILL);
-
-        /* Return and let us get caught by another
-         * call to the SIGCHLD handler
-         */
-        return;
-    }
+    DEBUG(SSSDBG_TRACE_LIBS,
+          "SIGCHLD handler of service %s called\n", svc->name);
+    svc_child_info(svc, wait_status);

      /* Clear the kill_timer so we don't try to SIGKILL it after it's
       * already gone.
-- 2.4.3
_______________________________________________
sssd-devel mailing list
sssd-devel@lists.fedorahosted.org
https://lists.fedorahosted.org/mailman/listinfo/sssd-devel

Reply via email to