Source: collectd
Source-Version: 5.5.1-1
Severity: wishlist
Tags: patch upstream
Forwarded: https://github.com/collectd/collectd/pull/1644

Hi!

We have noticed in some cases that collectd gets stuck in its process
loop, and does not process the loop variable set by the SIGTERM
handler. While we'll try to track that down eventually, being unable
to stop or restart the daemon is a very annoying issue for us.

Attached are two patches that changes the start-stop-daemon invocation
to use its scheduling option, so that it eventually sends a SIGKILL if
the SIGTERM has had no effect. In addition when using collectdmon we
need to send some other signal to it because SIGKILL is uncatchable.
The upstream pull request implements that second attached patch.

Thanks,
Guillem
From ecab4c5814699d9fd104e3b6d316056acbf03bda Mon Sep 17 00:00:00 2001
From: Guillem Jover <[email protected]>
Date: Fri, 1 Apr 2016 16:47:49 +0200
Subject: [PATCH] collectdmon: Add support for sending SIGKILL to collectd

Sometimes we might want to force termination of the collectd daemon,
but we do not have visibility over the PID of the collectd daemon. So
we need to request collectdmon to terminate the process for us, but
we cannot send SIGKILL directly as it's an uncatchable signal. Instead
we use SIGQUIT, catch that and then send SIGKILL to our child.
---
 src/collectdmon.c   | 28 ++++++++++++++++++++++++----
 src/collectdmon.pod |  5 +++++
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/collectdmon.c b/src/collectdmon.c
index 65271dd..096b368 100644
--- a/src/collectdmon.c
+++ b/src/collectdmon.c
@@ -64,6 +64,7 @@
 #endif /* ! WCOREDUMP */
 
 static int loop    = 0;
+static int quit    = 0;
 static int restart = 0;
 
 static char  *pidfile      = NULL;
@@ -199,12 +200,12 @@ static int collectd_start (char **argv)
 	exit (-1);
 } /* collectd_start */
 
-static int collectd_stop (void)
+static int collectd_stop (int signo)
 {
 	if (0 == collectd_pid)
 		return 0;
 
-	if (0 != kill (collectd_pid, SIGTERM)) {
+	if (0 != kill (collectd_pid, signo)) {
 		syslog (LOG_ERR, "Error: kill() failed: %s", strerror (errno));
 		return -1;
 	}
@@ -217,6 +218,13 @@ static void sig_int_term_handler (int __attribute__((unused)) signo)
 	return;
 } /* sig_int_term_handler */
 
+static void sig_quit_handler (int __attribute__((unused)) signo)
+{
+	++quit;
+	++loop;
+	return;
+} /* sig_quit_handler */
+
 static void sig_hup_handler (int __attribute__((unused)) signo)
 {
 	++restart;
@@ -343,6 +351,14 @@ int main (int argc, char **argv)
 		return 1;
 	}
 
+	sa.sa_handler = sig_quit_handler;
+
+	if (0 != sigaction (SIGQUIT, &sa, NULL)) {
+		syslog (LOG_ERR, "Error: sigaction() failed: %s", strerror (errno));
+		free (collectd_argv);
+		return 1;
+	}
+
 	sa.sa_handler = sig_hup_handler;
 
 	if (0 != sigaction (SIGHUP, &sa, NULL)) {
@@ -362,8 +378,12 @@ int main (int argc, char **argv)
 		assert (0 < collectd_pid);
 		while ((collectd_pid != waitpid (collectd_pid, &status, 0))
 				&& (EINTR == errno))
-			if ((0 != loop) || (0 != restart))
-				collectd_stop ();
+			if ((0 != loop) || (0 != restart)) {
+				if (quit)
+					collectd_stop (SIGKILL);
+				else
+					collectd_stop (SIGTERM);
+			}
 
 		collectd_pid = 0;
 
diff --git a/src/collectdmon.pod b/src/collectdmon.pod
index 8fa62f3..6cd05e2 100644
--- a/src/collectdmon.pod
+++ b/src/collectdmon.pod
@@ -57,6 +57,11 @@ termination and then shut down.
 This signal causes B<collectdmon> to terminate B<collectd>, wait for its
 termination and then restart it.
 
+=item B<SIGQUIT>
+
+This signal causes B<collectdmon> to terminate B<collectd> with B<SIGKILL>,
+wait for its termination and then shut down.
+
 =back
 
 =head1 SEE ALSO
-- 
2.8.0.rc3.226.g39d4020

From a4e763799acb86573a5712dd86376f0a982d3c17 Mon Sep 17 00:00:00 2001
From: Guillem Jover <[email protected]>
Date: Fri, 1 Apr 2016 17:08:13 +0200
Subject: [PATCH] debian: Use start-stop-daemon to schedule a SIGKILL in case
 SIGTERM has no effect

When using the collectdmon daemon we send instead a SIGQUIT which
collectdmon will catch and forward to collectd as a SIGKILL.
---
 debian/collectd-core.collectd.init.d | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/debian/collectd-core.collectd.init.d b/debian/collectd-core.collectd.init.d
index f18b994..7778d6f 100755
--- a/debian/collectd-core.collectd.init.d
+++ b/debian/collectd-core.collectd.init.d
@@ -112,31 +112,19 @@ the disk. You can adjust the waiting time in /etc/default/collectd."
 #   1 if the daemon was already stopped
 #   2 if daemon could not be stopped
 d_stop() {
-	PID=$( cat "$PIDFILE" 2> /dev/null ) || true
-
-	start-stop-daemon --stop --quiet --oknodo --pidfile "$PIDFILE"
+	if test "$USE_COLLECTDMON" == 1; then
+		start-stop-daemon --stop --quiet --oknodo --pidfile "$PIDFILE" \
+			--retry TERM/$MAXWAIT/QUIT/5
+	else
+		start-stop-daemon --stop --quiet --oknodo --pidfile "$PIDFILE" \
+			--retry $MAXWAIT
+	fi
 	rc="$?"
 
 	if test "$rc" -eq 2; then
-		return 2
+		log_progress_msg "$still_running_warning"
 	fi
 
-	sleep 1
-	if test -n "$PID" && kill -0 $PID 2> /dev/null; then
-		i=0
-		while kill -0 $PID 2> /dev/null; do
-			i=$(( $i + 2 ))
-			echo -n " ."
-
-			if test $i -gt $MAXWAIT; then
-				log_progress_msg "$still_running_warning"
-				return 2
-			fi
-
-			sleep 2
-		done
-		return "$rc"
-	fi
 	return "$rc"
 }
 
-- 
2.8.0.rc3.226.g39d4020

Reply via email to