Allow relayd to clean-up objects related to a dead connection
for which the FIN packet was no emitted (Unexpected shutdown,
ethernet blocking). Note that an idle peer is not considered dead given
that it respond to the keep-alive query after the idle time is elapsed.

By RFC 1122-4.2.3.6 implementation must default to no less than two
hours for the idle period. On linux the default value is indeed 2 hours.
This could be problematic if relayd should be aggressive regarding
dead-peers. Hence it is important to provide tuning knob regarding the
tcp keep-alive mechanism.

The following environments variable can be used to enable and fine-tune
it:
    LTTNG_RELAYD_TCP_KEEP_ALIVE_ENABLE
        Set to 1 to enable the use of tcp keep-alive allowing the detection
        of dead peers.

    LTTNG_RELAYD_TCP_KEEP_ALIVE_TIME
        See tcp(7) tcp_keepalive_time or tcp_keepalive_interval on
        Solaris 11.
        A value of -1 lets the operating system manage this parameter
        (default).

    LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBES
        See tcp(7) tcp_keepalive_probes.
        A value of -1 lets the operating system manage this
        parameter (default).
        No effect on Solaris.

    LTTNG_RELAYD_TCP_KEEP_ALIVE_INTVL`::
        See tcp(7) tcp_keepalive_intvl.
        A value of -1 lets the operating system manage
        his parameter (default).

Signed-off-by: Jonathan Rajotte <[email protected]>
---
 src/bin/lttng-relayd/Makefile.am      |   3 +-
 src/bin/lttng-relayd/main.c           |  14 ++
 src/bin/lttng-relayd/tcp_keep_alive.c | 243 ++++++++++++++++++++++++++++++++++
 src/bin/lttng-relayd/tcp_keep_alive.h |  25 ++++
 4 files changed, 284 insertions(+), 1 deletion(-)
 create mode 100644 src/bin/lttng-relayd/tcp_keep_alive.c
 create mode 100644 src/bin/lttng-relayd/tcp_keep_alive.h

diff --git a/src/bin/lttng-relayd/Makefile.am b/src/bin/lttng-relayd/Makefile.am
index c7dd37e1..5125c721 100644
--- a/src/bin/lttng-relayd/Makefile.am
+++ b/src/bin/lttng-relayd/Makefile.am
@@ -21,7 +21,8 @@ lttng_relayd_SOURCES = main.c lttng-relayd.h utils.h utils.c 
cmd.h \
                        stream-fd.c stream-fd.h \
                        connection.c connection.h \
                        viewer-session.c viewer-session.h \
-                       tracefile-array.c tracefile-array.h
+                       tracefile-array.c tracefile-array.h \
+                       tcp_keep_alive.c tcp_keep_alive.h
 
 # link on liblttngctl for check if relayd is already alive.
 lttng_relayd_LDADD = -lurcu-common -lurcu \
diff --git a/src/bin/lttng-relayd/main.c b/src/bin/lttng-relayd/main.c
index 0eb8e282..b43743a5 100644
--- a/src/bin/lttng-relayd/main.c
+++ b/src/bin/lttng-relayd/main.c
@@ -70,6 +70,7 @@
 #include "stream.h"
 #include "connection.h"
 #include "tracefile-array.h"
+#include "tcp_keep_alive.h"
 
 static const char *help_msg =
 #ifdef LTTNG_EMBED_HELP
@@ -899,6 +900,14 @@ restart:
                                        lttcomm_destroy_sock(newsock);
                                        goto error;
                                }
+
+                               ret = tcp_keepalive_setsockopt(newsock->fd);
+                               if (ret < 0) {
+                                       PERROR("setsockopt tcp_keep_alive");
+                                       lttcomm_destroy_sock(newsock);
+                                       goto error;
+                               }
+
                                new_conn = connection_create(newsock, type);
                                if (!new_conn) {
                                        lttcomm_destroy_sock(newsock);
@@ -2755,6 +2764,11 @@ int main(int argc, char **argv)
                goto exit_options;
        }
 
+       if (tcp_keepalive_get_settings()) {
+               retval = -1;
+               goto exit_options;
+       }
+
        if (set_signal_handler()) {
                retval = -1;
                goto exit_options;
diff --git a/src/bin/lttng-relayd/tcp_keep_alive.c 
b/src/bin/lttng-relayd/tcp_keep_alive.c
new file mode 100644
index 00000000..d61802d6
--- /dev/null
+++ b/src/bin/lttng-relayd/tcp_keep_alive.c
@@ -0,0 +1,243 @@
+/*
+ * Copyright (C) 2017 - Jonathan Rajotte <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <sys/types.h>
+#include <netinet/tcp.h>
+#include <stdbool.h>
+#include <sys/socket.h>
+
+#include <common/compat/getenv.h>
+
+#include "tcp_keep_alive.h"
+
+#define LTTNG_RELAYD_TCP_KEEP_ALIVE_ENABLE_ENV 
"LTTNG_RELAYD_TCP_KEEP_ALIVE_ENABLE"
+#define LTTNG_RELAYD_TCP_KEEP_ALIVE_TIME_ENV "LTTNG_RELAYD_TCP_KEEP_ALIVE_TIME"
+#define LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBES_ENV 
"LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBES"
+#define LTTNG_RELAYD_TCP_KEEP_ALIVE_INTVL_ENV 
"LTTNG_RELAYD_TCP_KEEP_ALIVE_INTVL"
+
+#ifdef __sun
+#define COMPAT_TCP_KEEPIDLE TCP_KEEPALIVE_THRESHOLD
+#define SOL_TCP IPPROTO_TCP /* Solaris does not support SOL_TCP */
+#else
+#define COMPAT_TCP_KEEPIDLE TCP_KEEPIDLE
+#endif /* __sun */
+
+static bool tcp_keepalive_enabled = false;
+static int tcp_keepalive_time = -1;
+static int tcp_keepalive_intvl = -1;
+static int tcp_keepalive_probes = -1;
+
+static bool tcp_keepalive_time_valid(int value)
+{
+#ifndef __sun
+       return true;
+#else
+#ifdef TCP_KEEPALIVE_THRESHOLD
+       /*
+        * Minimum 10s , maximum 10 days. Defined by
+        * 
https://docs.oracle.com/cd/E23824_01/html/821-1475/tcp-7p.html#REFMAN7tcp-7p
+        */
+       if (value < 10 || value > 864000) {
+               return false;
+       }
+       return true;
+#else
+       WARN("Solaris 10 does not support local override of the 
TCP_KEEP_ALIVE_THRESHOLD. " LTTNG_RELAYD_TCP_KEEP_ALIVE_TIME_ENV);
+       return false;
+#endif /* TCP_KEEPALIVE_THRESHOLD */
+#endif /* __sun */
+}
+
+int tcp_keepalive_get_settings(void)
+{
+       int ret;
+       const char *value;
+
+       value = lttng_secure_getenv(LTTNG_RELAYD_TCP_KEEP_ALIVE_ENABLE_ENV);
+       if (value && !strcmp(value, "1")) {
+               tcp_keepalive_enabled = true;
+       } else {
+               tcp_keepalive_enabled = false;
+               ret = 0;
+               goto disabled;
+       }
+
+       /* Get value for tcp_keepalive_time in seconds*/
+       value = lttng_secure_getenv(LTTNG_RELAYD_TCP_KEEP_ALIVE_TIME_ENV);
+       if (value) {
+               int tmp;
+               errno = 0;
+               tmp = (int) strtol(value, NULL, 0);
+               if (errno != 0 || tmp < -1) {
+                       PERROR("TCP_KEEP_ALIVE time parse");
+                       ret = 1;
+                       goto error;
+               } else {
+                       if (!tcp_keepalive_time_valid(tmp)) {
+                               ERR("TCP_KEEP_ALIVE time invalid value");
+                               ret = 1;
+                               goto error;
+                       }
+#ifdef __sun
+                       /*
+                        * Under solaris this value is expressed in
+                        * milliseconds. Fits in a int.
+                        */
+                       tmp = tmp * 1000;
+#endif /* ifdef __sun */
+                       tcp_keepalive_time = tmp;
+               }
+       }
+
+
+       /* Get value for tcp_keepalive_intvl in seconds */
+       value = lttng_secure_getenv(LTTNG_RELAYD_TCP_KEEP_ALIVE_INTVL_ENV);
+       if (value) {
+#ifdef __sun
+               WARN("Solaris does not support local override of 
tcp_keepalive_intvl. " LTTNG_RELAYD_TCP_KEEP_ALIVE_INTVL_ENV);
+               ret = 1;
+               goto error;
+#else
+               int tmp;
+               errno = 0;
+               tmp = (int) strtol(value, NULL, 0);
+               if (errno != 0 || tmp < -1) {
+                       PERROR("TCP_KEEP_ALIVE interval parse");
+                       ret = 1;
+                       goto error;
+               } else {
+                       if (tmp > 0) {
+                               tcp_keepalive_intvl = tmp;
+                       }
+               }
+#endif /* __sun */
+       }
+
+       /* Get value for tcp_keepalive_probes */
+       value = lttng_secure_getenv(LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBES_ENV);
+       if (value) {
+#ifdef __sun
+               WARN("Solaris does not support local override of 
tcp_keepalive_probes. " LTTNG_RELAYD_TCP_KEEP_ALIVE_PROBES_ENV);
+               ret = 1;
+               goto error;
+#else
+               int tmp;
+               errno = 0;
+               tmp = (int) strtol(value, NULL, 0);
+               if (errno != 0 || tmp < -1) {
+                       PERROR("TCP_KEEP_ALIVE probes parse");
+                       ret = 1;
+                       goto error;
+               } else {
+                       if (tmp > 0) {
+                               tcp_keepalive_probes = tmp;
+                       }
+               }
+#endif /* __sun */
+       }
+
+       DBG("TCP_KEEP_ALIVE enabled");
+       if (tcp_keepalive_time > -1) {
+               DBG("Overwrite tcp_keepalive_time to %d", tcp_keepalive_time);
+       }
+       if (tcp_keepalive_intvl > -1) {
+               DBG("Overwrite tcp_keepalive_intvl to %d", tcp_keepalive_intvl);
+       }
+       if (tcp_keepalive_probes > -1) {
+               DBG("Overwrite tcp_keepalive_time to %d", tcp_keepalive_probes);
+       }
+       ret = 0;
+
+error:
+disabled:
+       return ret;
+}
+
+/*
+ * Set the socket options regarding tcp_keepalive.
+ */
+int tcp_keepalive_setsockopt(int fd)
+{
+       int ret;
+       int val = 1;
+       int optval;
+       socklen_t optlen = sizeof(optval);
+
+       if (!tcp_keepalive_enabled) {
+               ret = 0;
+               goto end;
+       }
+
+       ret = setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &val,
+                       sizeof(val));
+       if (ret < 0) {
+               PERROR("setsockopt so_keepalive");
+               goto end;
+       }
+
+       /* Check the status for the keepalive option */
+       if(getsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &optval, &optlen) < 0) {
+               perror("getsockopt()");
+       }
+       WARN("SO_KEEPALIVE is %s", (optval ? "ON" : "OFF"));
+
+#if !defined(__sun) || (defined(__sun) && defined(TCP_KEEPALIVE_THRESHOLD))
+       if (tcp_keepalive_time > -1) {
+               ret = setsockopt(fd, SOL_TCP, COMPAT_TCP_KEEPIDLE, 
&tcp_keepalive_time,
+                               sizeof(tcp_keepalive_time));
+               if (ret < 0) {
+                       PERROR("setsockopt TCP_KEEPIDLE");
+                       goto end;
+               }
+               /* Check the status for the keepalive option */
+               if(getsockopt(fd, SOL_TCP, COMPAT_TCP_KEEPIDLE, &optval, 
&optlen) < 0) {
+                       perror("getsockopt()");
+               }
+               WARN("SO_ keep_idle value is %d", optval);
+       }
+#endif /* ! defined(__sun) || (defined(__sun) && 
defined(TCP_KEEPALIVE_THRESHOLD)) */
+
+       /* Sun does not support either tcp_keepalive_probes or
+        * tcp_keepalive_intvl. Inferring a value for
+        * TCP_KEEPALIVE_ABORT_THRESHOLD doing
+        * tcp_keepalive_probes * tcp_keepalive_intvl could yield a good
+        * alternative but Solaris does not detail the algorithm used (constant
+        * time retry like linux or somthing fancier). So simply ignore those
+        * setting on solaris for now.
+        */
+#ifndef __sun
+       if (tcp_keepalive_intvl > -1) {
+               ret = setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, 
&tcp_keepalive_intvl,
+                               sizeof(tcp_keepalive_intvl));
+               if (ret < 0) {
+                       PERROR("setsockopt TCP_KEEPINTVL");
+                       goto end;
+               }
+       }
+
+       if (tcp_keepalive_probes > -1) {
+               ret = setsockopt(fd, SOL_TCP, TCP_KEEPCNT, 
&tcp_keepalive_probes,
+                               sizeof(tcp_keepalive_probes));
+               if (ret < 0) {
+                       PERROR("setsockopt TCP_KEEPCNT");
+                       goto end;
+               }
+       }
+#endif /* __sun */
+end:
+       return ret;
+}
diff --git a/src/bin/lttng-relayd/tcp_keep_alive.h 
b/src/bin/lttng-relayd/tcp_keep_alive.h
new file mode 100644
index 00000000..cd8025a0
--- /dev/null
+++ b/src/bin/lttng-relayd/tcp_keep_alive.h
@@ -0,0 +1,25 @@
+#ifndef RELAYD_TCP_KEEP_ALIVE_H
+#define RELAYD_TCP_KEEP_ALIVE_H
+
+/*
+ * Copyright (C) 2017 - Jonathan Rajotte <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+
+int tcp_keepalive_get_settings(void);
+int tcp_keepalive_setsockopt(int socket_fd);
+
+#endif /* RELAYD_UTILS_H */
-- 
2.11.0

_______________________________________________
lttng-dev mailing list
[email protected]
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev

Reply via email to